#### Importing utitlities

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.ensemble import RandomForestRegressor

#### Loading data

In [46]:
data = pd.read_csv("./csv/Johannesburg.csv")
## Data exploration
data.columns = data.columns.str.lower()
data.columns = data.columns.str.replace(' ', '_')

### Data cleaning
data['erf_size'] = data['erf_size'].str.replace('m²', '')
data['erf_size'] = data['erf_size'].str.replace(' ', '')
data['erf_size'].loc[data['erf_size'].str.contains('ha', na=False)]
# convert ha into m²
data['erf_size'] = data['erf_size'].str.replace('ha', '0000')
data['erf_size'] = pd.to_numeric(data['erf_size'], errors='coerce')

data['floor_size'] = data['floor_size'].str.replace('m²', '')
data['floor_size'] = data['floor_size'].str.replace(' ', '')
data['floor_size'] = pd.to_numeric(data['floor_size'], errors='coerce')

data['price'] = data['price'].str.replace('R', '')
data['price'] = data['price'].str.replace(' ', '')
data['price'] = pd.to_numeric(data['price'], errors='coerce')

data['levies'] = data['levies'].str.replace(' ', '')
data['levies'] = data['levies'].str.replace('R', '')
data['levies'] = pd.to_numeric(data['levies'], errors='coerce')

data['rates_and_taxes'] = data['rates_and_taxes'].str.replace('R', '')
data['rates_and_taxes'] = data['rates_and_taxes'].str.replace(' ', '')
data['rates_and_taxes'] = pd.to_numeric(data['rates_and_taxes'], errors='coerce')

### Visualisation des correlations
#### Correlation entre features et le label
data.corr()['price'].sort_values(ascending=False)
#### Correlation entre les features
data.corr()['bathrooms'].sort_values(ascending=False)
data.corr()['levies'].sort_values(ascending=False)
data.corr()['rates_and_taxes'].sort_values(ascending=False)
### Visualisation des outliers
## Préprocessing
#### Supprimer les types de proprieté Farm, Industrie, commercial, vancantLand
data = data.drop(data.loc[data['type_of_property'] == 'CommercialProperty'].index)
data = data.drop(data.loc[data['type_of_property'] == 'Farm'].index)
data = data.drop(data.loc[data['type_of_property'] == 'VacantLand/Plot'].index)
data = data.drop(data.loc[data['type_of_property'] == 'IndustrialProperty'].index)

data = data.drop('levies', axis=1)

upper_price_limit = data['price'].quantile(0.99999999)
lower_price_limit = data['price'].quantile(0.01)

# removing outlier
data = data[(data['price'] <= upper_price_limit) & (data['price'] >= lower_price_limit)]
Data = copy.copy(data)
##### Effacer les NaN dans type_of_property
Data = Data.drop(Data.loc[Data['type_of_property'].isna()].index)
Data['type_of_property'].isna().sum()
#### Remplissage des NaN dans la colonne rates_and_taxes par la valeur mediane des rates_and_taxes des houses/Appartement/Townhouse
median_taxes_house = Data.loc[Data['type_of_property'] == 'House']['rates_and_taxes'].median()
median_taxes_appartment = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['rates_and_taxes'].median()
median_taxes_townHouse = Data.loc[Data['type_of_property'] == 'Townhouse']['rates_and_taxes'].median()

Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['rates_and_taxes'].isna(), 'rates_and_taxes'] = median_taxes_appartment 
Data.loc[Data['type_of_property'].eq('Townhouse') & Data['rates_and_taxes'].isna(), 'rates_and_taxes'] = median_taxes_townHouse 
Data.loc[Data['type_of_property'].eq('House') & Data['rates_and_taxes'].isna(), 'rates_and_taxes'] = median_taxes_house 
#### Traitement des parking et pets alloweds
Data.loc[Data['parking_space'].isna(), 'parking_space'] = 0
Data.loc[Data['pets_allowed'].isna(), 'pets_allowed'] = 0
#### Tranformation des bedrooms NaN en mediane
Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['bedrooms'].isna(), 'bedrooms'] = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['bedrooms'].median()
Data.loc[Data['type_of_property'].eq('House') & Data['bedrooms'].isna(), 'bedrooms'] = Data.loc[Data['type_of_property'] == 'House']['bedrooms'].median()
Data.loc[Data['type_of_property'].eq('Townhouse') & Data['bedrooms'].isna(), 'bedrooms'] = Data.loc[Data['type_of_property'] == 'Townhouse']['bedrooms'].median()
#### Transformation des bathrooms Nan en mediane
median_bathrooms_house = Data.loc[Data['type_of_property'] == 'House']['bathrooms'].median()
median_bathrooms_appartment = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['bathrooms'].median()
median_bathrooms_townhouse = Data.loc[Data['type_of_property'] == 'Townhouse']['bathrooms'].median()


Data.loc[Data['type_of_property'].eq('House') & Data['bathrooms'].isna(), 'bathrooms'] = median_bathrooms_house
Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['bathrooms'].isna(), 'bathrooms'] = median_bathrooms_appartment
Data.loc[Data['type_of_property'].eq('Townhouse') & Data['bathrooms'].isna(), 'bathrooms'] = median_bathrooms_townhouse

Data = Data.drop(Data.loc[Data['bathrooms'].isna()].index)


Data = Data.drop(Data.loc[Data['bedrooms'].isna()].index)

#### Traitement des floor_size et erf_size
Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['erf_size'].isna(), 'erf_size'] = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['floor_size']
Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['erf_size'].isna() & Data['floor_size'].isna() , 'erf_size'] = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['erf_size'].median()
Data.loc[Data['type_of_property'].eq('House') & Data['erf_size'].isna(), 'erf_size'] = Data.loc[Data['type_of_property'] == 'House']['erf_size'].median()
Data.loc[Data['type_of_property'].eq('Townhouse') & Data['erf_size'].isna(), 'erf_size'] = Data.loc[Data['type_of_property'] == 'Townhouse']['erf_size'].median()
Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['floor_size'].isna(), 'floor_size'] = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['erf_size']
Data.loc[Data['type_of_property'].eq('Apartment/Flat') & Data['floor_size'].isna() & Data['erf_size'].isna() , 'floor_size'] = Data.loc[Data['type_of_property'] == 'Apartment/Flat']['floor_size'].median()
Data.loc[Data['type_of_property'].eq('House') & Data['floor_size'].isna(), 'floor_size'] = Data.loc[Data['type_of_property'] == 'House']['floor_size'].median()
Data.loc[Data['type_of_property'].eq('Townhouse') & Data['floor_size'].isna(), 'floor_size'] = Data.loc[Data['type_of_property'] == 'Townhouse']['floor_size'].median()
Data = Data.dropna(axis = 0)

Data.loc[Data["type_of_property"].eq("Apartment/Flat")] = 0
Data.loc[Data["type_of_property"].eq("House")] = 1
Data.loc[Data["type_of_property"].eq("Townhouse")] = 2

data = Data



#### Splitting datas

In [47]:
X = data.drop(["price"], axis=1)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 ,shuffle=True)

#### Creating and fitting model

In [48]:
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

1.0

#### Creating model

In [None]:
with open("./model/johannesburg_model.pkl","wb") as file:
    pickle.dump(model, file) 