# 0 - Modules

In [None]:
# ##################################
#
#  much of the code is stored in 
#  my package for better readability
#
# ##################################
#from mypackage import ploter_bis as plt_bis
from mypackage import ploter
from mypackage import data_processor as dp
from mypackage import mydataloader as dl

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
# Notebook-wide time-series cross-validation splitter (5 splits); model_selection
# passes it to cross_val_score as `cv`.
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from pickle import dump
import platform

## 1 - Data Preparation

## 1.1 - Load Data

In [None]:
# Make sure the local `data` directory exists. Path.mkdir(exist_ok=True) is
# portable and idempotent, whereas the `! mkdir data` shell escape reports an
# error every time this cell is re-run.
from pathlib import Path

Path('data').mkdir(exist_ok=True)

# ########## Choose the period

#years = [2020,2019,2018,2017,2016,2015,2014]
#years = [2020]
years = [2020, 2019, 2018, 2017]


# ########## Choose the area by departement code
# Exactly one assignment stays active; the wider 8-departement list used to be
# assigned and then immediately overwritten, so it is now commented out like
# the other alternatives.

#departements = ['75']
#departements = ['75','92','93','94','77','78','91','95']
departements = ['75', '92', '93', '94', '95']


data = dl.get_market_data(years=years, departements=departements, top_cities=None)

## 1.2 - Features engineering with special encoding

In [None]:
# dp.feature_engineering returns four objects, unpacked here as the train/test
# features followed by the train/test targets.
X_train,X_test,y_train,y_test = dp.feature_engineering(data)

In [None]:
# Preview the first rows of the training features.
X_train.head()

# 2 - Experimentation

## 2.1 - Helper function for Model Selection
> Based on Time Series Cross Validation

In [None]:
def model_selection(models,X,y,verbose=True,cv=None):
    """Return the candidate with the highest mean cross-validated score.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator accepted by ``cross_val_score``.
    X, y
        Features and target, forwarded unchanged to ``cross_val_score``.
    verbose : bool, default True
        When true, print the mean score of every candidate. The final
        "Best Model" banner is printed regardless.
    cv : optional
        Cross-validation strategy forwarded to ``cross_val_score``. ``None``
        (the default) uses the notebook-level ``tscv`` splitter.

    Returns
    -------
    The winning estimator: the very object stored in ``models``.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if not models:
        raise ValueError("model_selection needs at least one candidate model")
    if cv is None:
        cv = tscv

    best_model = None
    best_score = None
    for name, model in models.items():
        # Average over however many folds the splitter produced, rather than
        # dividing by a hard-coded 5.
        score = cross_val_score(model, X, y, cv=cv).mean()
        if verbose:
            print(f"{name} | score : {score}")
        # Compare scores only. Sorting (score, model) tuples falls back to
        # comparing the estimators on a tie, which raises TypeError.
        # A NaN best (cross_val_score reports NaN for failed fits by default)
        # is replaced by any later candidate.
        if best_score is None or np.isnan(best_score) or score > best_score:
            best_model, best_score = model, score

    print('\n######## Best Model ########\n\t%s'%str(best_model))
    return best_model

## 2.2 - Model preparation

In [None]:
# Candidate regressors used by the experiments below.
ols = linear_model.LinearRegression()
ridge = linear_model.Ridge(alpha=0.1)
lasso = linear_model.Lasso(alpha=0.1)
bayesian_ridge = linear_model.BayesianRidge()
svr = svm.SVR()  # scales very badly; not part of the `models` dicts below
rf = RandomForestRegressor(max_depth=10, min_samples_leaf=10, random_state=0)
# Seeded like `rf` so that repeated runs of the comparison give the same scores.
gbreg = GradientBoostingRegressor(random_state=0)

## 2.3 - Cross validation with Initial Numerical Features

In [None]:
# Baseline experiment: linear models only, on the three numerical columns.
features_basic = ['surface', 'pieces', 'terrain']

models = {
    'OLS regression': ols,
    'Ridge regression': ridge,
    'Lasso regression': lasso,
    'Bayesian Ridge regression': bayesian_ridge,
}

# Pick the best cross-validated candidate, then fit it on the training set.
model = model_selection(models, X_train[features_basic], y_train)
model.fit(X_train[features_basic], y_train)

In [None]:
# Performance report for the selected model: training set first, then test set.
# NOTE(review): the second call passes True positionally; presumably it is the
# same `show` flag as in the first call. Confirm against ploter.
ploter.check_model_performances(X_train[features_basic],y_train,model,show=True)
ploter.check_model_performances(X_test[features_basic],y_test,model,True)

## 2.4 - Cross validation with Encoded Categorical Features

In [None]:
# Candidate pool for section 2.4: the four linear models plus two tree ensembles.
models = {
    'OLS regression': ols,
    'Ridge regression': ridge,
    'Lasso regression': lasso,
    'Bayesian Ridge regression': bayesian_ridge,
    'RandomForestRegressor': rf,
    'Gradient Boosting Regressor': gbreg,
}

### 2.4.1 - Cross validation with 'encodage_voie' as a single feature

In [None]:
# Single-feature experiment: 'encodage_voie' is the only input column.
features_augmented = ['encodage_voie']

# Pick the best cross-validated candidate, then fit it on the training set.
model = model_selection(models, X_train[features_augmented], y_train)
model.fit(X_train[features_augmented], y_train)

In [None]:
# Performance report for the selected model: training set first, then test set.
# NOTE(review): the second call passes True positionally; presumably it is the
# same `show` flag as in the first call. Confirm against ploter.
ploter.check_model_performances(X_train[features_augmented],y_train,model,show=True)
ploter.check_model_performances(X_test[features_augmented],y_test,model,True)

### 2.4.2 - Cross validation with all encoded features

In [None]:
# Full experiment: numerical columns together with every encoded column.
features_augmented = [
    'surface',
    'pieces',
    'encodage_voie',
    'encodage_piece',
    'encodage_type_voie',
    'encodage_ville',
    'encodage_departement',
    'terrain',
    'id_local',
]

# Pick the best cross-validated candidate, then fit it on the training set.
model = model_selection(models, X_train[features_augmented], y_train)
model.fit(X_train[features_augmented], y_train)

In [None]:
# Performance report for the selected model: training set first, then test set.
# NOTE(review): the second call passes True positionally; presumably it is the
# same `show` flag as in the first call. Confirm against ploter.
ploter.check_model_performances(X_train[features_augmented],y_train,model,show=True)
ploter.check_model_performances(X_test[features_augmented],y_test,model,True)

### 2.4.3 - Cross validation with 5 encoded features
#### These features will be used for ease of future model deployment.
* surface
* pieces
* encodage_voie
* terrain
* id_local

In [None]:
# Reduced feature set (five columns) used for the rest of the notebook.
features_augmented = [
    'surface',
    'pieces',
    'encodage_voie',
    'terrain',
    'id_local',
]

# Pick the best cross-validated candidate, then fit it on the training set.
model = model_selection(models, X_train[features_augmented], y_train)
model.fit(X_train[features_augmented], y_train)

# 3 - Feature importance

In [None]:
# Use `feature_importances_` when the selected model exposes it and fall back
# to `coef_` otherwise. Only the missing-attribute case is caught, so any other
# failure (e.g. a length mismatch with the feature list) surfaces instead of
# being silently retried with `coef_`.
try:
    feat_importances = pd.Series(model.feature_importances_, index=features_augmented)
except AttributeError:
    # np.ravel flattens a (1, n_features) coefficient array, which pd.Series
    # would otherwise reject.
    feat_importances = pd.Series(np.ravel(model.coef_), index=features_augmented)
feat_importances.plot(kind='barh')

In [None]:
# Take an explicit copy before adding the target column: assigning into a
# column selection of X_train is chained assignment and triggers pandas'
# SettingWithCopyWarning.
D = X_train[features_augmented].copy()
D['y'] = y_train

# Plot the feature columns only; the trailing 'y' column is excluded from `vars`.
sns.pairplot(D,vars=D.columns[:-1])
plt.show()

# 4 - Save the model

In [None]:
# #### not working with docker and my configuration

#dump(model, open('model.pkl', 'wb'))

# 4 - Investigate errors on Train Set 

In [None]:
# Identifying columns kept next to each training prediction.
train_info = X_train[['Commune','surface','pieces']]
predictions_train = model.predict(X_train[features_augmented])

# Glue the three pieces side by side by position. reset_index(drop=True)
# discards the old index directly; materialising it as a column and dropping
# 'index' by name breaks as soon as the index carries a different name.
final_predictions_train = pd.concat(
    [
        train_info.reset_index(drop=True),
        y_train.reset_index(drop=True),
        pd.Series(predictions_train.reshape(-1)),
    ],
    axis=1,
)

final_predictions_train.columns = ['commune','surface','pieces','target','pred']

# Exponentiate target and prediction before measuring the error
# (presumably the target was log-transformed in dp.feature_engineering; confirm).
final_predictions_train[['target','pred']] = np.exp(final_predictions_train[['target','pred']])

final_predictions_train['error'] = (final_predictions_train.target - final_predictions_train.pred).abs()

# NOTE(review): after a descending sort, tail(10000) shows the rows with the
# smallest errors; use head() if the largest errors are the ones of interest.
final_predictions_train.sort_values(by=['error'],ascending=False).tail(10000)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Copy first: `D = final_predictions_train` only creates an alias, so the extra
# 'y' column would be written into final_predictions_train itself.
D = final_predictions_train.copy()
# final_predictions_train was built from index-reset pieces, while y_train
# still carries its own index, so attach the target by position rather than by
# index label.
D['y'] = np.asarray(y_train).reshape(-1)

#sns.pairplot(D,vars=D.columns[:-1], hue="y")
sns.pairplot(D,vars=D.columns)
plt.show()

## 4.1 - Mean error by City

In [None]:
# Per-city summary of the prediction error, ordered from the lowest to the
# highest mean error. Named aggregation yields the flat column names directly.
errors_city = (
    final_predictions_train
    .groupby('commune')
    .agg(
        error_min=('error', 'min'),
        error_mean=('error', 'mean'),
        error_max=('error', 'max'),
        std_error=('error', 'std'),
    )
    .reset_index()
    .sort_values(by=['error_mean'])
)

# Reciprocals, so that small errors get large values. Vectorized division
# yields inf for a zero mean/std, whereas the element-wise
# `apply(lambda v: 1/v)` raised ZeroDivisionError on it.
errors_city['inv_error_mean'] = 1 / errors_city['error_mean']
errors_city['inv_std_error'] = 1 / errors_city['std_error']

### 4.1.1 - City with high mean errors

In [None]:
# Jupyter only renders a cell's last expression, so the table has to be printed
# explicitly; as a bare expression ahead of show_cloud it was discarded.
# errors_city is sorted by ascending error_mean, so tail(10) holds the ten
# cities with the highest mean error.
print(errors_city[['commune','error_mean']].tail(10))
ploter.show_cloud(errors_city)

### 4.1.2 - City with low mean errors

In [None]:
# Jupyter only renders a cell's last expression, so the table has to be printed
# explicitly; as a bare expression ahead of show_cloud it was discarded.
# errors_city is sorted by ascending error_mean, so head(10) holds the ten
# cities with the lowest mean error.
print(errors_city[['commune','error_mean']].head(10))
ploter.show_cloud(errors_city,'inv_error_mean')

## 4.2 -  Standard deviation of error by City

### 4.2.1 - City with high std errors

In [None]:
# Rows with any missing value are dropped before plotting on 'std_error'
# (presumably the std is NaN for cities with a single row; confirm).
ploter.show_cloud(errors_city.dropna(),'std_error')

### 4.2.2 - City with low std errors

In [None]:
# Rows with any missing value are dropped before plotting on 'inv_std_error',
# the reciprocal of the per-city error std, so low-std cities get large values.
ploter.show_cloud(errors_city.dropna(),'inv_std_error')

# 5 - Conclusion

* The DVF dataset is useful for predicting future prices, as long as proper data preprocessing is applied.
* The encoding technique I applied definitely improves the performance on the Test set, even with a single encoded feature. Nonetheless, the model is overfitting.
* Grid search should be applied to help improve model performance. But I didn't cover this part because the best model could change, depending on the area selected. 
* Adding features specific to each home (the condition and practicality of the property) should help improve the model predictive power.
* The farther we move away from Paris, the smaller the model's errors become.
* It seems that the dynamics of the real estate market in Paris and its nearby suburbs differ from those of the more distant suburbs. This needs to be investigated.


# 6 - Next step : Model deployment with Flask, Docker...