In [31]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np

In [32]:
# Loads the AMES housing Dataset
housing = fetch_openml(name="house_prices", as_frame=True)  # noqa

In [33]:
data = pd.DataFrame(data= np.c_[housing['data'], housing['target']],
                     columns= housing['feature_names'] + ['target'])

In [34]:
liste_to_numeric = ['YrSold', 'YearBuilt', 'GrLivArea', 'LotFrontage', 'LotArea','GarageArea']
data[liste_to_numeric] = data[liste_to_numeric].apply(pd.to_numeric)

Data cleaning

In [35]:
# create dumies
data['Fence'] = data['Fence'].notna()
data['Pool'] = data['PoolArea'] > 0

# create the new variable 'age'
data['Age'] = data.apply(lambda x: x['YrSold']-x['YearBuilt'] if (x['YearBuilt']<x['YearRemodAdd']) 
                                                           else (x['YrSold']-x['YearRemodAdd']), axis=1)

# Delete all data with MSZoning = commercial, agriculture and industrial as these are not residential units
data = data[(data.MSZoning != 'C (all)') & (data.MSZoning != 'I (all)') & (data.MSZoning != 'A (agr)')]

# Delete abnormal sales
data = data[(data.SaleCondition != 'Abnorml')] 
data = data[(data.SaleCondition != 'Family')] 


In [36]:
# Select Features
X = data[['Age','GrLivArea', 'LotFrontage', 'LotArea', 'GarageArea', 'Fence', 'Pool']]
Y = data[['target']]

In [37]:
X.to_csv('clean_X.csv', index=False)

In [38]:
X.head()

Unnamed: 0,Age,GrLivArea,LotFrontage,LotArea,GarageArea,Fence,Pool
0,5.0,1710.0,65.0,8450.0,548.0,False,False
1,31.0,1262.0,80.0,9600.0,460.0,False,False
2,7.0,1786.0,68.0,11250.0,608.0,False,False
4,8.0,2198.0,84.0,14260.0,836.0,False,False
5,16.0,1362.0,85.0,14115.0,480.0,True,False


In [39]:
Y.head()

Unnamed: 0,target
0,208500.0
1,181500.0
2,223500.0
4,250000.0
5,143000.0


In [40]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=.3, random_state=1121218)

In [41]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [42]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [43]:
from sklearn.compose import ColumnTransformer

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [44]:
randomf = RandomForestRegressor()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])

In [55]:
from sklearn.model_selection import GridSearchCV

param_dict = {'model__n_estimators': [50,100,150,200], 'model__bootstrap': [True,False]}

search = GridSearchCV(randomf_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

_ = search.fit(X_train, y_train)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best score: 24080.44994291379
Best params: {'model__bootstrap': True, 'model__n_estimators': 50}


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [46]:
type(search.best_estimator_.steps[1])

tuple

In [47]:
print(search.best_estimator_.steps[1][1].feature_importances_)

[1.94278759e-01 4.39721051e-01 4.48009353e-02 6.18579375e-02
 2.56149155e-01 1.47395375e-03 1.31170925e-03 3.43420002e-04
 6.30785796e-05]


In [48]:
type(print(5))

5


NoneType

In [49]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(search, open(filename, 'wb'))

In [50]:
mean_absolute_error(y_train, search.predict(X_train))

8928.300736997908

In [51]:
mean_absolute_error(y_valid, search.predict(X_valid))

22288.045723192023

In [52]:
train_data = pd.concat([X_train, y_train], axis=1)

In [53]:
train_data.head()

Unnamed: 0,Age,GrLivArea,LotFrontage,LotArea,GarageArea,Fence,Pool,target
1229,48.0,1507.0,70.0,7910.0,404.0,True,False,127000.0
600,1.0,1904.0,74.0,10927.0,736.0,False,False,275000.0
473,1.0,1976.0,110.0,14977.0,908.0,False,False,440000.0
639,0.0,1567.0,53.0,3982.0,648.0,False,False,264561.0
1347,1.0,1776.0,93.0,15306.0,712.0,False,False,283463.0


In [54]:
import pycaret

ModuleNotFoundError: No module named 'pycaret'

In [None]:
from pycaret.utils import version
version()