## LIBRARIES

In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

## IMPORT DATA

In [6]:
df_train = pd.read_csv("../data/df_train.csv", index_col = 0)
df_test = pd.read_csv("../data/df_test.csv", index_col = 0)

## MODELLING

In [8]:
X = df_train.drop("price", axis = 1)
y = df_train.price

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=66)

### LinearRegression

In [12]:
# create pipe
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("linear", LinearRegression())
])

# Standarization & model Training
pipe.fit(X_train, y_train)

# Predicting
y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

# Metrics
mse_train = mean_squared_error(y_pred_train,y_train)
mse_test = mean_squared_error(y_pred_test,y_test)

mse_train, mse_test

(0.024920785977398573, 0.01871493777941878)

### RandomForestRegression

In [13]:
# random forest regressor model

# create forest
rf = RandomForestRegressor(n_estimators=100, max_depth=None, max_features='auto')

# model training
rf.fit(X_train, y_train)

# predictions
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Metrics
mse_train = mean_squared_error(y_pred_train,y_train)
mse_test = mean_squared_error(y_pred_test,y_test)

mse_train, mse_test

(0.001492131681264617, 0.010313058394827532)

We can observe that the RandomForest model is better and we apply grid search

In [15]:
# define model and params

rf_model = RandomForestRegressor() 

params = {
    "n_estimators":[50, 100], 
    "max_depth":[3, 4, 5], 
    "min_samples_split":[2, 4, 8], 
    "min_samples_leaf":[2, 3, 4]
}

In [16]:
mse = make_scorer(mean_squared_error)

In [17]:
clf = GridSearchCV(estimator = rf_model, param_grid = params, scoring = mse, verbose=2)

In [18]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.3s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.3s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.7s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=2, 

[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=50; total time=   0.9s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=50; total time=   1.2s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=50; total time=   1.0s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   1.9s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=200; total time=   3.6s
[CV] END max_depth=4, min_samples_leaf=3, min_samples_split=4, n_estimators=200; total time=   3.8s
[CV

[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   3.6s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   3.6s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   3.5s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   3.6s
[CV] END max_depth=4, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   3.6s


[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   4.4s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   4.9s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   4.4s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   4.3s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   4.3s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=600; total time=  13.0s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=600; total time=  14.0s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=600; total time=  14.2s


[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   4.6s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   5.1s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   4.8s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   5.3s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=600; total time=  14.7s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=600; total time=  14.6s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=600; total time=  13.5s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=600; total time=  13.5s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=600; total time=  13.7s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=8, n_estimators=50; total time=   1.2s
[

[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   5.3s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=8, n_estimators=600; total time=  16.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=8, n_estimators=600; total time=  15.7s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=8, n_estimators=600; total time=  15.8s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=8, n_estimators=600; total time=  15.6s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=8, n_estimators=600; total time=  16.2s
[CV] END max_depth=6, min_samples_leaf=3, min_samples_split=2, n_estimators=50; total time=   1.8s
[CV] END max_depth=6, min_samples_leaf=3, min_samples_split=2, n_estimators=50; total time=   1.6s
[CV] END max_depth=6, min_samples_leaf=3, min_samples_split=2, n_estimators=50; total time=   1.3s
[CV] END max_depth=6, min_samples_leaf=3, min_samples_split=2, n_estimators=50; total time=   1.3s
[CV]

[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=  15.5s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=  16.2s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=  15.5s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=50; total time=   1.3s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=50; total time=   1.3s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=50; total time=   1.3s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=50; total time=   1.3s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=50; total time=   1.3s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=100; total time=   2.5s
[CV] END max_depth=6, min_samples_leaf=4, min_samples_split=4, n_estimators=100; total time=   2.5s
[CV] 

[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=600; total time=  43.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=600; total time=  42.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=50; total time=   3.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=50; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=50; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=50; total time=   3.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=50; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=100; total time=   6.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=100; total time=   6.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators

[CV] END max_depth=None, min_samples_leaf=3, min_samples_split=8, n_estimators=600; total time=  40.3s
[CV] END max_depth=None, min_samples_leaf=3, min_samples_split=8, n_estimators=600; total time=  37.3s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 5, 6, None],
                         'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [50, 100, 200, 600]},
             scoring=make_scorer(mean_squared_error), verbose=2)

In [19]:
# What are the best parameters? 
clf.best_params_

{'max_depth': 4,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 50}

In [31]:
clf_ = RandomForestRegressor(max_depth= 4, min_samples_leaf = 3, min_samples_split = 4, n_estimators = 50)

In [32]:
best = clf_.fit(X,y)
best

RandomForestRegressor(max_depth=4, min_samples_leaf=3, min_samples_split=4,
                      n_estimators=50)

In [34]:
y_pred_final = best.predict(df_test)
y_pred_final

array([9.25064984, 7.4986828 , 8.73464219, ..., 9.25064984, 7.04829575,
       7.37915472])

## Let´s apply de final model 

In [36]:
submission_01 = pd.DataFrame(y_pred_final, columns= ['price'])
submission_01.index.name = "id"

In [41]:
submission_01.to_csv('../data/submission_01.csv', header=True)