In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [12]:
train = pd.read_csv("output/train_clean2.csv", index_col=0)
predict = pd.read_csv("output/predict_clean.csv", index_col=0)

In [13]:
X = train.drop(columns='price')
y = train.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(32364, 5)
(32364,)
(8091, 5)
(8091,)


In [14]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor()
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("MSE", round(mean_squared_error(y_test,y_pred)),2)
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")

-------ExtraTreesRegressor-------
MSE 1.0 2
ExtraTrees Accuracy -0.11792282496474549


In [None]:
params = {"n_estimators":[150,250,300],
        "max_depth":[10,25,50],
        "min_samples_split":[5,10,20]
       }

efr = ExtraTreesRegressor()
grid = GridSearchCV(efr,scoring="neg_root_mean_squared_error",verbose=1,n_jobs=-1,param_grid=params,cv=5)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.5s


In [6]:
grid.best_params_

{'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 300}

In [7]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=300, min_samples_split=10, max_depth= 25) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("MSE", round(mean_squared_error(y_test,y_pred)),2)
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")


-------ExtraTreesRegressor-------
MSE 0.0 2
ExtraTrees Accuracy 0.9916450351444748


In [8]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=350, min_samples_split=10, max_depth= 25) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("MSE", round(mean_squared_error(y_test,y_pred)),2)
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")


-------ExtraTreesRegressor-------
MSE 0.0 2
ExtraTrees Accuracy 0.9916726980290047


In [9]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=1000, min_samples_split=10, max_depth= 50) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("MSE", round(mean_squared_error(y_test,y_pred)),2)
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")


-------ExtraTreesRegressor-------
MSE 0.0 2
ExtraTrees Accuracy 0.991668601797955


In [10]:
print(f"-------ExtraTreesRegressor-------")
model = ExtraTreesRegressor(n_estimators=1800, min_samples_split=10, max_depth= 50) 
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("MSE", round(mean_squared_error(y_test,y_pred)),2)
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"ExtraTrees Accuracy {np.mean(scores)}")


-------ExtraTreesRegressor-------
MSE 0.0 2
ExtraTrees Accuracy 0.9916705674483908
