In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts 
from sklearn import metrics
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')

In [3]:
fory = pd.read_csv('data/train.csv')
data = pd.read_csv('data/train_clean.csv')
data_test = pd.read_csv('data/test_clean.csv')
data_test.head()

Unnamed: 0.1,Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0,1.0,2,4,4,56.3,64.0,6.58,6.54,3.69
1,1,1,0.83,4,1,5,62.3,58.0,6.01,5.97,3.73
2,2,2,1.0,2,5,4,67.0,53.0,6.19,6.13,4.13
3,3,3,1.0,2,2,4,66.5,62.0,6.19,6.1,4.09
4,4,4,1.2,3,6,5,62.6,57.0,6.74,6.77,4.23


In [4]:
ide = data_test.id

In [5]:
data_test.drop(columns=['Unnamed: 0'],inplace=True)
data_test.drop(columns=['id'],inplace=True)
data_test.drop(columns=['x','y','z'],inplace=True)

In [6]:
X = pd.DataFrame(data[['carat','cut','color','clarity','depth','table']])
y = fory.price

In [7]:
X_train, X_test,y_train, y_test = tts(X,y, test_size = 0.2)

In [8]:
models={
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
}

In [9]:
for name , model in models.items():
     print("ENTRENANDO: ", name)
     model.fit(X_train, y_train)

ENTRENANDO:  ridge
ENTRENANDO:  lasso
ENTRENANDO:  sgd
ENTRENANDO:  knn
ENTRENANDO:  grad


In [10]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('MSE - ', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------ridge------
MSE -  2220264.351298352
RMSE -  1490.0551504217392
R2 -  0.8640026987065691
------lasso------
MSE -  2220493.037045098
RMSE -  1490.1318857890055
R2 -  0.863988691075278
------sgd------
MSE -  8938342434017331.0
RMSE -  94542807.4155688
R2 -  -547498069.8274978
------knn------
MSE -  3858716.503201088
RMSE -  1964.3616019463138
R2 -  0.7636429956708092
------grad------
MSE -  629904.3391766073
RMSE -  793.6651303771681
R2 -  0.9614166258396457


In [11]:
results = []
for depth in range(1,21):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train,y_train)
    
    result = {
        "model": model,
        "depth": depth,
        "train_error": metrics.mean_squared_error(y_train, model.predict(X_train)),
        "test_error": metrics.mean_squared_error(y_test, model.predict(X_test))
    }
    results.append(result)

In [12]:
tree = pd.DataFrame(results)

In [13]:
#tree

In [14]:
forest = RandomForestRegressor()

In [15]:
forest.fit(X_train,y_train)

RandomForestRegressor()

In [16]:
y_pred = forest.predict(X_test)

In [17]:
metrics.mean_squared_error(y_train,forest.predict(X_train)).round(3), metrics.mean_squared_error(y_test,y_pred)

(49749.072, 329892.98858166375)

In [18]:
parameters = {'bootstrap': [True, False],
 'max_depth': [10, 20],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4],
 'min_samples_split': [2, 5],
 'n_estimators': [200, 400]}

In [19]:
params = {'max_depth': [10, 100],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 3],
 'n_estimators': [100, 150, 200]}

In [20]:
rfr = RandomForestRegressor()

In [21]:
grid = GridSearchCV(rfr, parameters, verbose=1)

In [45]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [True, False], 'max_depth': [10, 20],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [2, 4],
                         'min_samples_split': [2, 5],
                         'n_estimators': [200, 400]},
             verbose=1)

In [23]:
grid1 = GridSearchCV(rfr, params, verbose=1)

In [32]:
grid1.fit(X_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [10, 100],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 3],
                         'n_estimators': [100, 150, 200]},
             verbose=1)

In [48]:
train_results1 = []
test_results1 = []
train_pred = grid1.predict(X_train)
RMSE  = np.sqrt(metrics.mean_squared_error(y_train, train_pred))
train_results1.append(RMSE)
y_pred_final = grid1.predict(X_test)
RMSE  = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
test_results1.append(RMSE)

In [49]:
test_results1, train_results1

([574.3631156173451], [321.8777306047489])

In [34]:
train_results, test_results

([321.8777306047489], [574.3631156173451])

In [35]:
test = pd.DataFrame(data_test)
test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.0,2,4,4,56.3,64.0
1,0.83,4,1,5,62.3,58.0
2,1.0,2,5,4,67.0,53.0
3,1.0,2,2,4,66.5,62.0
4,1.2,3,6,5,62.6,57.0


In [36]:
precio = grid1.predict(test)

In [47]:
train_pred1 = grid.predict(X_train)
RMSE  = np.sqrt(metrics.mean_squared_error(y_train, train_pred1))
y_pred_final1 = grid.predict(X_test)
RMSE  = np.sqrt(metrics.mean_squared_error(y_test, y_pred_final1))

In [46]:
precio2 = grid.predict(test)

In [37]:
precio

array([3523.1451746 , 2855.44786905, 3101.50653571, ..., 7570.19807143,
        655.21990476, 1066.56383333])

In [40]:
submission = pd.DataFrame({'price':precio}, index = ide)

In [44]:
submission.to_csv('data/submission1.csv')

In [50]:
submission2 = pd.DataFrame({'price':precio2}, index = ide)

In [51]:
submission2.to_csv('data/submission2.csv')