In [1]:
#import the libraries:
import pandas as pd 
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import BayesianRidge 
from sklearn.linear_model import ElasticNet

# Models:

In [2]:
# import the clean datasets
#%store -r dmd_cl1
#%store -r dmd_cl2
%store -r dmd_cl3
#%store -r dmd_cl4

In [4]:
X = dmd_cl3.drop(columns = 'price')
y = dmd_cl3['price']

#### Lets divide the dataset into train and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 1. LINEAR REGRESSION

In [8]:
#Train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [9]:
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

In [10]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lr ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lr ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_lr ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lr )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lr ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lr ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_lr))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lr )))

TRAIN : Mean Absolute Error: 840.0937760558036
TRAIN : Mean Squared Error: 1468633.5427640094
TRAIN : R2 Score: 0.908313879429201
TRAIN : Root Mean Squared Error: 1211.871916814648
----------------------------------------------------------
TEST : Mean Absolute Error: 849.842319911223
TEST : Mean Squared Error: 1536115.3017356512
TEST : R2 Score: 0.905827684126531
TEST : Root Mean Squared Error: 1239.4011867574


#### Cross Validation:

In [11]:
scores_lr = cross_val_score(lr, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lr

array([-1238.61413376, -1197.65068083, -1199.52372361])

## 2. LASSO

In [12]:
lasso = Lasso()
lasso.fit(X_train, y_train)

Lasso()

In [13]:
y_pred_train_lasso = lasso.predict(X_train)
y_pred_test_lasso = lasso.predict(X_test)

In [14]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lasso ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lasso ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_lasso ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lasso )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lasso ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lasso ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_lasso))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lasso )))

TRAIN : Mean Absolute Error: 839.6819084719341
TRAIN : Mean Squared Error: 1468641.7595684424
TRAIN : R2 Score: 0.9083133664578573
TRAIN : Root Mean Squared Error: 1211.8753069389782
----------------------------------------------------------
TEST : Mean Absolute Error: 849.4333350174829
TEST : Mean Squared Error: 1536278.1289934907
TEST : R2 Score: 0.9058177019201558
TEST : Root Mean Squared Error: 1239.466872890716


#### Cross Validation:

In [15]:
scores_lasso = cross_val_score(lasso, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lasso

array([-1238.61084476, -1197.63085032, -1199.55610937])

## 3. RIDGE

In [16]:
ridge = Ridge(alpha = 1)
ridge.fit(X_train, y_train)

Ridge(alpha=1)

In [17]:
y_pred_train_ridge = ridge.predict(X_train)
y_pred_test_ridge = ridge.predict(X_test)

In [18]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_ridge ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_ridge ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_ridge ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_ridge )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_ridge ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_ridge ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_ridge))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_ridge )))

TRAIN : Mean Absolute Error: 840.0684250313367
TRAIN : Mean Squared Error: 1468633.5716681674
TRAIN : R2 Score: 0.9083138776247277
TRAIN : Root Mean Squared Error: 1211.8719287400659
----------------------------------------------------------
TEST : Mean Absolute Error: 849.8189611966802
TEST : Mean Squared Error: 1536122.2671692513
TEST : R2 Score: 0.9058272571071448
TEST : Root Mean Squared Error: 1239.403996753783


#### Cross validation : 

In [19]:
scores_ridge = cross_val_score(lasso, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_ridge

array([-1238.61084476, -1197.63085032, -1199.55610937])

## 4. DECISSION TREE

In [20]:
tree = DecisionTreeClassifier(max_depth = 3)
tree.fit(X_train, y_train)
y_pred_train_tree = tree.predict(X_train)
y_pred_test_tree = tree.predict(X_test)

In [21]:
from sklearn import metrics
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_tree))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_tree))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_tree ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_tree)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_tree ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_tree ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_tree ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_tree )))

TRAIN : Mean Absolute Error: 3197.7442528735633
TRAIN : Mean Squared Error: 25781150.642349523
TRAIN : R2 Score: -0.6095054466750771
TRAIN : Root Mean Squared Error: 5077.514218822978
----------------------------------------------------------
TEST : Mean Absolute Error: 3217.8739340007414
TEST : Mean Squared Error: 26207376.826103076
TRAIN : R2 Score: -0.6066563270961598
TEST : Root Mean Squared Error: 5119.314097230515


#### Cross validation:

In [22]:
scores_tree = cross_val_score(tree, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_tree



array([-5118.60398967, -5053.8087188 , -5060.02403421])

## 5. RANDOM FOREST

In [23]:
randforest = RandomForestRegressor(n_estimators=20, random_state=10)
randforest.fit(X_train, y_train)
y_pred_train_randforest = randforest.predict(X_train)
y_pred_test_randforest = randforest.predict(X_test)

In [24]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_randforest))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_randforest))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_randforest ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_randforest)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_randforest ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_randforest ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_randforest ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_randforest )))

TRAIN : Mean Absolute Error: 117.12403937029957
TRAIN : Mean Squared Error: 52104.98567449762
TRAIN : R2 Score: 0.9967471095683266
TRAIN : Root Mean Squared Error: 228.26516526727775
----------------------------------------------------------
TEST : Mean Absolute Error: 291.1828631025281
TEST : Mean Squared Error: 316928.6148796818
TRAIN : R2 Score: 0.9805705329566943
TEST : Root Mean Squared Error: 562.9641328536675


#### Cross validation:

In [25]:
scores_randforest = cross_val_score(randforest, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_randforest

array([-589.24480233, -566.95978741, -581.47593902])

## 6. K-NEAREST NEIGHBOR

In [26]:
neiclass = KNeighborsClassifier(n_neighbors=5)
neiclass.fit(X_train, y_train)

KNeighborsClassifier()

In [27]:
y_pred_train_neiclass = neiclass.predict(X_train)
y_pred_test_neiclass = neiclass.predict(X_test)

In [28]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_neiclass))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_neiclass))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_neiclass ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_neiclass)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_neiclass ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_neiclass ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_neiclass ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_neiclass )))

TRAIN : Mean Absolute Error: 780.130175503646
TRAIN : Mean Squared Error: 1948727.1169509331
TRAIN : R2 Score: 0.8783418571061067
TRAIN : Root Mean Squared Error: 1395.9681647340433
----------------------------------------------------------
TEST : Mean Absolute Error: 947.2981090100111
TEST : Mean Squared Error: 2493769.2956371275
TRAIN : R2 Score: 0.847118227675391
TEST : Root Mean Squared Error: 1579.1672791813814


#### Cross validation:

In [29]:
scores_neighclass = cross_val_score(neiclass, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_neighclass



array([-1684.32544902, -1629.24545883, -1635.46588583])

## 7. BAYESIAN RIDGE

In [30]:
bay = BayesianRidge() 
bay.fit(X_train, y_train) 

BayesianRidge()

In [31]:
y_pred_train_bay = bay.predict(X_train)
y_pred_test_bay = bay.predict(X_test)

In [32]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_bay))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_bay))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_bay ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_bay)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_bay  ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_bay  ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_bay ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_bay  )))

TRAIN : Mean Absolute Error: 840.0819498783176
TRAIN : Mean Squared Error: 1468633.5490519234
TRAIN : R2 Score: 0.9083138790366495
TRAIN : Root Mean Squared Error: 1211.8719194089463
----------------------------------------------------------
TEST : Mean Absolute Error: 849.8314249918858
TEST : Mean Squared Error: 1536118.5431753665
TRAIN : R2 Score: 0.9058274854084502
TEST : Root Mean Squared Error: 1239.4024944203422


#### Cross validation:

In [33]:
scores_bay = cross_val_score(bay, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_bay

array([-1238.61266401, -1197.65077453, -1199.52516072])

## 8. ELASTIC NET

In [34]:
elnet = ElasticNet(random_state=0)
elnet.fit(X_train, y_train)

ElasticNet(random_state=0)

In [35]:
y_pred_train_elnet = elnet.predict(X_train)
y_pred_test_elnet = elnet.predict(X_test)

In [36]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_elnet))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_elnet))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_elnet ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_elnet)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_elnet))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_elnet))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_elnet ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_elnet)))

TRAIN : Mean Absolute Error: 1293.7728638320589
TRAIN : Mean Squared Error: 3682524.109297868
TRAIN : R2 Score: 0.770101703618646
TRAIN : Root Mean Squared Error: 1918.9903880160182
----------------------------------------------------------
TEST : Mean Absolute Error: 1317.4766352458994
TEST : Mean Squared Error: 3853165.1123562744
TRAIN : R2 Score: 0.7637797881035038
TEST : Root Mean Squared Error: 1962.9480666477843


#### Cross validation:

In [37]:
scores_elnet = cross_val_score(elnet, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_elnet

array([-1948.96537436, -1905.40019551, -1903.52576474])

## Modify train_size

In [38]:
X_train2, X_test2, y_train2, y_test2 = train_test_split( X, y, train_size=0.9)

## 1.1 LINEAR REGRESSION

In [39]:
#We will try it with the model with the lower rmse, in this case : linear regression

In [40]:
#Train the model
lr_2 = LinearRegression()
lr_2.fit(X_train2, y_train2)

LinearRegression()

In [41]:
y_pred_train_lr_2 = lr_2.predict(X_train2)
y_pred_test_lr_2 = lr_2.predict(X_test2)

In [42]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train2, y_pred_train_lr_2  ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train2, y_pred_train_lr_2  ))
print('TRAIN : R2 Score:', r2_score(y_train2, y_pred_train_lr_2  ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train2, y_pred_train_lr_2  )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test2, y_pred_test_lr_2 ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test2, y_pred_test_lr_2 ))
print('TEST : R2 Score:', r2_score(y_test2, y_pred_test_lr_2))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test2, y_pred_test_lr_2 )))

TRAIN : Mean Absolute Error: 843.464458884492
TRAIN : Mean Squared Error: 1467581.0102942032
TRAIN : R2 Score: 0.9088113723881722
TRAIN : Root Mean Squared Error: 1211.4375800239166
----------------------------------------------------------
TEST : Mean Absolute Error: 845.6628231059516
TEST : Mean Squared Error: 1612677.8726508885
TEST : R2 Score: 0.8987236888837004
TEST : Root Mean Squared Error: 1269.9125452765984


#### Cross validation:

In [43]:
scores_lr_2 = cross_val_score(lr_2, X_train2, y_train2, cv=3, scoring='neg_root_mean_squared_error')
scores_lr_2

array([-1181.92206622, -1221.40415956, -1232.07307878])

# Lets introduce the second dataset

In [44]:
#import the dataset:
dmd_pred = pd.read_csv("../resources/predict.csv",index_col="id")

In [45]:
dmd_pred.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.45,Premium,E,SI1,62.8,58.0,4.88,4.84,3.05
1,1.23,Ideal,H,SI1,61.0,56.0,6.96,6.92,4.23
2,0.33,Ideal,I,IF,61.8,55.0,4.46,4.47,2.76
3,0.51,Premium,D,VS2,58.0,60.0,5.29,5.26,3.06
4,0.4,Premium,E,VS2,62.2,59.0,4.71,4.74,2.94


In [46]:
# Same steps as with the dataset above:

In [47]:
dmd_pred = dmd_pred.drop(columns = ['x','y','z'])

In [48]:
dmd_pred["cut"].replace({"Ideal": 5, "Premium": 4, "Very Good" : 3, "Good" : 2, "Fair" : 1}, inplace=True)
dmd_pred["color"].replace({"D": 7, "E": 6, "F" : 5, "G" : 4, "H" : 3, "I": 2 , "J": 1}, inplace=True)
dmd_pred["clarity"].replace({"IF": 9, "VVS1": 8, "VVS2" : 7, 
                             "VS1" : 6, "VS2" : 5, "SI1": 4, 
                             "SI2" : 2, "I1": 1 }, inplace=True)

In [49]:
dmd_pred.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.45,4,6,4,62.8,58.0
1,1.23,5,3,4,61.0,56.0
2,0.33,5,2,9,61.8,55.0
3,0.51,4,7,5,58.0,60.0
4,0.4,4,6,5,62.2,59.0


In [52]:
y_pred_predict_test = lr_2 .predict(dmd_pred)

In [53]:
y_pred_predict_test

array([ 905.25119541, 7116.35146334, 1146.81679163, ..., 1645.79019377,
       -479.73666509, 4922.00233806])

In [54]:
dmd_pred['price'] = y_pred_predict_test
dmd_pred.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.45,4,6,4,62.8,58.0,905.251195
1,1.23,5,3,4,61.0,56.0,7116.351463
2,0.33,5,2,9,61.8,55.0,1146.816792
3,0.51,4,7,5,58.0,60.0,2386.512924
4,0.4,4,6,5,62.2,59.0,927.662883


In [55]:
#export

In [56]:
dmd_pred["price"].to_csv("final3.csv",header= True )