In [1]:
#import the libraries:
import pandas as pd 
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import BayesianRidge 
from sklearn.linear_model import ElasticNet

# Models:

In [2]:
# import the clean datasets
#%store -r dmd_cl1
#%store -r dmd_cl2
#%store -r dmd_cl3
%store -r dmd_cl4

In [3]:
X = dmd_cl4.drop(columns = 'price')
y = dmd_cl4['price']

#### Lets divide the dataset into train and test

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 1. LINEAR REGRESSION

In [7]:
#Train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [8]:
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

In [9]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lr ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lr ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_lr ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lr )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lr ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lr ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_lr))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lr )))

TRAIN : Mean Absolute Error: 852.175473463381
TRAIN : Mean Squared Error: 2020033.1170800016
TRAIN : R2 Score: 0.8754149549255629
TRAIN : Root Mean Squared Error: 1421.2786908555274
----------------------------------------------------------
TEST : Mean Absolute Error: 858.7397857037337
TEST : Mean Squared Error: 1999352.5996393466
TEST : R2 Score: 0.8711985488190204
TEST : Root Mean Squared Error: 1413.9846532545346


#### Cross Validation:

In [10]:
scores_lr = cross_val_score(lr, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lr

array([-1442.05080027, -1445.3175045 , -1401.89774985])

## 2. LASSO

In [11]:
lasso = Lasso()
lasso.fit(X_train, y_train)

Lasso()

In [12]:
y_pred_train_lasso = lasso.predict(X_train)
y_pred_test_lasso = lasso.predict(X_test)

In [13]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lasso ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lasso ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_lasso ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lasso )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lasso ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lasso ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_lasso))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lasso )))

TRAIN : Mean Absolute Error: 851.1621299495292
TRAIN : Mean Squared Error: 2020061.6385663033
TRAIN : R2 Score: 0.8754131958699184
TRAIN : Root Mean Squared Error: 1421.288724561728
----------------------------------------------------------
TEST : Mean Absolute Error: 857.6992960515477
TEST : Mean Squared Error: 1999458.8843523515
TEST : R2 Score: 0.8711917017899992
TEST : Root Mean Squared Error: 1414.0222361590893


#### Cross Validation:

In [14]:
scores_lasso = cross_val_score(lasso, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lasso

array([-1441.90787406, -1444.55661406, -1402.10349619])

## 3. RIDGE

In [15]:
ridge = Ridge(alpha = 1)
ridge.fit(X_train, y_train)

Ridge(alpha=1)

In [16]:
y_pred_train_ridge = ridge.predict(X_train)
y_pred_test_ridge = ridge.predict(X_test)

In [17]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_ridge ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_ridge ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_ridge ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_ridge )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_ridge ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_ridge ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_ridge))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_ridge )))

TRAIN : Mean Absolute Error: 852.2266726565718
TRAIN : Mean Squared Error: 2020033.3193652825
TRAIN : R2 Score: 0.875414942449668
TRAIN : Root Mean Squared Error: 1421.2787620186557
----------------------------------------------------------
TEST : Mean Absolute Error: 858.7869142659338
TEST : Mean Squared Error: 1999335.654705376
TEST : R2 Score: 0.8711996404384206
TEST : Root Mean Squared Error: 1413.9786613331107


#### Cross validation : 

In [18]:
scores_ridge = cross_val_score(lasso, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_ridge

array([-1441.90787406, -1444.55661406, -1402.10349619])

## 4. DECISSION TREE

In [19]:
tree = DecisionTreeClassifier(max_depth = 3)
tree.fit(X_train, y_train)
y_pred_train_tree = tree.predict(X_train)
y_pred_test_tree = tree.predict(X_test)

In [20]:
from sklearn import metrics
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_tree))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_tree))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_tree ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_tree)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_tree ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_tree ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_tree ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_tree )))

TRAIN : Mean Absolute Error: 3212.193054010629
TRAIN : Mean Squared Error: 26103056.43628723
TRAIN : R2 Score: -0.6098995779813383
TRAIN : Root Mean Squared Error: 5109.115034552191
----------------------------------------------------------
TEST : Mean Absolute Error: 3136.0723025583984
TEST : Mean Squared Error: 24909945.566555433
TRAIN : R2 Score: -0.6047380228931623
TEST : Root Mean Squared Error: 4990.9864322151225


#### Cross validation:

In [21]:
scores_tree = cross_val_score(tree, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_tree



array([-5075.22130942, -5147.86461424, -5104.33704787])

## 5. RANDOM FOREST

In [22]:
randforest = RandomForestRegressor(n_estimators=20, random_state=10)
randforest.fit(X_train, y_train)
y_pred_train_randforest = randforest.predict(X_train)
y_pred_test_randforest = randforest.predict(X_test)

In [23]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_randforest))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_randforest))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_randforest ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_randforest)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_randforest ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_randforest ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_randforest ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_randforest )))

TRAIN : Mean Absolute Error: 111.08137222742548
TRAIN : Mean Squared Error: 54261.03761457345
TRAIN : R2 Score: 0.9966534638665878
TRAIN : Root Mean Squared Error: 232.93998715242827
----------------------------------------------------------
TEST : Mean Absolute Error: 278.26770749098057
TEST : Mean Squared Error: 313399.29289488617
TRAIN : R2 Score: 0.9798103227358518
TEST : Root Mean Squared Error: 559.8207685455106


#### Cross validation:

In [24]:
scores_randforest = cross_val_score(randforest, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_randforest

array([-613.82782283, -631.86960727, -563.20517979])

## 6. K-NEAREST NEIGHBOR

In [25]:
neiclass = KNeighborsClassifier(n_neighbors=5)
neiclass.fit(X_train, y_train)

KNeighborsClassifier()

In [26]:
y_pred_train_neiclass = neiclass.predict(X_train)
y_pred_test_neiclass = neiclass.predict(X_test)

In [27]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_neiclass))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_neiclass))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_neiclass ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_neiclass)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_neiclass ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_neiclass ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_neiclass ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_neiclass )))

TRAIN : Mean Absolute Error: 439.0473674453096
TRAIN : Mean Squared Error: 1042632.5156037572
TRAIN : R2 Score: 0.935695896342365
TRAIN : Root Mean Squared Error: 1021.0937839413955
----------------------------------------------------------
TEST : Mean Absolute Error: 537.8214065010505
TEST : Mean Squared Error: 1206978.6863181312
TRAIN : R2 Score: 0.922244527368344
TEST : Root Mean Squared Error: 1098.6258172454036


#### Cross validation:

In [28]:
scores_neighclass = cross_val_score(neiclass, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_neighclass



array([-1150.17631824, -1196.42245583, -1222.37656518])

## 7. BAYESIAN RIDGE

In [29]:
bay = BayesianRidge() 
bay.fit(X_train, y_train) 

BayesianRidge()

In [30]:
y_pred_train_bay = bay.predict(X_train)
y_pred_test_bay = bay.predict(X_test)

In [31]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_bay))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_bay))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_bay ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_bay)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_bay  ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_bay  ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_bay ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_bay  )))

TRAIN : Mean Absolute Error: 852.2258191551457
TRAIN : Mean Squared Error: 2020033.3126902536
TRAIN : R2 Score: 0.8754149428613487
TRAIN : Root Mean Squared Error: 1421.2787596704081
----------------------------------------------------------
TEST : Mean Absolute Error: 858.7861301337499
TEST : Mean Squared Error: 1999335.9341976594
TRAIN : R2 Score: 0.8711996224330864
TEST : Root Mean Squared Error: 1413.9787601649678


#### Cross validation:

In [32]:
scores_bay = cross_val_score(bay, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_bay

array([-1442.02895715, -1445.38019441, -1401.8730691 ])

## 8. ELASTIC NET

In [33]:
elnet = ElasticNet(random_state=0)
elnet.fit(X_train, y_train)

ElasticNet(random_state=0)

In [34]:
y_pred_train_elnet = elnet.predict(X_train)
y_pred_test_elnet = elnet.predict(X_test)

In [35]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_elnet))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_elnet))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_elnet ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_elnet)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_elnet))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_elnet))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_elnet ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_elnet)))

TRAIN : Mean Absolute Error: 1141.8443303094045
TRAIN : Mean Squared Error: 3092193.11700413
TRAIN : R2 Score: 0.8092897509434411
TRAIN : Root Mean Squared Error: 1758.4632828137555
----------------------------------------------------------
TEST : Mean Absolute Error: 1118.4281928144233
TEST : Mean Squared Error: 2914613.8666154593
TRAIN : R2 Score: 0.812235972924438
TEST : Root Mean Squared Error: 1707.2240235585543


#### Cross validation:

In [36]:
scores_elnet = cross_val_score(elnet, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_elnet

array([-1775.01483506, -1787.33692765, -1716.47542902])

## Modify train_size

In [37]:
X_train2, X_test2, y_train2, y_test2 = train_test_split( X, y, train_size=0.9)

## 6.1 K-NEAREST NEIGHBOR

In [38]:
#We will try it with the model with the lower rmse, in this case : k-nearest neighbor

In [39]:
#Train the model
neiclass_2 = KNeighborsClassifier(n_neighbors=5)
neiclass_2.fit(X_train2, y_train2)

KNeighborsClassifier()

In [40]:
y_pred_train_neiclass_2 = neiclass_2.predict(X_train2)
y_pred_test_neiclass_2 = neiclass_2.predict(X_test2)

In [41]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train2, y_pred_train_neiclass_2))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train2, y_pred_train_neiclass_2))
print('TRAIN : R2 Score:', r2_score(y_train2, y_pred_train_neiclass_2 ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train2, y_pred_train_neiclass_2)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test2, y_pred_test_neiclass_2 ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test2, y_pred_test_neiclass_2 ))
print('TRAIN : R2 Score:', r2_score(y_test2, y_pred_test_neiclass_2 ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test2, y_pred_test_neiclass_2 )))

TRAIN : Mean Absolute Error: 409.67735999340823
TRAIN : Mean Squared Error: 879964.4396989755
TRAIN : R2 Score: 0.9452823841873197
TRAIN : Root Mean Squared Error: 938.0641980690743
----------------------------------------------------------
TEST : Mean Absolute Error: 508.405091448344
TEST : Mean Squared Error: 1084641.5563519525
TRAIN : R2 Score: 0.9323386201650913
TEST : Root Mean Squared Error: 1041.4612601301849


#### Cross validation:

In [42]:
scores_neighclass_2 = cross_val_score(neiclass_2, X_train2, y_train2, cv=3, scoring='neg_root_mean_squared_error')
scores_neighclass_2



array([-1110.27714994, -1123.4014509 , -1154.90835392])

# Lets introduce the second dataset

In [43]:
#import the dataset:
dmd_pred = pd.read_csv("../resources/predict.csv",index_col="id")

In [44]:
# Same steps as with the dataset above:

In [45]:
drop1_lc = dmd_pred.drop(columns=['depth'])
drop1_lc.head()

Unnamed: 0_level_0,carat,cut,color,clarity,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.45,Premium,E,SI1,58.0,4.88,4.84,3.05
1,1.23,Ideal,H,SI1,56.0,6.96,6.92,4.23
2,0.33,Ideal,I,IF,55.0,4.46,4.47,2.76
3,0.51,Premium,D,VS2,60.0,5.29,5.26,3.06
4,0.4,Premium,E,VS2,59.0,4.71,4.74,2.94


In [46]:
drop1_lc['cut_weight']=drop1_lc['cut']/drop1_lc['carat']
drop1_lc['color_weight']=drop1_lc['color']/drop1_lc['carat']
drop1_lc['clarity_weight']=drop1_lc['clarity']/drop1_lc['carat']
dmd_cl4 = drop1_lc.drop(['cut','color','clarity','depth','table'], axis=1)

TypeError: unsupported operand type(s) for /: 'str' and 'float'

In [55]:
y_pred_predict_test = neiclass_2 .predict(dmd_pred)

In [48]:
y_pred_predict_test

In [47]:
dmd_pred['price'] = y_pred_predict_test
dmd_pred.head()

In [58]:
#export

In [59]:
dmd_pred["price"].to_csv("final4.csv",header= True )