# Cleaning data 4 and models


In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


In [53]:
train_dummy=pd.read_csv("../input/train_dummy.csv")

In [54]:
predict_dummy=pd.read_csv("../input/predict_dummy.csv")
predict=pd.read_csv("../input/predict.csv", index_col="id")

In [5]:
# Drp x, y, z as they have high collinearity with carat

train_dropped_xyz=train_dummy.drop(["x","y","z"], axis=1)

In [6]:
train_dropped_xyz

Unnamed: 0,id,carat,depth,table,price,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.50,62.3,55.0,1845,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1.54,63.6,60.0,10164,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,2,1.32,61.7,60.0,5513,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
3,3,1.20,62.1,55.0,5174,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,4,1.73,61.2,60.0,10957,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.40,62.9,58.0,687,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
40451,40451,0.95,62.9,58.0,3984,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
40452,40452,0.63,61.2,56.0,2182,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
40453,40453,1.22,63.8,55.0,7201,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


# Random Forest

In [8]:
# Separate in train and test

y=train_dropped_xyz["price"]
X=train_dropped_xyz.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [22]:
def check_model(X_train, y_train, model, X_test, y_test):
    model.fit(X_train,y_train)
    y_pred_train=model.predict(X_train)
    y_pred_test=model.predict(X_test)
    
    r2_sc_train=r2_score(y_train, y_pred_train)
    r2_sc_test=r2_score(y_test, y_pred_test)
    rmse_train=(mean_squared_error(y_train, y_pred_train))**0.5
    rmse_test=(mean_squared_error(y_test, y_pred_test))**0.5
    
    return f"r2_score of the train data: {r2_sc_train}, r2_score of the test data: {r2_sc_test}, rmse of the train data: {rmse_train}, rmse of the test data: {rmse_test}"

In [10]:
ranf = RandomForestRegressor()
check_model(X_train, y_train, ranf, X_test, y_test)

'r2_score of the train data: 0.9963896364664596, r2_score of the test data: 0.975639507376954, rmse of the train data: 240.2969172940696, rmse of the test data: 630.6454591018805'

In [3]:
predict=pd.read_csv("../input/predict.csv", index_col="id")
train=pd.read_csv("../input/train.csv", index_col="id")

In [4]:
# Change categorical data to numerical
train

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.50,Ideal,D,VS2,62.3,55.0,5.11,5.07,3.17,1845
1,1.54,Good,I,VS1,63.6,60.0,7.30,7.33,4.65,10164
2,1.32,Very Good,J,SI2,61.7,60.0,6.95,7.01,4.31,5513
3,1.20,Ideal,I,SI1,62.1,55.0,6.83,6.79,4.23,5174
4,1.73,Premium,I,SI1,61.2,60.0,7.67,7.65,4.69,10957
...,...,...,...,...,...,...,...,...,...,...
40450,0.40,Very Good,F,SI1,62.9,58.0,4.69,4.72,2.96,687
40451,0.95,Premium,H,SI1,62.9,58.0,6.26,6.21,3.92,3984
40452,0.63,Ideal,F,VS2,61.2,56.0,5.56,5.51,3.39,2182
40453,1.22,Good,H,VS2,63.8,55.0,6.77,6.71,4.30,7201


# Cleaning data 5

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
label_encoder = LabelEncoder()

In [7]:
x=train["cut"]
y=label_encoder.fit_transform(x)

In [8]:
y

array([2, 1, 4, ..., 2, 1, 3])

In [9]:
train["cut"]=y

In [11]:
x=train["color"]
y=label_encoder.fit_transform(x)

In [12]:
train["color"]=y

In [14]:
x=train["clarity"]
y=label_encoder.fit_transform(x)

In [15]:
train["clarity"]=y

In [16]:
train

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.50,2,0,5,62.3,55.0,5.11,5.07,3.17,1845
1,1.54,1,5,4,63.6,60.0,7.30,7.33,4.65,10164
2,1.32,4,6,3,61.7,60.0,6.95,7.01,4.31,5513
3,1.20,2,5,2,62.1,55.0,6.83,6.79,4.23,5174
4,1.73,3,5,2,61.2,60.0,7.67,7.65,4.69,10957
...,...,...,...,...,...,...,...,...,...,...
40450,0.40,4,2,2,62.9,58.0,4.69,4.72,2.96,687
40451,0.95,3,4,2,62.9,58.0,6.26,6.21,3.92,3984
40452,0.63,2,2,5,61.2,56.0,5.56,5.51,3.39,2182
40453,1.22,1,4,5,63.8,55.0,6.77,6.71,4.30,7201


In [17]:
train_dropped_yz=train.drop(["z","y"], axis=1)

In [18]:
train_dropped_yz

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.50,2,0,5,62.3,55.0,5.11,1845
1,1.54,1,5,4,63.6,60.0,7.30,10164
2,1.32,4,6,3,61.7,60.0,6.95,5513
3,1.20,2,5,2,62.1,55.0,6.83,5174
4,1.73,3,5,2,61.2,60.0,7.67,10957
...,...,...,...,...,...,...,...,...
40450,0.40,4,2,2,62.9,58.0,4.69,687
40451,0.95,3,4,2,62.9,58.0,6.26,3984
40452,0.63,2,2,5,61.2,56.0,5.56,2182
40453,1.22,1,4,5,63.8,55.0,6.77,7201


In [None]:
# Try Random Forest

In [20]:
y=train_dropped_yz["price"]
X=train_dropped_yz.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [23]:
ranf = RandomForestRegressor()
check_model(X_train, y_train, ranf, X_test, y_test)

'r2_score of the train data: 0.9970904500058066, r2_score of the test data: 0.9798754968409421, rmse of the train data: 216.17382864049372, rmse of the test data: 569.6289531955084'

In [50]:
scoresRF= cross_val_score(ranf, X_train, y_train, cv=3, scoring="neg_root_mean_squared_error")
scoresRF

array([-590.48877791, -561.46503279, -605.45934523])

### Test 5 for Kaggle

In [55]:
x=predict["cut"]
y=label_encoder.fit_transform(x)

In [56]:
predict["cut"]=y

In [57]:
x=predict["color"]
y=label_encoder.fit_transform(x)

In [58]:
predict["color"]=y

In [59]:
x=predict["clarity"]
y=label_encoder.fit_transform(x)

In [60]:
predict["clarity"]=y

In [61]:
predict

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.45,3,1,2,62.8,58.0,4.88,4.84,3.05
1,1.23,2,4,2,61.0,56.0,6.96,6.92,4.23
2,0.33,2,5,1,61.8,55.0,4.46,4.47,2.76
3,0.51,3,0,5,58.0,60.0,5.29,5.26,3.06
4,0.40,3,1,5,62.2,59.0,4.71,4.74,2.94
...,...,...,...,...,...,...,...,...,...
13480,0.30,4,5,2,62.2,57.0,4.25,4.27,2.65
13481,1.01,2,6,5,62.3,55.0,6.40,6.44,4.00
13482,0.33,2,2,6,61.9,55.0,4.43,4.46,2.75
13483,0.30,3,3,5,59.3,59.0,4.42,4.38,2.61


In [62]:
# Drop column y, z in predict

predict_dropped_yz=predict.drop(["y", "z"], axis=1)
predict_dropped_yz

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.45,3,1,2,62.8,58.0,4.88
1,1.23,2,4,2,61.0,56.0,6.96
2,0.33,2,5,1,61.8,55.0,4.46
3,0.51,3,0,5,58.0,60.0,5.29
4,0.40,3,1,5,62.2,59.0,4.71
...,...,...,...,...,...,...,...
13480,0.30,4,5,2,62.2,57.0,4.25
13481,1.01,2,6,5,62.3,55.0,6.40
13482,0.33,2,2,6,61.9,55.0,4.43
13483,0.30,3,3,5,59.3,59.0,4.42


In [63]:
X_test_pred=predict_dropped_yz

In [64]:
y_pred_predict=ranf.predict(X_test_pred)
y_pred_predict

array([1008.38  , 6512.53  ,  761.09  , ...,  932.9865,  600.37  ,
       4505.24  ])

In [66]:
predict_dropped_yz["price"]=y_pred_predict
predict_dropped_yz

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.45,3,1,2,62.8,58.0,4.88,1008.380000
1,1.23,2,4,2,61.0,56.0,6.96,6512.530000
2,0.33,2,5,1,61.8,55.0,4.46,761.090000
3,0.51,3,0,5,58.0,60.0,5.29,1820.850000
4,0.40,3,1,5,62.2,59.0,4.71,976.100000
...,...,...,...,...,...,...,...,...
13480,0.30,4,5,2,62.2,57.0,4.25,432.380000
13481,1.01,2,6,5,62.3,55.0,6.40,4316.556667
13482,0.33,2,2,6,61.9,55.0,4.43,932.986500
13483,0.30,3,3,5,59.3,59.0,4.42,600.370000


In [67]:
predict["price"].to_csv("../output/test5.csv")