In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [2]:
diamonds = pd.read_csv("./INPUT/diamonds-datamad1019/data.csv/data.csv")

In [3]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,Ideal,G,SI2,61.9,57.0,8.44,8.36,5.2,12831
1,2.43,Very Good,H,SI2,63.2,57.0,8.56,8.5,5.39,16170
2,0.8,Premium,F,SI2,61.0,57.0,6.03,6.01,3.67,2797
3,0.4,Ideal,F,I1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,Ideal,G,VS2,61.6,55.0,4.39,4.37,2.7,698


-------------------------------------------------------------------------------------------------------------------

In [4]:
cond = (diamonds['y'] == 0) | (diamonds['x'] == 0) | (diamonds['z'] == 0) |(diamonds['y'] >= 20) | (diamonds['z'] >= 20) 
diamonds.drop(diamonds[cond].index, inplace = True)

In [5]:
columns = ['carat','depth', 'table', 'x', 'y', 'z']
for col in columns:
    diamonds[col] = (diamonds[col] - np.mean(diamonds[col])) / np.std(diamonds[col]) 
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,3.083462,Ideal,G,SI2,0.106054,-0.205696,2.41565,2.359838,2.398669,12831
1,3.442035,Very Good,H,SI2,1.013885,-0.205696,2.522714,2.485665,2.673215,16170
2,0.003954,Premium,F,SI2,-0.522444,-0.205696,0.26543,0.247757,0.187847,2797
3,-0.839746,Ideal,F,I1,1.083718,1.139229,-0.939049,-0.983542,-0.852539,630
4,-1.029579,Ideal,G,VS2,-0.103445,-1.102313,-1.197789,-1.226207,-1.213785,698


Create a 'ratio', which apparently affects diamonds price

In [6]:
#diamonds["ratio"] = diamonds.x / diamonds.y

Drop x & z, since we already have ratio and keep y, since it could give further information to the logarithm

In [7]:
#diamonds.drop(["x", "z", "y"], inplace =True, axis = 1)

Save 'price' in a new DF

In [8]:
d_price = pd.DataFrame(diamonds['price'])
d_price.shape

(40437, 1)

In [9]:
diamonds.drop(["price"], inplace=True , axis = 1)

----------------------------------------------------------------------------------------------------------------

Make the categorical columns numerical, following the order described in Kaggle - Diamonds - 1

In [10]:
cut_c = ('Fair', 'Good', 'Very Good', 'Premium', 'Ideal')
color_c = ('J','I','H','G','F','E','D')
clarity_c = ('I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF')

diamonds.clarity = pd.Categorical(diamonds.clarity, ordered = True, categories = clarity_c)
diamonds.cut = pd.Categorical(diamonds.cut, ordered = True, categories = cut_c)
diamonds.color = pd.Categorical(diamonds.color, ordered = True, categories = color_c)

diamonds['clarity_c'] = diamonds.clarity.cat.codes
diamonds['cut_c'] = diamonds.cut.cat.codes
diamonds['color_c']= diamonds.color.cat.codes

In [11]:
diamonds.drop(columns=['clarity','cut','color'], inplace = True)

In [12]:
diamonds.head()

Unnamed: 0,carat,depth,table,x,y,z,clarity_c,cut_c,color_c
0,3.083462,0.106054,-0.205696,2.41565,2.359838,2.398669,1,4,3
1,3.442035,1.013885,-0.205696,2.522714,2.485665,2.673215,1,2,2
2,0.003954,-0.522444,-0.205696,0.26543,0.247757,0.187847,1,3,4
3,-0.839746,1.083718,1.139229,-0.939049,-0.983542,-0.852539,0,4,4
4,-1.029579,-0.103445,-1.102313,-1.197789,-1.226207,-1.213785,3,4,3


------------------------------------------------------------------------------------------------------------------

## Fit, train, transform REPEAT

In [13]:
X = diamonds
y = d_price.price

In [14]:
print(X.shape, y.shape)

(40437, 9) (40437,)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

## Modelling

In [16]:

models = {
    #"kNeRe": KNeighborsRegressor(6),
    #"deciTree": DecisionTreeRegressor(random_state=0),
    #"forestRe": RandomForestRegressor(n_estimators=1000, min_samples_leaf=2, min_samples_split=6, n_jobs = -1, bootstrap= True, max_depth= 90),
    "forestRePlus": RandomForestRegressor(n_jobs = -1, bootstrap=True, max_depth=110, min_samples_leaf=2, min_samples_split=8, n_estimators=500),
    #"graBoostingRe": GradientBoostingRegressor(),
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)
    model.score(X_test, y_test)
    pred = model.predict(X_test)
    r2 = r2_score(y_test, pred)
    print(f'The R2 for the model {modelName} is: {r2}')


Training model: forestRePlus
The R2 for the model forestRePlus is: 0.9810820237075215


In [None]:
'''
p_test = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features':["auto"],
    'min_samples_leaf': [2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [500, 1000]
}
'''

In [None]:
'''
tuning=GridSearchCV(estimator=RandomForestRegressor(), param_grid=p_test, n_jobs = -1, cv = 5, verbose =5)
tuning.fit(X_train,y_train)
tuning.score(X_test, y_test)
pred= tuning.predict(X_test)
r2 = r2_score(y_test, pred)
r2
'''

In [None]:
#0.9838505949980909

In [None]:
#0.9816115781595326

-------------------------------------------------------------------------------------------------------------------

## Testing

In [27]:
test = pd.read_csv("./INPUT/diamonds-datamad1019/test.csv", index_col='id')

In [28]:
columns = ['carat','depth', 'table', 'x', 'y', 'z']
for col in columns:
    test[col] = (test[col] - np.mean(test[col])) / np.std(test[col]) 
test.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-0.500241,Ideal,I,VS2,0.241489,-1.537773,-0.383443,-0.360777,-0.340759
1,0.535739,Ideal,G,VS2,0.101972,-0.646693,0.744052,0.71177,0.740676
2,-0.627096,Premium,E,VS2,-0.177064,-0.646693,-0.553462,-0.595116,-0.585884
3,0.007178,Ideal,F,VS1,0.729801,-0.646693,0.162408,0.207042,0.279264
4,-0.542526,Ideal,G,VS1,-0.246823,-0.201152,-0.446082,-0.477947,-0.48495


In [29]:
#test.drop(["x", "z", "y"], inplace =True, axis = 1)

In [30]:
cut_c = ('Fair', 'Good', 'Very Good', 'Premium', 'Ideal')
color_c = ('J','I','H','G','F','E','D')
clarity_c = ('I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF')

test.clarity = pd.Categorical(test.clarity, ordered = True, categories = clarity_c)
test.cut = pd.Categorical(test.cut, ordered = True, categories = cut_c)
test.color = pd.Categorical(test.color, ordered = True, categories = color_c)

test['clarity_c'] = test.clarity.cat.codes
test['cut_c'] = test.cut.cat.codes
test['color_c']= test.color.cat.codes

In [31]:
test.drop(columns=['clarity','cut','color'], inplace = True)

In [32]:
test.head()

Unnamed: 0_level_0,carat,depth,table,x,y,z,clarity_c,cut_c,color_c
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-0.500241,0.241489,-1.537773,-0.383443,-0.360777,-0.340759,3,4,1
1,0.535739,0.101972,-0.646693,0.744052,0.71177,0.740676,3,4,3
2,-0.627096,-0.177064,-0.646693,-0.553462,-0.595116,-0.585884,3,3,5
3,0.007178,0.729801,-0.646693,0.162408,0.207042,0.279264,4,4,4
4,-0.542526,-0.246823,-0.201152,-0.446082,-0.477947,-0.48495,4,4,3


In [33]:
for modelName, model in models.items():
    pr = {'price':model.predict(test)}
    df1 =pd.DataFrame(pr)

In [None]:
pr = {'price':tuning.predict(test)}
df1 = pd.DataFrame(pr)

In [34]:
df1.index.rename('id', inplace=True)

In [35]:
df1.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,1286.139199
1,6651.924691
2,1707.183133
3,3985.302647
4,1761.649483


In [36]:
df1.to_csv("./OUTPUT/test7.csv")