In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_csv('diamonds.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df = df.drop('Unnamed: 0', axis=1)
df = df.drop(df.loc[df['x'] == 0].index)
df = df.drop(df.loc[df['y'] == 0].index)
df = df.drop(df.loc[df['z'] == 0].index)

In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
len(df[(df['x']==0) | (df['y']==0) | (df['z']==0)])

0

## One Hot Encoding

In [6]:
cut = pd.get_dummies(df['cut'])
color = pd.get_dummies(df['color'])
clarity = pd.get_dummies(df['clarity'])
df = df.drop(['cut','color','clarity'],axis=1)
df = pd.concat([cut,color,clarity,df],axis=1)

pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,Fair,Good,Ideal,Premium,Very Good,D,E,F,G,H,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2,carat,depth,table,price,x,y,z
0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0.31,63.3,58.0,335,4.34,4.35,2.75


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [8]:
data = df.drop(['price'],axis =1)
target = df['price']

In [9]:
def calc_train_error(X_train, y_train, model):
#     '''return in-sample error for already fit model.'''    
    predictions = model.predict(X_train)
    mae_train_score = mean_absolute_error(y_train, predictions)
    mse_train_score = mean_squared_error(y_train, predictions)
    rmse_train_score = np.sqrt(mse_train_score)
    r2_train_score = np.sqrt(mean_squared_error(y_train, predictions))
    accuracy = (model.score(X_train,y_train)*100)
    return {'mae': mae_train_score,
            'mse': mse_train_score,
            'rmse': rmse_train_score,
            'r2_score': r2_train_score,
            'accuracy': accuracy}

def calc_validation_error(X_test, y_test, model):
#     '''return out-sample error for already fit model.'''      
    predictions = model.predict(X_test)
    mae_test_score = mean_absolute_error(y_test, predictions)
    mse_test_score = mean_squared_error(y_test, predictions)
    rmse_test_score = np.sqrt(mse_test_score)
    r2_test_score = np.sqrt(mean_squared_error(y_test, predictions))
    accuracy = (model.score(X_test,y_test)*100)
    return {'mae': mae_test_score,
            'mse': mse_test_score,
            'rmse': rmse_test_score,
            'r2_score': r2_test_score,
            'accuracy': accuracy}

def calc_metrics(X_train, y_train, X_test, y_test, model):
#     '''fit model and return the RMSE for in-sample error and out-sample error'''     
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

In [10]:
from sklearn.model_selection import KFold

K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

In [11]:
train_errors = []
validation_errors = []
for train_index, val_index in kf.split(data):
    
    #split data
    X_train, X_val = data.iloc[train_index], data.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]
    
#     print(len(X_val), (len(X_train)+ len(X_val)))
    
    #instantiate model
    model = RandomForestRegressor(n_estimators = 10, random_state = 42)
    
    #calculate errors 
    train_error, val_error = calc_metrics(X_train, y_train, X_val, y_val, model)
    
    #append to appropriate list
    train_errors.append(train_error)
    validation_errors.append(val_error)

In [12]:
for i, tr_err, val_err in zip(range(1,6,1),train_errors, validation_errors) :
    print('mae Train ke ' + str(i) + ' : '+ str(tr_err['mae']))
    print('mae Validation ke ' + str(i) + ' : '+ str(val_err['mae']))
    print('\n')
    print('mse Train ke ' + str(i) + ' : '+ str(tr_err['mse']))
    print('mse Validation ke ' + str(i) + ' : '+ str(val_err['mse']))
    print('\n')
    print('rmse Train ke ' + str(i) + ' : '+ str(tr_err['rmse']))
    print('rmse Validation ke ' + str(i) + ' : '+ str(val_err['rmse']))
    print('\n')
    print('r2 score Train ke ' + str(i) + ' : '+ str(tr_err['r2_score']))
    print('r2 score Validation ke ' + str(i) + ' : '+ str(val_err['r2_score']))
    print('\n')
    print('Accuracy Train ke ' + str(i) + ' : '+ str(tr_err['accuracy']))
    print('Accuracy Validation ke ' + str(i) + ' : '+ str(val_err['accuracy']))
    print('\n')

mae Train ke 1 : 113.07268904218773
mae Validation ke 1 : 277.1469012072559


mse Train ke 1 : 59054.808242966574
mse Validation ke 1 : 301149.6887566698


rmse Train ke 1 : 243.01195082334237
rmse Validation ke 1 : 548.7710713555059


r2 score Train ke 1 : 243.01195082334237
r2 score Validation ke 1 : 548.7710713555059


Accuracy Train ke 1 : 99.6277561923595
Accuracy Validation ke 1 : 98.12156554580802


mae Train ke 2 : 112.92148463994278
mae Validation ke 2 : 282.6906184647449


mse Train ke 2 : 57777.74062429524
mse Validation ke 2 : 332249.1652198809


rmse Train ke 2 : 240.370007747005
rmse Validation ke 2 : 576.4105873592893


r2 score Train ke 2 : 240.370007747005
r2 score Validation ke 2 : 576.4105873592893


Accuracy Train ke 2 : 99.63917111795648
Accuracy Validation ke 2 : 97.84766269461922


mae Train ke 3 : 112.71348576742149
mae Validation ke 3 : 285.5956766196835


mse Train ke 3 : 57118.21435877231
mse Validation ke 3 : 335357.2865594326


rmse Train ke 3 : 238.9941722