# Diamond Price Prediction

In [67]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [68]:
# Read the raw diamonds table from disk and display it.
diamonds_csv = "../../dataset/diamonds.csv"
df = pd.read_csv(diamonds_csv)
df

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [69]:
# Remove the leftover index column written into the CSV. The drop rebinds `df`
# instead of mutating it in place, and `errors="ignore"` makes the cell safe to
# re-run: a second execution no longer raises KeyError because the column is
# already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# 70/30 train/test split; the fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_train.shape

(37758, 10)

In [70]:
# Display the training split returned by train_test_split.
df_train

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
19497,1.21,Ideal,H,VVS2,61.3,57.0,8131,6.92,6.87,4.23
31229,0.31,Ideal,E,VS2,62.0,56.0,756,4.38,4.36,2.71
22311,1.21,Ideal,E,VS1,62.4,57.0,10351,6.75,6.83,4.24
278,0.81,Ideal,F,SI2,62.6,55.0,2795,5.92,5.96,3.72
6646,0.79,Ideal,I,VVS2,61.7,56.0,4092,5.94,5.95,3.67
...,...,...,...,...,...,...,...,...,...,...
11284,1.05,Very Good,I,VS2,62.4,59.0,4975,6.48,6.51,4.05
44732,0.47,Ideal,D,VS1,61.0,55.0,1617,5.03,5.01,3.06
38158,0.33,Very Good,F,IF,60.3,58.0,1014,4.49,4.46,2.70
860,0.90,Premium,J,SI1,62.8,59.0,2871,6.13,6.03,3.82


In [71]:
# Treat the '?' placeholder as a missing value, then remove exact duplicate
# rows. Each split is cleaned independently of the other.
df_train = df_train.replace('?', np.nan).drop_duplicates()
df_test = df_test.replace('?', np.nan).drop_duplicates()

In [72]:
# Drop every row that contains a missing value, in place, from the full frame
# and from both splits.
for frame in (df, df_train, df_test):
    frame.dropna(inplace=True)

In [73]:
# Remove any rows of the training split that still contain missing values, then
# report the per-column null counts (all zeros means the split is clean).
# A notebook cell renders only its final expression, so the counts are computed
# once, after the drop.
df_train = df_train.dropna()
df_train.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [74]:
# Remove any rows of the test split that still contain missing values, then
# report the per-column null counts (all zeros means the split is clean).
# A notebook cell renders only its final expression, so the counts are computed
# once, after the drop.
df_test = df_test.dropna()
df_test.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [75]:
# Collect the names of the non-numeric columns of the training split; these are
# the columns that get label-encoded further down.
non_numeric_train = df_train.select_dtypes(exclude=np.number)
categorical_train = non_numeric_train.columns
categorical_train

Index(['cut', 'color', 'clarity'], dtype='object')

In [76]:
# Integer-encode every categorical column of the training split.
# One LabelEncoder instance is re-fitted per column; the labels it learned for
# each column are remembered in `classes`, keyed by column name.
le = LabelEncoder()
classes = {}
for col in categorical_train:
    codes = le.fit_transform(df_train[col])
    df_train[col], classes[col] = codes, le.classes_

In [77]:
# Collect the names of the non-numeric columns of the test split.
non_numeric_test = df_test.select_dtypes(exclude=np.number)
categorical_test = non_numeric_test.columns
categorical_test

Index(['cut', 'color', 'clarity'], dtype='object')

In [78]:
# Encode the test split with the label -> integer mapping learned on the
# training split (`classes`, filled in by the training-encoding cell).
# Fitting a fresh LabelEncoder on the test data would assign different integers
# to the same label whenever the two splits do not contain exactly the same set
# of labels, silently corrupting the features passed to the model. When both
# splits do contain the same labels, the resulting codes are unchanged.
for col in categorical_test:
    if pd.api.types.is_numeric_dtype(df_test[col]):
        continue  # already encoded by an earlier run of this cell

    # LabelEncoder stores its labels sorted, and a label's code is its position.
    label_to_code = {label: code for code, label in enumerate(classes[col])}

    unseen_labels = set(df_test[col].unique()) - label_to_code.keys()
    if unseen_labels:
        raise ValueError(
            f"Column {col!r} has labels in the test split that were not seen "
            f"in the training split: {sorted(unseen_labels)}"
        )

    df_test[col] = df_test[col].map(label_to_code)

In [79]:
# Drop rows with missing values from both splits once more, in place, after
# the categorical columns have been encoded.
for split in (df_train, df_test):
    split.dropna(inplace=True)

In [80]:
# Split each frame into a feature matrix (every column except 'price') and the
# target vector ('price'), both as NumPy arrays.
x_train = df_train.drop(columns='price').values
y_train = df_train['price'].values
x_test = df_test.drop(columns='price').values
y_test = df_test['price'].values

In [81]:
# Ordinary least squares fit on the training data. `fit` returns the estimator
# itself, so the cell still ends by displaying the fitted model.
reg = LinearRegression().fit(x_train, y_train)
reg

In [82]:
# Predict prices for the held-out test features with the fitted OLS model.
y_pred = reg.predict(x_test)

In [83]:
# First ten predicted prices, for a quick comparison with the true values.
y_pred[:10]

array([  361.44079213,  3357.10580122,  2143.19154701,  2373.59136124,
       10549.76912504,  5688.926464  ,  1900.42363938,  2615.21437595,
        3074.83860193,  5817.44704998])

In [84]:
# First ten true prices from the test split, matching the predictions shown before.
y_test[:10]

array([ 559, 2201, 1238, 1304, 6901, 3011, 1765, 1679, 2102, 4789],
      dtype=int64)

In [85]:
# Test-set metrics for the OLS model, in the order (R^2, MSE, MAE).
# The metric functions come from the notebook's top import cell, so all
# dependencies are declared in one place.
r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

(0.8865849100292423, 1767318.9516595902, 855.083682915477)

In [86]:
# Fit a Ridge (L2-regularised) linear model with alpha=1.0 on the same training
# data and predict the test set. `Ridge` comes from the notebook's top import
# cell, so all dependencies are declared in one place.
# NOTE(review): the features are not standardised before fitting, so the penalty
# acts on coefficients of differently scaled columns — confirm this is intended.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(x_train, y_train)
y_pred_ridge = ridge_reg.predict(x_test)

In [87]:
# Test-set mean squared error of the Ridge predictions.
mean_squared_error(y_test, y_pred_ridge)

1767507.17324492

In [88]:
# Test-set mean absolute error of the Ridge predictions.
mean_absolute_error(y_test, y_pred_ridge)

855.5108362064632

In [89]:
# Test-set R^2 (coefficient of determination) of the Ridge predictions.
r2_score(y_test, y_pred_ridge)


0.8865728311863065