In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

In [3]:
#loading the data
data = pd.read_csv('DiamondsPrices.csv')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
data['volume'] = data['x'] * data['y'] * data['z']
data = data.drop(columns= ['x','y','z'])
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,volume
0,0.23,Ideal,E,SI2,61.5,55.0,326,38.20203
1,0.21,Premium,E,SI1,59.8,61.0,326,34.505856
2,0.23,Good,E,VS1,56.9,65.0,327,38.076885
3,0.29,Premium,I,VS2,62.4,58.0,334,46.72458
4,0.31,Good,J,SI2,63.3,58.0,335,51.91725


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   volume   53940 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 3.3+ MB


In [6]:
#treating duplicates
#dropping the duplicate values
data.drop_duplicates(keep='first', inplace=True)
data.shape

(53794, 8)

In [7]:
#treating the null values
for column in data:
    print(("Number of zeros in {} is {}".format(column,(data[column]==0).sum())))

Number of zeros in carat is 0
Number of zeros in cut is 0
Number of zeros in color is 0
Number of zeros in clarity is 0
Number of zeros in depth is 0
Number of zeros in table is 0
Number of zeros in price is 0
Number of zeros in volume is 19


In [8]:
#dropping all zero values from x,y and z columns
data = data[(data[['volume']] != 0).all(axis=1)]
data.shape

(53775, 8)

In [9]:
#calculating the quantiles q1, q3
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)


#calculating the IQR
IQR = q3-q1

#calculating the upper limit
u_l = q3+(1.5*IQR)

#CALCULATING THE lower_limit
l_l = q1-(1.5*IQR)

print(u_l)
print(l_l)

#dropping the upper outliers
data = data[~((data > u_l).any(axis=1) | (data < l_l).any(axis=1))]
data.shape

carat         2.000000
depth        64.750000
table        63.500000
price     11883.500000
volume      329.276638
dtype: float64
carat       -0.560000
depth       58.750000
table       51.500000
price    -5608.500000
volume     -93.222606
dtype: float64


(47408, 8)

In [10]:
#calculating the quantiles q1, q3
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
print(Q1)
print(Q3)

carat       0.38000
depth      61.10000
table      56.00000
price     891.00000
volume     61.75494
Name: 0.25, dtype: float64
carat        1.010000
depth       62.500000
table       59.000000
price     4637.000000
volume     163.428804
Name: 0.75, dtype: float64


In [11]:
#calculating the IQR
iqr = Q3-Q1
iqr

carat        0.630000
depth        1.400000
table        3.000000
price     3746.000000
volume     101.673864
dtype: float64

In [12]:
#calculating the upper limit
u = Q3+(1.5*iqr)

#CALCULATING THE lower_limit
l = Q1-(1.5*iqr)

print(u)
print(l)

carat         1.9550
depth        64.6000
table        63.5000
price     10256.0000
volume      315.9396
dtype: float64
carat       -0.565000
depth       59.000000
table       51.500000
price    -4728.000000
volume     -90.755856
dtype: float64


In [13]:
#dropping the upper outliers
data = data[~((data > u).any(axis=1) | (data < l).any(axis=1))]
data.shape

(45756, 8)

In [14]:
#calculating the quantiles q1, q3
Q1_1 = data.quantile(0.25)
Q3_3 = data.quantile(0.75)
print(Q1_1)
print(Q3_3)

carat       0.370000
depth      61.200000
table      56.000000
price     876.000000
volume     60.507992
Name: 0.25, dtype: float64
carat        1.0100
depth       62.5000
table       59.0000
price     4416.0000
volume     161.4912
Name: 0.75, dtype: float64


In [15]:
#calculating the IQR
iqr_1 = Q3_3-Q1_1
iqr_1

carat        0.640000
depth        1.300000
table        3.000000
price     3540.000000
volume     100.983208
dtype: float64

In [16]:
#calculating the upper limit
ul = Q3_3+(1.5*iqr_1)
ll = Q1_1-(1.5*iqr_1)

In [17]:
#count of upper quantiles
print(data[data<ll].count())
print(data[data>ul].count())

carat        0
cut          0
color        0
clarity      0
depth      585
table        0
price        0
volume       0
dtype: int64
carat        0
cut          0
color        0
clarity      0
depth      180
table        0
price      472
volume       0
dtype: int64


In [18]:
#dropping the upper outliers
data = data[~((data > ul).any(axis=1) | (data < ll).any(axis=1))]
data.shape

(44529, 8)

In [19]:
#calculating the quantiles q1, q3
Q_1 = data.quantile(0.25)
Q_3 = data.quantile(0.75)
print(Q_1)
print(Q_3)

carat       0.360000
depth      61.200000
table      56.000000
price     870.000000
volume     59.929794
Name: 0.25, dtype: float64
carat        1.000000
depth       62.500000
table       59.000000
price     4326.000000
volume     160.468581
Name: 0.75, dtype: float64


In [20]:
#calculating the IQR
iqr1 = Q_3-Q_1
iqr1

carat        0.640000
depth        1.300000
table        3.000000
price     3456.000000
volume     100.538787
dtype: float64

In [21]:
#calculating the upper limit
ul = Q_3+(1.5*iqr1)
ll = Q_1-(1.5*iqr1)

In [22]:
#count of upper quantiles
print(data[data<ll].count())
print(data[data>ul].count())

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
volume     0
dtype: int64
carat        0
cut          0
color        0
clarity      0
depth        0
table        0
price      209
volume       0
dtype: int64


In [23]:
#5 point summary
data.describe()

Unnamed: 0,carat,depth,table,price,volume
count,44529.0,44529.0,44529.0,44529.0,44529.0
mean,0.678552,61.830546,57.200624,2810.214894,110.596655
std,0.345708,1.026103,2.000676,2353.554612,55.78535
min,0.2,59.3,52.0,326.0,31.707984
25%,0.36,61.2,56.0,870.0,59.929794
50%,0.57,61.9,57.0,1911.0,93.416706
75%,1.0,62.5,59.0,4326.0,160.468581
max,1.91,64.4,63.3,9724.0,301.701959


In [24]:
#seperating the numeric and categorical features

#numeric Features
df_num = data.select_dtypes(include=['int','float'])
#categorical features
df_cat = data.select_dtypes(include='object')

In [25]:
#Ordinal Encoding
#Replacing the categorical value colour, cut and clarity without using built in function for categorical data
df_cat=df_cat.replace({'color' : { 'D' : 6, 'E' : 5, 'F' : 4, 'G' : 3, 'H': 2, 'I':1, 'J':0}})
df_cat=df_cat.replace({'cut': {'Ideal':4, 'Premium': 3, 'Very Good':2, 'Good':1, 'Fair':0}})
df_cat=df_cat.replace({'clarity': {"IF": 8, 'VVS1' :7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1':3, 'SI2': 2, 'I1':1, 'I2':0, 'I3':0}})
#Visualize the data frame
df_cat.head()

Unnamed: 0,cut,color,clarity
0,4,5,2
1,3,5,3
3,3,1,4
4,1,0,2
5,2,0,6


In [26]:
#concatinating the num and cat
df = pd.concat([df_num, df_cat], axis = 1)
df.head()

Unnamed: 0,carat,depth,table,price,volume,cut,color,clarity
0,0.23,61.5,55.0,326,38.20203,4,5,2
1,0.21,59.8,61.0,326,34.505856,3,5,3
3,0.29,62.4,58.0,334,46.72458,3,1,4
4,0.31,63.3,58.0,335,51.91725,1,0,2
5,0.24,62.8,57.0,336,38.693952,2,0,6


In [27]:
#label
Y = data['price']

In [28]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(df,Y, test_size= 0.2)

In [29]:
#predecting the by using ML algorithm
Lr_model = LinearRegression() #Create a linear regression model
Lr_model.fit(X_train,y_train) #fitting the data

LinearRegression()

In [30]:
#Predict the value of y based on the train data
y_train_pred =  Lr_model.predict(X_train)

In [31]:
#performance metrics on train data 
MAPE_train = mean_absolute_percentage_error(y_train, y_train_pred)
MSE_train = mean_squared_error(y_train, y_train_pred)
RMSE_train = np.sqrt(MSE_train)
print("Mean Absolute Percentage Error", MAPE_train)
print("Mean Square Error is", MSE_train)
print("Root Mean Square Error is", RMSE_train)

Mean Absolute Percentage Error 2.6409557856398098e-15
Mean Square Error is 1.4529941689953266e-23
Root Mean Square Error is 3.811816061925505e-12


In [32]:
#Predict the value of y based on the test data
y_test_pred =  Lr_model.predict(X_test)

In [33]:
#performance metrics on train data 
MAPE_test = mean_absolute_percentage_error(y_test, y_test_pred)
MSE_test = mean_squared_error(y_test, y_test_pred)
RMSE_test = np.sqrt(MSE_test)
print("Mean Absolute Percentage Error", MAPE_test)
print("Mean Square Error is", MSE_test)
print("Root Mean Square Error is", RMSE_test)

Mean Absolute Percentage Error 2.642350993648762e-15
Mean Square Error is 1.4146551870054387e-23
Root Mean Square Error is 3.7611902198711496e-12


##### Pickle File

In [34]:
import pickle

# Save the trained model as a pickle file
with open('Prediction_model.pkl', 'wb') as f:
    pickle.dump(Lr_model, f)