In [46]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [47]:
# Read the diamonds dataset from the working directory and preview the first rows.
df = pd.read_csv("diamonds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,cut_ord,color,clarity,clarity_ord,price
0,1,0.51,Premium,4,F,VS1,4,1749
1,2,2.25,Fair,1,G,I1,1,7069
2,3,0.7,Very Good,3,E,VS2,5,2757
3,4,0.47,Good,2,F,VS1,4,1243
4,5,0.3,Ideal,5,G,VVS1,7,789


In [48]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 8)

In [49]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   50000 non-null  int64  
 1   carat        50000 non-null  float64
 2   cut          50000 non-null  object 
 3   cut_ord      50000 non-null  int64  
 4   color        50000 non-null  object 
 5   clarity      50000 non-null  object 
 6   clarity_ord  50000 non-null  int64  
 7   price        50000 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 3.1+ MB


In [50]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe ()

Unnamed: 0.1,Unnamed: 0,carat,cut_ord,clarity_ord,price
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,25000.5,0.798597,3.90398,4.1267,3939.1035
std,14433.901067,0.474651,1.117043,1.665564,3995.879832
min,1.0,0.2,1.0,1.0,326.0
25%,12500.75,0.4,3.0,3.0,948.0
50%,25000.5,0.7,4.0,4.0,2402.5
75%,37500.25,1.04,5.0,5.0,5331.0
max,50000.0,5.01,5.0,8.0,18823.0


In [51]:
# Missing-value count per column.
df.isna().sum()

Unnamed: 0     0
carat          0
cut            0
cut_ord        0
color          0
clarity        0
clarity_ord    0
price          0
dtype: int64

In [52]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [53]:
# Remove the exported index column and the two pre-computed ordinal columns;
# only carat, cut, color, clarity and price remain.
redundant_columns = ["Unnamed: 0", "cut_ord", "clarity_ord"]
df = df.drop(columns=redundant_columns)
df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.51,Premium,F,VS1,1749
1,2.25,Fair,G,I1,7069
2,0.7,Very Good,E,VS2,2757
3,0.47,Good,F,VS1,1243
4,0.3,Ideal,G,VVS1,789


In [54]:
# Distinct cut grades, in order of first appearance.
df["cut"].unique()

array(['Premium', 'Fair', 'Very Good', 'Good', 'Ideal'], dtype=object)

In [55]:
# Distinct color grades, in order of first appearance.
df["color"].unique()

array(['F', 'G', 'E', 'D', 'H', 'J', 'I'], dtype=object)

In [56]:
# Distinct clarity grades, in order of first appearance.
df["clarity"].unique()

array(['VS1', 'I1', 'VS2', 'VVS1', 'SI1', 'VVS2', 'SI2', 'IF'],
      dtype=object)

In [57]:
# Integer codes for each categorical feature.
# NOTE: codes follow the order in which the categories first appear in the
# data; they are labels, not a quality ranking.
CATEGORY_CODES = {
    'cut': {'Premium': 0, 'Fair': 1, 'Very Good': 2, 'Good': 3, 'Ideal': 4},
    'color': {'F': 0, 'G': 1, 'E': 2, 'D': 3, 'H': 4, 'J': 5, 'I': 6},
    'clarity': {'VS1': 0, 'I1': 1, 'VS2': 2, 'VVS1': 3,
                'SI1': 4, 'VVS2': 5, 'SI2': 6, 'IF': 7},
}

for column, codes in CATEGORY_CODES.items():
    encoded = df[column].map(codes)
    # Series.map() yields NaN for any value missing from the dict -- including
    # the already-encoded integers if this cell is run a second time. Check
    # before assigning so the frame is never silently overwritten with NaN.
    if encoded.isna().any():
        raise ValueError(
            f"Column {column!r} contains values without a code "
            "(unknown category, or this cell was already run)."
        )
    df[column] = encoded.astype('int64')

In [58]:
# Preview the frame after the categorical columns were replaced by integer codes.
df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.51,0,0,0,1749
1,2.25,1,1,1,7069
2,0.7,2,2,2,2757
3,0.47,3,0,0,1243
4,0.3,4,1,3,789


In [59]:
# Target is the price column; every remaining column is a feature.
y = df["price"]
x = df.drop(columns=["price"])
print(x.shape, y.shape)

(50000, 4) (50000,)


In [60]:
# First few target (price) values.
y.head()

0    1749
1    7069
2    2757
3    1243
4     789
Name: price, dtype: int64

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=40,
)

In [63]:
# Shapes of the full, training and test feature frames, in that order.
print(*(frame.shape for frame in (x, x_train, x_test)))

(50000, 4) (40000, 4) (10000, 4)


In [64]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Seed passed to every estimator that accepts one, so that re-running the
# notebook reproduces the same fitted models and scores. Same value as the
# seed used for the train/test split.
MODEL_SEED = 40

# Fit five candidate regressors on the training split.
lr = LinearRegression()
lr.fit(x_train, y_train)

rf = RandomForestRegressor(random_state=MODEL_SEED)
rf.fit(x_train, y_train)

gbr = GradientBoostingRegressor(random_state=MODEL_SEED)
gbr.fit(x_train, y_train)

xg = XGBRegressor(random_state=MODEL_SEED)
xg.fit(x_train, y_train)

tree = DecisionTreeRegressor(random_state=MODEL_SEED)
tree.fit(x_train, y_train)


In [65]:
# score() on a regressor returns R^2; report it as a percentage on the training split.
train_reports = (
    ('Linear Regression Train Score is', lr),
    ('Random Forest Regression Train Score is', rf),
    ('Gradient Boosting Regression Train Score is', gbr),
    ('XGBoost Regression Train Score is', xg),
    ('Decision Tree Regression Train Score is', tree),
)
for caption, estimator in train_reports:
    print(caption, estimator.score(x_train, y_train) * 100)

Linear Regression Train Score is 86.3454018795412
Random Forest Regression Train Score is 99.07261717232684
Gradient Boosting Regression Train Score is 96.0972889714953
XGBoost Regression Train Score is 98.47749491088773
Decision Tree Regression Train Score is 99.19282808654188


In [66]:
# score() on a regressor returns R^2; report it as a percentage on the held-out split.
# Labels say "Regression" (these are regressors, not classifiers), matching
# the wording used for the training scores.
test_reports = (
    ('Linear Regression Test Score is', lr),
    ('Random Forest Regression Test Score is', rf),
    ('Gradient Boosting Regression Test Score is', gbr),
    ('XGBoost Regression Test Score is', xg),
    ('Decision Tree Regression Test Score is', tree),
)
for caption, estimator in test_reports:
    print(caption, estimator.score(x_test, y_test) * 100)

Linear classifier Test Score is 86.37574908783878
Random Forest classifier Test Score is 97.73639536355157
Gradient Boosting classifier Test Score is 95.9630959621831
XGBoost classifier Test Score is 97.99156102630384
Decision Tree classifier Test Score is 97.09058320390022


In [67]:
# R^2 (as a percentage) of the random forest regressor on the held-out split.
print('Random Forest Regression Test Score is', rf.score(x_test, y_test) * 100)

Random Forest classifier Test Score is 97.73639536355157


In [68]:
# Test-set predictions, one array per fitted model, in the order
# linear regression, random forest, gradient boosting, XGBoost, decision tree.
y_pred, y_pred2, y_pred3, y_pred4, y_pred5 = (
    estimator.predict(x_test) for estimator in (lr, rf, gbr, xg, tree)
)

In [69]:
from sklearn import metrics

In [70]:
# R^2 on the held-out split for each model's predictions.
# The linear-regression predictions are stored in `y_pred` (there is no
# `y_pred1`), so score1 must read that name to run on a fresh kernel.
score1 = metrics.r2_score(y_test, y_pred)
score2 = metrics.r2_score(y_test, y_pred2)
score3 = metrics.r2_score(y_test, y_pred3)
score4 = metrics.r2_score(y_test, y_pred4)
score5 = metrics.r2_score(y_test, y_pred5)

In [71]:
# R^2 values in model order: linear, random forest, gradient boosting, XGBoost, decision tree.
print (score1, score2, score3, score4, score5)

0.8637574908783878 0.9773639536355156 0.959630959621831 0.9799156102630384 0.9709058320390023


In [72]:
# Side-by-side table of the test-set R^2 of every model.
model_labels = ['lr', 'rc', 'gbc', 'XG', 'DT']
r2_values = [score1, score2, score3, score4, score5]
final_data = pd.DataFrame({'Models': model_labels, 'R2_SCORE': r2_values})
final_data

Unnamed: 0,Models,R2_SCORE
0,lr,0.863757
1,rc,0.977364
2,gbc,0.959631
3,XG,0.979916
4,DT,0.970906


In [73]:
# xg = Xg ()
# xg_final = xg.fit (x,y)

In [74]:
# Refit the XGBoost model on the full dataset (x, y) before saving it.
# After this call xg has been trained on the rows in x_test as well, so
# scoring xg on the test split is no longer a held-out estimate.
xg.fit(x,y)

In [75]:
import joblib

In [76]:
# Persist the refit XGBoost model to the file 'diamondd' in the working directory.
joblib.dump (xg, 'diamondd')

['diamondd']

In [77]:
# Reload the saved model from disk to check that the round trip works.
model = joblib.load ('diamondd')

In [78]:
# One sample row using the integer codes from the encoding step:
# cut 0 = Premium, color 1 = G, clarity 4 = SI1.
data_new = pd.DataFrame(
    {'carat': 1.22, 'cut': 0, 'color': 1, 'clarity': 4},
    index=[0],
)

In [79]:
# Predicted price for the single sample row.
model.predict (data_new)

array([6325.867], dtype=float32)

In [80]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [82]:
# Error metrics on the held-out split. `y_pred` holds the linear-regression
# predictions, so these numbers describe the linear baseline.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 982.3794353055251
Mean Squared Error: 2243381.731722405
Root Mean Squared Error: 1497.792285906963
R-squared: 0.8637574908783878


In [None]:
# xg_final.save_model ('diamond_xgb_model.json')

In [None]:
from tkinter import *
import joblib

def show_entry_fields():
    """Predict a diamond price from the form fields and display it.

    Reads carat, cut, color and clarity from the module-level entry widgets
    ``e1``..``e4``, loads the model saved as ``'diamondd'`` and shows the
    prediction in rows 5 and 6 of ``master`` (and on stdout).

    cut, color and clarity must be entered as the integer codes the model was
    trained on. If a field is empty or not numeric, a message is shown in
    row 6 and no prediction is made.
    """
    try:
        carat = float(e1.get())
        # Parse through float() so inputs like "2.0" are still accepted, then
        # convert to int to match the integer-coded training columns.
        cut, color, clarity = (int(float(entry.get())) for entry in (e2, e3, e4))
    except (ValueError, OverflowError):
        Label(master, text="Please enter a numeric value in every field").grid(row=6)
        return

    model = joblib.load('diamondd')
    # Build the feature row from the user's input. Previously a hard-coded
    # sample row was used here, so the form values were ignored.
    data_new = pd.DataFrame({
        'carat': carat,
        'cut': cut,
        'color': color,
        'clarity': clarity,
    }, index=[0])
    result = model.predict(data_new)
    Label(master, text="Diamond Purchase amount").grid(row=5)
    Label(master, text=result[0]).grid(row=6)
    print("Diamond Purchase amount", result[0])
    
# Build the main window with a banner row across both columns.
master = Tk()
master.title("Diamond Price Prediction Using Machine Learning")
# .grid() returns None, so `label` holds None rather than the widget.
label = Label(
    master,
    text="Diamond Price Prediction Using Machine Learning",
    bg="black",
    fg="white",
).grid(row=0, columnspan=2)

# Field captions in column 0, rows 1-4.
for row, caption in enumerate(("carat", "cut", "color", "clarity"), start=1):
    Label(master, text=caption).grid(row=row)

# One entry box per feature, placed next to its caption in column 1.
e1, e2, e3, e4 = (Entry(master) for _ in range(4))
for row, entry in enumerate((e1, e2, e3, e4), start=1):
    entry.grid(row=row, column=1)

# The button runs the prediction callback; it is placed on the next free row.
Button(master, text='Predict', command=show_entry_fields).grid()

# Blocks until the window is closed.
mainloop()

In [None]:
from sklearn import *