In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
import os
from pathlib import Path

# Working directory that holds diamonds.csv and receives the pickled model.
# Set the DIAMOND_PROJECT_DIR environment variable to point elsewhere; the
# default is the author's original location.
PROJECT_DIR = Path(os.environ.get('DIAMOND_PROJECT_DIR',
                                  'C:\\Users\\admin\\ml_deployment\\model_deploy_heroku'))
# Only switch directories when the target exists, so the notebook still runs
# from its own folder on machines that do not have this path.
if PROJECT_DIR.is_dir():
    os.chdir(PROJECT_DIR)

In [3]:
# Load the diamonds table; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='diamonds.csv')

#### Content

- price: price in US dollars (\$326--\$18,823)

- carat: weight of the diamond (0.2--5.01)

- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

- color: diamond colour, from J (worst) to D (best)

- clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

- x: length in mm (0--10.74)

- y: width in mm (0--58.9)

- z: depth in mm (0--31.8)

- depth: total depth percentage = 100 * z / mean(x, y) = 200 * z / (x + y) (43--79)

- table: width of top of diamond relative to widest point (43--95)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
# Cut grades listed explicitly from best to worst, so that integer codes derived
# from list position reflect quality rather than how often each grade happens
# to occur in the data.
cut_list = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
# Fail fast if the data contains a grade this ordering does not cover; such a
# value would otherwise be mapped to NaN when the codes are applied.
assert set(df.cut.dropna().unique()) <= set(cut_list), 'unexpected cut grade in data'
cut_list

['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [6]:
# Map each cut grade to its 1-based position in cut_list.
cut_dict = {grade: rank for rank, grade in enumerate(cut_list, start=1)}

In [7]:
# Show the cut-grade -> integer-code mapping built from cut_list.
cut_dict

{'Ideal': 1, 'Premium': 2, 'Very Good': 3, 'Good': 4, 'Fair': 5}

In [8]:
# Distinct colour grades present in the data, sorted alphabetically.
color_list = sorted(df.color.value_counts().index)
color_list

['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [9]:
# Pair every colour grade with its 1-based position in color_list.
color_dict = dict(zip(color_list, range(1, len(color_list) + 1)))

In [10]:
# Show the colour-grade -> integer-code mapping built from color_list.
color_dict

{'D': 1, 'E': 2, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7}

In [11]:
# Clarity grades present in the data, most frequent first (value_counts order).
clarity_list = df.clarity.value_counts().index.tolist()
clarity_list

['SI1', 'VS2', 'SI2', 'VS1', 'VVS2', 'VVS1', 'IF', 'I1']

In [12]:
# Hand-written clarity codes: 1 is assigned to IF and 8 to I1, with the
# remaining grades in between. Keys are kept in the same insertion order.
clarity_dict = dict(
    SI1=6,
    VS2=5,
    SI2=7,
    VS1=4,
    VVS2=3,
    VVS1=2,
    IF=1,
    I1=8,
)
clarity_dict

{'SI1': 6,
 'VS2': 5,
 'SI2': 7,
 'VS1': 4,
 'VVS2': 3,
 'VVS1': 2,
 'IF': 1,
 'I1': 8}

In [13]:
# Add one integer-coded column per categorical feature, in the order cut, color, clarity.
for column, codes in (('cut', cut_dict), ('color', color_dict), ('clarity', clarity_dict)):
    df[f'{column}_code'] = df[column].map(codes)

In [14]:
# Preview the first five rows again to check the new *_code columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_code,color_code,clarity_code
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1,2,7
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,2,2,6
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,4,2,4
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,2,6,5
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,4,7,7


In [15]:
# List the frame's column labels, including the cut_code / color_code / clarity_code columns.
df.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z', 'cut_code', 'color_code', 'clarity_code'],
      dtype='object')

In [16]:
# Feature matrix: three numeric measurements followed by the three encoded grades.
x = df.loc[:, ['carat', 'depth', 'table', 'color_code', 'cut_code', 'clarity_code']]
# Regression target: the price column.
y = df['price']

In [17]:
# Fit a linear regression of price on the six selected features.
model = LinearRegression()
model = model.fit(x, y)  # fit() returns the estimator itself; reassigning keeps the cell output-free

In [18]:
# R^2 computed on the same rows the model was fitted on (a training score, not a held-out estimate).
model.score(X=x, y=y)

0.9043956047696529

In [19]:
# Sample diamond in training-column order: carat 0.5, depth 65, table 58,
# color_code 2, cut_code 3, clarity_code 4.
# The values are wrapped in a frame that reuses the training column labels, which
# makes the feature order explicit and matches the names the model was fitted
# with (recent scikit-learn versions warn when an unlabeled array is passed).
model.predict(pd.DataFrame([[.5, 65, 58, 2, 3, 4]], columns=x.columns))

array([2046.067526])

In [20]:
import pickle

In [21]:
# Serialize the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('diamond_price_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [22]:
# Reload the model from disk to confirm that the pickle round-trips.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('diamond_price_model.pkl', 'rb') as model_file:
    diamond_model = pickle.load(model_file)

In [23]:
# Predict the same sample with the reloaded model; the result should match the
# in-memory model's prediction. The frame reuses the training column labels so
# the feature order is explicit and the input carries the fitted feature names.
diamond_model.predict(pd.DataFrame([[.5, 65, 58, 2, 3, 4]], columns=x.columns))

array([2046.067526])

In [24]:
# Build the single sample directly as a 1 x 6 array (one row, six features).
np.array([[.5, 65, 58, 2, 3, 4]])

array([[ 0.5, 65. , 58. ,  2. ,  3. ,  4. ]])