In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

print(pd.__version__)


0.24.2


In [2]:
df = pd.read_csv("datasets/diamonds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Now, we have to quantify the strings into numbers for the machine learning algorithm to interpret it
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [4]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5} # Hard coding the order

In [5]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [6]:
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

In [7]:
# mapping the required dictionary values to the main csv file content
df['cut'] = df['cut'].map(cut_class_dict) 
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [8]:
# Now that all the data is quantified, we shall now implement the Machine Learning Algorithm
# Library used: scikit learn

import sklearn
from sklearn.linear_model import SGDRegressor

In [9]:
df = sklearn.utils.shuffle(df) # Shuffling the data to avoid unnecessary patterns to be recognized by the algorithm
X = df.drop("price", axis=1).values
y = df["price"].values

In [10]:
test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [11]:
clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))



-6.000343230284148e+24


In [12]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

1.3130406929196004e+16 1273
1.281962938235121e+16 1195
1.288588345248633e+16 1211
1.4658624528325306e+16 530
416679738328290.44 3010
1.4322673117248836e+16 1718
1.5513270076613472e+16 2149
1.0624945356884466e+16 852
1322458203011127.5 3618
2250214577400780.2 4228


In [13]:
# The answers we are getting are weird, since normally the value is between 0 and 100
from sklearn import svm # Using Support Vector Machines
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))



-8.526093545946526e+25


In [14]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

-4.944305006246719e+16 1273
-4.828231103296829e+16 1195
-4.851221362767593e+16 1211
-5.5196562976182936e+16 530
-1836894927416091.0 3010
-5.390243839174032e+16 1718
-5.841648181700981e+16 2149
-4.005494189988217e+16 852
-5191559611205760.0 3618
-8687178924609589.0 4228


In [15]:
# overall:

import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
y = df["price"].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(f"model predicts {clf.predict([X])[0]}, real value: {y}")

0.5560438691264782
model predicts 5742.478422754121, real value: 4278
model predicts 5720.933392073917, real value: 8391
model predicts 5509.594682353778, real value: 7078
model predicts 2377.526777558537, real value: 1926
model predicts 6241.677750390178, real value: 12674
model predicts 1524.319648416521, real value: 1752
model predicts 3004.2233946469187, real value: 3316
model predicts 1156.9188075076938, real value: 583
model predicts 544.8189676287207, real value: 665
model predicts 2070.2088338235076, real value: 2009
