In [1]:
import pandas as pd

df = pd.read_csv('../datasets/diamonds.csv', index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
cut_class_dict = {
    'Fair': 1,
    'Good': 2,
    'Very Good': 3,
    'Premium': 4,
    'Ideal': 5
}
clarity_dict = {
    "I3": 1, 
    "I2": 2, 
    "I1": 3, 
    "SI2": 4, 
    "SI1": 5, 
    "VS2": 6, 
    "VS1": 7, 
    "VVS2": 8, 
    "VVS1": 9, 
    "IF": 10, 
    "FL": 11
}
color_dict = {
    "J": 1,
    "I": 2,
    "H": 3,
    "G": 4,
    "F": 5,
    "E": 6,
    "D": 7
}

df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [6]:
import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df)

X = df.drop('price', axis=1).values
X = preprocessing.scale(X)
y = df['price'].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR(kernel='linear')
clf.fit(X_train, y_train)

SVR(kernel='linear')

In [7]:
clf.score(X_test, y_test)

0.8759130771370137

In [9]:
for X, y in zip(X_test, y_test):
    print(f'Model: {clf.predict([X])[0]}, Actual: {y}')

Model: 5372.854590153892, Actual: 6504
Model: 5485.176057103326, Actual: 7528
Model: 1857.236219232375, Actual: 1662
Model: 213.60366406181947, Actual: 855
Model: 203.39847473998498, Actual: 683
Model: 4567.22791704505, Actual: 4036
Model: -42.93064758545415, Actual: 507
Model: 3644.7615960679213, Actual: 3007
Model: 11548.357497796684, Actual: 9999
Model: 12558.46995682185, Actual: 14452
Model: 7196.022165063072, Actual: 7747
Model: -500.12761189702996, Actual: 574
Model: 935.512345480794, Actual: 823
Model: 2428.877903822049, Actual: 2063
Model: 5763.786867373501, Actual: 6387
Model: 2428.33442012243, Actual: 2161
Model: 3830.134850760414, Actual: 3084
Model: 9276.002936108911, Actual: 15466
Model: 1811.3768148055512, Actual: 1235
Model: 6222.757671290325, Actual: 7372
Model: 1287.2354787721292, Actual: 1155
Model: 884.0325072788482, Actual: 750
Model: 1684.3143133796614, Actual: 1815
Model: 2457.1382272742667, Actual: 2294
Model: 4269.57503274378, Actual: 3511
Model: 5035.2858752204

In [10]:
clf = svm.SVR(kernel='rbf')
clf.fit(X_train, y_train)

SVR()

In [11]:
clf.score(X_test, y_test)

0.5700863288418798

In [12]:
for X, y in zip(X_test, y_test):
    print(f'Model: {clf.predict([X])[0]}, Actual: {y}')

Model: 5586.245555588832, Actual: 6504
Model: 4703.770401905743, Actual: 7528
Model: 1522.357949536243, Actual: 1662
Model: 566.2164407983796, Actual: 855
Model: 701.9062778158632, Actual: 683
Model: 4198.984815240435, Actual: 4036
Model: 572.772745197854, Actual: 507
Model: 3103.8264241858114, Actual: 3007
Model: 5431.462192920683, Actual: 9999
Model: 6029.269083593934, Actual: 14452
Model: 6735.625728204362, Actual: 7747
Model: 1498.5530514896082, Actual: 574
Model: 880.2135091776131, Actual: 823
Model: 2238.0301422083894, Actual: 2063
Model: 5948.830145577171, Actual: 6387
Model: 2197.0954377141406, Actual: 2161
Model: 3484.1828899923253, Actual: 3084
Model: 7203.83142105516, Actual: 15466
Model: 1314.5647124873665, Actual: 1235
Model: 6069.7489206753, Actual: 7372
Model: 1316.8637533901222, Actual: 1155
Model: 947.7759267915358, Actual: 750
Model: 1447.9354313130734, Actual: 1815
Model: 2361.3172917249162, Actual: 2294
Model: 3561.511437044405, Actual: 3511
Model: 5343.413828009241