In [23]:
import pandas as pd
import sklearn
from sklearn import svm, preprocessing

In [24]:
FILE = 'diamonds.csv'
df = pd.read_csv(FILE, index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [25]:
# just transform category into numbers, no meaning or relation to the correct sequence
df['cut'].astype('category').cat.codes

1        2
2        3
3        1
4        3
5        1
        ..
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [26]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [27]:
df['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [28]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [29]:
# to transform into numbers with regards to meaning and sequence
cut_dict = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
color_dict = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
clarity_dict = {'I1': 1 , 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

In [30]:
%%time 
#using apply lambda
df['cut'] = df['cut'].apply(lambda c: cut_dict[c])

Wall time: 15.6 ms


In [31]:
%%time 
#using map
df['color'] = df['color'].map(color_dict)

Wall time: 0 ns


In [32]:
%%time
df['clarity'] = df['clarity'].map(clarity_dict)

Wall time: 0 ns


In [33]:
#shuffle dataframe to avoid bias in model
df = sklearn.utils.shuffle(df)

In [34]:
# X & y for the model
# X: all the relevent data required for the predictuion
# y: the values to be predicted by the model
X = df.drop("price", axis=1).values
# Scale X to produce more uniform values
X = preprocessing.scale(X)
y = df["price"].values

In [35]:
# define test sample, to test the model after it finishes. the sample will never be fed to training the model
test_sample = 100 
#since data is already shuffled, taking last 200 is OK as a random sample
X_train = X[:-test_sample]
y_train = y[:-test_sample]

X_test = X[-test_sample:]
y_test = y[-test_sample:]

In [36]:
# train the the model for prediction
clf = svm.SVR(kernel = 'linear')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8499457112953045

In [37]:
# test the model with the test sample
for x, y in zip(X_test, y_test):
    prediction = clf.predict([x])[0]
    if prediction < 0:
        print('!!! Invalid Value')
    print('Model: {}, Actual: {}'.format(prediction, y))

Model: 1189.8342425889778, Actual: 1237
Model: 3302.422829901419, Actual: 3053
Model: 1692.2629960288402, Actual: 1607
Model: 5905.754039311994, Actual: 7805
Model: 263.0556701777746, Actual: 625
Model: 2864.064263207377, Actual: 2227
Model: 533.5302474382065, Actual: 928
Model: 6236.127469267393, Actual: 8057
Model: 6391.044549556018, Actual: 7416
Model: 8601.337800081184, Actual: 7416
Model: 6476.639078826545, Actual: 9068
Model: 13299.775893324564, Actual: 14277
Model: 4496.348995651122, Actual: 4225
Model: 8531.615896775898, Actual: 9352
Model: 4392.26522676359, Actual: 4212
Model: 1476.6573184991535, Actual: 1393
Model: 739.3269060620005, Actual: 972
Model: 4533.476688077515, Actual: 4306
Model: 4739.9993846347115, Actual: 4899
Model: 5362.989036060603, Actual: 6697
Model: 8792.53964974929, Actual: 12100
Model: 1340.5716212199877, Actual: 1229
!!! Invalid Value
Model: -34.12260266553767, Actual: 949
Model: 8031.123641679766, Actual: 7368
Model: 105.7518552919787, Actual: 629
Model