In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt


In [2]:
# Predicting the price of diamonds using specific attributes, without having its price. Regression model. Kaggle dataset

df = pd.read_csv('diamonds.csv',index_col=0) # removing the useless index

df['cut'].unique() 

cut_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5} # putting categorical cut values into a dict

#'Ideal' > 'Good' # Yay

df['clarity'].unique()

clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}

color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7} # creating dicts for clarity and color as well

# must now map these dict values to replace their asssociated columns in the df

df['cut'] = df['cut'].map(cut_dict) # the dataframe(column) of "cut" is mapped to the cut dictionary

df['color'] = df['color'].map(color_dict) # same for color and clarity 

df['clarity'] = df['clarity'].map(clarity_dict)

df.head()







Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [3]:
from sklearn import svm, preprocessing # the fun part starts here 

df = sklearn.utils.shuffle(df) # always a good idea to shuffle your data to avoid ordering biases 

X = df.drop('price', axis=1).values # X = features everything we want other than price, because including it would
# defeat the purpose of a regression, duhhh

y = df['price'].values # y = labels, in other words, everything the features are trying to predict

X = preprocessing.scale(X) # generally in ML it is better to scale/standardise inputs in order for easier fitting by the model
X

# as with any ML problem, data must be split between training, test, and validation sets

test_size = 200

X_train = X[:-test_size] # X training data will be up to the last 200 of the test size

y_train = y[:-test_size]

X_test = X[-test_size:] # X test data will be the last 200 of the test size 

y_test = y[-test_size:]

clf = svm.SVR(kernel='linear') # defining the classifer that will be used

clf.fit(X_train, y_train) # training the data so it can get 'fit' haha!






SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [4]:
clf.score(X_test, y_test)*100 # how well did our model do? in other words, how accurate were the features in
# the features in predicting the labels







83.86312594437038

In [5]:
# to see exactly how much our model's estimates varied from the actual true values of price, and creating a list

# some estimated values are negative which should never be the case, a voting classifer would be used in order to maximise r^2
for X,y in list(zip(X_test, y_test)):
    print(f"model:{clf.predict([X])[0]}, True value: {y}") 

model:4830.243399173057, True value: 5047
model:4115.019830488836, True value: 4463
model:4842.6361958678035, True value: 4999
model:2059.1236332391563, True value: 2041
model:5703.270080149425, True value: 8688
model:5611.979872579339, True value: 5082
model:3920.188086968633, True value: 3444
model:2399.096032132116, True value: 2081
model:6216.327112775907, True value: 6545
model:1353.852477262128, True value: 945
model:-662.4811451980472, True value: 391
model:4893.6871099585705, True value: 4604
model:14.211686960555653, True value: 453
model:9030.303926874232, True value: 11333
model:2286.7142754433307, True value: 2321
model:5467.305038894455, True value: 4170
model:2951.6001434469067, True value: 2681
model:2919.8192311573794, True value: 2415
model:2354.4746096197814, True value: 2053
model:3189.9250812898304, True value: 2712
model:9066.52388467207, True value: 11088
model:8712.234126048521, True value: 11442
model:9676.334337561097, True value: 10600
model:571.6219673736232,