In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR

In [2]:
# Read the dataset from the local Data folder and preview its first rows.
csv_path = './Data/data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,Ideal,G,SI2,61.9,57.0,8.44,8.36,5.2,12831
1,2.43,Very Good,H,SI2,63.2,57.0,8.56,8.5,5.39,16170
2,0.8,Premium,F,SI2,61.0,57.0,6.03,6.01,3.67,2797
3,0.4,Ideal,F,I1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,Ideal,G,VS2,61.6,55.0,4.39,4.37,2.7,698


In [3]:
# Show how many rows fall into each level of the three categorical columns.
for column_name in ('cut', 'color', 'clarity'):
    level_counts = data[column_name].value_counts()
    print(level_counts)

Ideal        16139
Premium      10377
Very Good     9101
Good          3650
Fair          1188
Name: cut, dtype: int64
G    8492
E    7343
F    7183
H    6230
D    5046
I    4046
J    2115
Name: color, dtype: int64
SI1     9767
VS2     9147
SI2     6909
VS1     6157
VVS2    3822
VVS1    2740
IF      1356
I1       557
Name: clarity, dtype: int64


In [4]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
data = data.pipe(pd.get_dummies)
data.head()

Unnamed: 0,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,2.26,61.9,57.0,8.44,8.36,5.2,12831,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,2.43,63.2,57.0,8.56,8.5,5.39,16170,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.8,61.0,57.0,6.03,6.01,3.67,2797,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.4,63.3,60.0,4.68,4.64,2.95,630,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,0.31,61.6,55.0,4.39,4.37,2.7,698,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Separate the feature matrix from the target column (`price`).
X = data.drop(columns=['price'])
y = data['price']

In [6]:
# Preprocess the features: standardise each column, rescale each row with Normalizer,
# then project onto the first 3 principal components.
steps = [
    StandardScaler(),
    Normalizer(),
    # Fixed seed so the projection is reproducible when a randomised SVD solver is selected.
    PCA(n_components=3, random_state=42)
]
pipe = make_pipeline(*steps)
# Build the input from `data` instead of re-using `X`: with `X = pipe.fit_transform(X)` a second
# run of this cell would silently transform the already-projected matrix again.
# NOTE(review): the pipeline is fitted on all rows before the train/test split, so the test rows
# influence the scaler and the PCA. For a leak-free evaluation, fit on the training split only.
X = pipe.fit_transform(data.drop(columns=['price']))
df = pd.DataFrame(X)
df.head()

Unnamed: 0,0,1,2
0,0.75462,-0.491905,0.178676
1,0.843361,-0.186794,-0.109295
2,0.28994,0.268313,0.223631
3,-0.163138,-0.016487,-0.017537
4,-0.646815,-0.290725,0.416952


In [7]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and every
# metric computed from it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a linear support-vector regressor on the training split and score it on the held-out rows.
#
# `price` is a continuous target, so a regressor (LinearSVR) replaces the LinearSVC classifier,
# which would treat every distinct price as a separate class.
# The variable name `SVC` is kept so any later cell that refers to it still resolves.
SVC = LinearSVR(random_state=42)
# Fit on the training split only: fitting on all of X/y would include the rows used for testing.
clf = SVC.fit(X_train, y_train)
pred = clf.predict(X_test)
scores = pd.DataFrame({
    "predicted": pred,
    "gt": y_test
})
scores["diff"] = np.abs(scores["predicted"] - scores["gt"])
# Mean absolute error on the test split. It is printed explicitly because a notebook cell
# only displays its last expression.
mae = scores["diff"].mean()
print(f"MAE: {mae:.2f}")
scores.head()

In [26]:
# Evaluate the fitted model on the held-out split.
#
# The target is a continuous price, so class probabilities and a ROC curve do not apply here:
# LinearSVC exposes no predict_proba, and roc_curve only accepts a binary target.
# Predicted prices are compared with the actual ones instead.
y_pred = clf.predict(X_test)
abs_error = np.abs(y_pred - y_test)
mae = abs_error.mean()

display(pd.DataFrame({
    "actual": y_test,
    "predicted": y_pred,
    "abs_error": abs_error
}).head())

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, s=4, alpha=0.3, label=f"MAE: {round(mae, 2)}")
# Diagonal reference line: points on it are predicted exactly.
low = min(y_test.min(), y_pred.min())
high = max(y_test.max(), y_pred.max())
ax.plot([low, high], [low, high], color="red", linewidth=1, label="perfect prediction")
ax.set(xlabel="actual price", ylabel="predicted price", title="Predicted vs. actual price")
ax.legend();

NameError: name 'clf' is not defined