In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [37]:
# Location of the car-listing CSV; the frame loaded from it is the input of
# every later cell.
uri = "https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv"
data = pd.read_csv(filepath_or_buffer=uri)
data.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [38]:
# Encode the textual `sold` flag as integers: 'yes' -> 1, 'no' -> 0.
# Gotcha: Series.map turns any value missing from the mapping into NaN, so
# running this cell a second time on the already-encoded column wipes it out.
swap = dict(yes=1, no=0)
data["sold"] = data["sold"].map(swap)
data.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,1
1,1,7843,1998,40557.96,1
2,2,7109,2006,89627.5,0
3,3,26823,2015,95276.14,0
4,4,7935,2014,117384.68,1


In [39]:
from datetime import datetime

# Age of each car in years, measured from the year in which this cell is
# executed -- the values therefore shift when the notebook is re-run in a
# later year.
actual_year = datetime.now().year
data["model_age"] = actual_year - data["model_year"]
data.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold,model_age
0,0,21801,2000,30941.02,1,22
1,1,7843,1998,40557.96,1,24
2,2,7109,2006,89627.5,0,16
3,3,26823,2015,95276.14,0,7
4,4,7935,2014,117384.68,1,8


In [40]:
# Yearly distance converted from miles to kilometres (1 mile = 1.60934 km).
data["kilometers_per_year"] = data["mileage_per_year"].mul(1.60934)
data.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold,model_age,kilometers_per_year
0,0,21801,2000,30941.02,1,22,35085.22134
1,1,7843,1998,40557.96,1,24,12622.05362
2,2,7109,2006,89627.5,0,16,11440.79806
3,3,26823,2015,95276.14,0,7,43167.32682
4,4,7935,2014,117384.68,1,8,12770.1129


In [41]:
# Remove the leftover "Unnamed: 0" column together with the two columns that
# were replaced by model_age and kilometers_per_year in the cells above.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
data = data.drop(
    columns=[
        "Unnamed: 0",
        "mileage_per_year",
        "model_year",
    ]
)
data.head()

Unnamed: 0,price,sold,model_age,kilometers_per_year
0,30941.02,1,22,35085.22134
1,40557.96,1,24,12622.05362
2,89627.5,0,16,11440.79806
3,95276.14,0,7,43167.32682
4,117384.68,1,8,12770.1129


In [42]:
# Feature matrix and target reused by every model cell below.
x = data[["price", "model_age", "kilometers_per_year"]]
y = data["sold"]

# Seed NumPy's global RNG before splitting and fitting; neither call below is
# given an explicit random_state.
SEED = 5
np.random.seed(SEED)

# Hold out 25% of the rows, keeping the class ratio of `y` in both partitions.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
)
print("We'll train with %d elements and test with %d elements." % (len(train_x), len(test_x)))

# Linear SVM trained directly on the unscaled features.
model = LinearSVC()
predictions = model.fit(train_x, train_y).predict(test_x)

accuracy = 100 * accuracy_score(test_y, predictions)
print("Accuracy was %.2f%%" % accuracy)

We'll train with 7500 elements and test with 2500 elements.
Accuracy was 46.88%




In [43]:
from sklearn.dummy import DummyClassifier

# Reference score from scikit-learn's 'stratified' dummy strategy, fitted and
# scored on the same split as the model above.
dummy_stratified = DummyClassifier(strategy='stratified')
accuracy = 100 * dummy_stratified.fit(train_x, train_y).score(test_x, test_y)

print("dummy_stratified's accuracy was %.2f%%" % accuracy)

dummy_stratified's accuracy was 52.44%


In [44]:
from sklearn.dummy import DummyClassifier

# Reference score from scikit-learn's 'most_frequent' dummy strategy, fitted
# and scored on the same split as the models above.
dummy_mostfrequent = DummyClassifier(strategy='most_frequent')
accuracy = 100 * dummy_mostfrequent.fit(train_x, train_y).score(test_x, test_y)

print("dummy_mostfrequent's accuracy was %.2f%%" % accuracy)

dummy_mostfrequent's accuracy was 58.00%


In [45]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Re-seed NumPy's global RNG so this cell does not depend on how many random
# draws earlier cells consumed.
SEED = 5
np.random.seed(SEED)

# Stratified 75/25 split of the unscaled features.
raw_train_x, raw_test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25, stratify = y)
# Report the sizes of the split made in THIS cell. The original read
# `train_x`/`test_x`, which at this point are leftovers from an earlier cell.
print("We'll train with %d elements and test with %d elements." % (len(raw_train_x), len(raw_test_x)))

# The scaler is fitted on the training rows only and then applied to both
# partitions, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(raw_train_x)
train_x = scaler.transform(raw_train_x)
test_x = scaler.transform(raw_test_x)

# SVC with default settings, trained on the standardised features.
model = SVC()
model.fit(train_x, train_y)
predictions = model.predict(test_x)

accuracy = accuracy_score(test_y, predictions) * 100
print("Accuracy was %.2f%%" % accuracy)

We'll train with 7500 elements and test with 2500 elements.
Accuracy was 77.48%


## Decision Tree Classifier

In [46]:
!pip install graphviz==0.20.1
# !apt-get install graphviz # UBUNTU 21.04 SUCKS - THANKS FOR THE EXPERIENCE CIRSSO

Defaulting to user installation because normal site-packages is not writeable


In [49]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import graphviz

# Re-seed NumPy's global RNG so this cell does not depend on how many random
# draws earlier cells consumed.
SEED = 5
np.random.seed(SEED)

# Stratified 75/25 split of the unscaled features.
raw_train_x, raw_test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25,
                                                            stratify = y)
# Report the sizes of the split made in THIS cell. The original read
# `train_x`/`test_x`, which at this point are leftovers from an earlier cell.
print("We'll train with %d elements and test with %d elements." % (len(raw_train_x), len(raw_test_x)))

# The tree is trained on the raw (unscaled) features; depth is capped at 3.
model = DecisionTreeClassifier(max_depth=3)
model.fit(raw_train_x, train_y)
predictions = model.predict(raw_test_x)

accuracy = accuracy_score(test_y, predictions) * 100
print("Accuracy was %.2f%%" % accuracy)

# Export the fitted tree as DOT text. `class_names` is given in the order
# 0 -> "no", 1 -> "yes", matching the integer encoding of `sold`.
features = x.columns
dot_data = export_graphviz(model, out_file=None,
                           filled = True, rounded = True,
                           feature_names = features,
                           class_names = ["no", "yes"])
graphic = graphviz.Source(dot_data)

# print() emits the DOT source as plain text rather than a rendered image.
print(graphic)

We'll train with 7500 elements and test with 2500 elements.
Accuracy was 79.28%
digraph Tree {
node [shape=box, style="filled, rounded", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="price <= 59999.074\ngini = 0.487\nsamples = 7500\nvalue = [3150, 4350]\nclass = yes", fillcolor="#c8e4f8"] ;
1 [label="price <= 40070.154\ngini = 0.202\nsamples = 3248\nvalue = [370, 2878]\nclass = yes", fillcolor="#52aae8"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="gini = 0.0\nsamples = 1441\nvalue = [0, 1441]\nclass = yes", fillcolor="#399de5"] ;
1 -> 2 ;
3 [label="price <= 40723.646\ngini = 0.326\nsamples = 1807\nvalue = [370, 1437]\nclass = yes", fillcolor="#6cb6ec"] ;
1 -> 3 ;
4 [label="gini = 0.441\nsamples = 61\nvalue = [20, 41]\nclass = yes", fillcolor="#9acdf2"] ;
3 -> 4 ;
5 [label="gini = 0.321\nsamples = 1746\nvalue = [350, 1396]\nclass = yes", fillcolor="#6bb6ec"] ;
3 -> 5 ;
6 [label="kilometers_per_year <= 24112.741\ngini = 0.453