In [25]:
from math import sqrt
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB


warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('../carRisk.csv')
data.head()

Unnamed: 0,age,cartype,risk
0,20,combi,high
1,18,sport,high
2,40,sport,high
3,50,family,low
4,35,minivan,low


In [4]:
data.shape

(40, 3)

In [8]:
data['cartype'].unique()

array([0, 1, 2, 3], dtype=int64)

In [9]:
data['risk'].unique()

array([0, 1], dtype=int64)

In [10]:
data['cartype'] = pd.factorize(data['cartype'])[0]
data['risk'] = pd.factorize(data['risk'])[0]

data.head()

Unnamed: 0,age,cartype,risk
0,20,0,0
1,18,1,0
2,40,1,0
3,50,2,1
4,35,3,1


### Feature Engineering / Scaling

In [43]:
sklr = MinMaxScaler()
sklr_ft = sklr.fit_transform(
    data[[
        'age',
        'cartype',
        'risk'
    ]]
)
skl_data = pd.DataFrame(sklr_ft, index = data.index, columns = data.columns)

x, y = data.drop('risk', axis=1), data['risk']

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

model_dict = {
    'regresion-logistica': LogisticRegression().fit(xtrain, ytrain),
    'decision-tree': DecisionTreeClassifier().fit(xtrain, ytrain),
    'random-forest': RandomForestClassifier().fit(xtrain, ytrain),
    'boosting-algo': GradientBoostingClassifier().fit(xtrain, ytrain),
    'gaussianNB': GaussianNB().fit(xtrain, ytrain), 
    'multinomialNB': MultinomialNB().fit(xtrain, ytrain),
    'bernoulliNB': BernoulliNB().fit(xtrain, ytrain) ####
}

In [44]:
model_dict['decision-tree'].get_depth()

3

In [45]:
scores = {
    'regresion-logistica': model_dict['regresion-logistica'].predict(xtest),
    'decision-tree': model_dict['decision-tree'].predict(xtest),
    'random-forest': model_dict['random-forest'].predict(xtest),
    'boosting-algo': model_dict['boosting-algo'].predict(xtest),
    'gaussianNB': model_dict['gaussianNB'].predict(xtest),
    'multinomialNB': model_dict['multinomialNB'].predict(xtest),
    'bernoulliNB': model_dict['bernoulliNB'].predict(xtest)
}

for i in scores:
    print(i, '->', r2_score(ytest, scores[i]))

print(r2_score(ytrain, model_dict['decision-tree'].predict(xtrain)))
print(r2_score(ytrain, model_dict['random-forest'].predict(xtrain)))
print(r2_score(ytrain, model_dict['boosting-algo'].predict(xtrain)))

regresion-logistica -> 0.4666666666666667
decision-tree -> 1.0
random-forest -> 1.0
boosting-algo -> 1.0
gaussianNB -> -0.06666666666666665
multinomialNB -> 0.4666666666666667
bernoulliNB -> -0.06666666666666665
1.0
1.0
1.0
