In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from sklearn.preprocessing import scale

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from PIL import Image
from sklearn.metrics import accuracy_score

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

from matplotlib import pyplot as plt
import seaborn as sns

import graphviz
import pydotplus
import io
import imageio
from scipy import misc

%matplotlib inline

In [2]:
database = pd.read_csv("data.csv", sep=';')


In [3]:
X = database.drop(["MUSK","168"],axis=1)
y = database['168']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.10 )

# DECISION TREE

In [4]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
score= accuracy_score(y_test, y_pred) * 100
print(score)
report_dt=classification_report(y_test, y_pred)
print(report_dt)
print(confusion_matrix(y_test,y_pred))

96.96969696969697
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       551
         1.0       0.92      0.90      0.91       109

    accuracy                           0.97       660
   macro avg       0.95      0.94      0.94       660
weighted avg       0.97      0.97      0.97       660

[[542   9]
 [ 11  98]]


# LOGISTIC REGRESSION


In [5]:
logreg=LogisticRegression(solver='liblinear',multi_class='ovr')
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)
score= accuracy_score(y_test, y_pred) * 100
print(score)
report_logreg=classification_report(y_test, y_pred)
print(report_logreg)
print(confusion_matrix(y_test,y_pred))

94.54545454545455
              precision    recall  f1-score   support

         0.0       0.96      0.97      0.97       551
         1.0       0.85      0.81      0.83       109

    accuracy                           0.95       660
   macro avg       0.91      0.89      0.90       660
weighted avg       0.94      0.95      0.94       660

[[536  15]
 [ 21  88]]


# RANDOM FOREST

In [6]:
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs = -1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
score= accuracy_score(y_test, y_pred) * 100
print(score)
report_rf=classification_report(y_test, y_pred)
print(report_rf)
print(confusion_matrix(y_test,y_pred))

97.87878787878788
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       551
         1.0       0.99      0.88      0.93       109

    accuracy                           0.98       660
   macro avg       0.98      0.94      0.96       660
weighted avg       0.98      0.98      0.98       660

[[550   1]
 [ 13  96]]


# SVM

In [7]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
score= accuracy_score(y_test, y_pred) * 100
print(score)
report_svm=classification_report(y_test, y_pred)
print(report_svm)
print(confusion_matrix(y_test,y_pred))
    

88.93939393939394
              precision    recall  f1-score   support

         0.0       0.88      1.00      0.94       551
         1.0       1.00      0.33      0.50       109

    accuracy                           0.89       660
   macro avg       0.94      0.67      0.72       660
weighted avg       0.90      0.89      0.86       660

[[551   0]
 [ 73  36]]


# CROSS VALIDATIONS / 10 - FOLD

In [8]:
from sklearn.model_selection import cross_val_score
print("DECISION_TREE:" ,cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring ="accuracy").mean())
print("LOGISTIC_REGRESSION:" ,cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X, y,cv=10, scoring = "accuracy").mean())
print("SUPPORT_VECTOR_MACHINES:" ,cross_val_score(SVC(gamma='auto'), X, y, cv=10, scoring ="accuracy").mean())
print("RANDOM_FOREST:" ,cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=10, scoring ="accuracy").mean())


DECISION_TREE: 0.7956843818755462
LOGISTIC_REGRESSION: 0.810538347160688
SUPPORT_VECTOR_MACHINES: 0.8498232889618315
RANDOM_FOREST: 0.7960277634413135
