In [101]:
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import graphviz
import seaborn as sns


from sklearn.ensemble import RandomForestClassifier
from IPython.display import Image
from sklearn import tree
from six import StringIO
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_wine
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [37]:
# Toy dataset: three parallel lists, 14 observations each (same index = same row).
weather = [
    'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
    'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
]

temp = [
    'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
]

# Used further down as the target label for the classifier.
play = [
    'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
    'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
]

In [38]:
# One LabelEncoder per column. Re-fitting a single shared encoder overwrites
# its `classes_` on every call, which leaves no way to map the encoded
# weather/temperature codes back to their original strings.
weather_encoder = LabelEncoder()
temp_encoder = LabelEncoder()
# `encoder` keeps its original name and, as before, ends up fitted on `play`.
encoder = LabelEncoder()

encoder_weather = weather_encoder.fit_transform(weather)
encoder_temp = temp_encoder.fit_transform(temp)
encoder_label = encoder.fit_transform(play)

print(encoder_weather)
print(encoder_temp)
print(encoder_label)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]
[1 1 1 2 0 0 0 2 0 2 2 2 1 2]
[0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [39]:
# Pair the encoded columns row by row: one (weather, temp) tuple per sample.
features = [
    (weather_code, temp_code)
    for weather_code, temp_code in zip(encoder_weather, encoder_temp)
]
print(features)

[(2, 1), (2, 1), (0, 1), (1, 2), (1, 0), (1, 0), (0, 0), (2, 2), (2, 0), (1, 2), (2, 2), (0, 2), (0, 1), (1, 2)]


In [40]:
# Fit on the label-encoded (weather, temp) pairs; `fit` returns the estimator.
model = GaussianNB().fit(features, encoder_label)

# Two hand-written encoded rows. Going by the printed encodings above,
# [0, 2] is (Overcast, Mild) and [1, 1] is (Rainy, Hot).
sample_prediction = model.predict([[0, 2], [1, 1]])
print("prediction :", sample_prediction)

# Scored on the same rows the model was fitted on, so this report is in-sample.
prediction = model.predict(features)
print(classification_report(encoder_label, prediction))

prediction : [1 1]
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.78      0.78      0.78         9

    accuracy                           0.71        14
   macro avg       0.69      0.69      0.69        14
weighted avg       0.71      0.71      0.71        14



## Gaussian Naive Bayes

In [41]:
wine = load_wine()

# Show the feature names and class names shipped with the dataset.
for caption, names in (("feature_names:", wine.feature_names),
                       ("classes_names:", wine.target_names)):
    print(caption, names)

feature_names: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
classes_names: ['class_0' 'class_1' 'class_2']


In [42]:
# Shape of the wine feature matrix.
np.shape(wine.data)

(178, 13)

In [43]:
# Hold out 30% of the wine samples as a dev set; the seed is fixed so the
# split is the same on every run.
X_train, X_dev, y_train, y_dev = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=109,
)

In [62]:
def train_model(X, y, model, test_size=.3, random_state=109):
    """Fit ``model`` on a train split of ``(X, y)`` and print train/dev reports.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and ``model.fit``.
    y : target labels aligned with ``X``.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    test_size : forwarded to ``train_test_split`` (default 0.3).
    random_state : seed forwarded to ``train_test_split``. Defaults to 109,
        the value that used to be hard-coded, so existing calls split the
        data exactly as before.

    Returns
    -------
    None. The fit time and both classification reports are printed.
    """
    X_train, X_dev, y_train, y_dev = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Time only the fit call. The measurement used to be taken and then
    # discarded; it is now reported.
    start = time()
    model.fit(X_train, y_train)
    fit_seconds = time() - start
    print(f"fit time: {fit_seconds:.4f}s")

    # Label each report so the train and dev tables cannot be confused.
    print("train split:")
    print(classification_report(y_train, model.predict(X_train)))
    print("dev split:")
    print(classification_report(y_dev, model.predict(X_dev)))

In [50]:
# Gaussian NB on the full wine data; train_model handles the split and reports.
model = GaussianNB()

train_model(X=wine.data, y=wine.target, model=model)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      0.98      0.99        52
           2       0.97      1.00      0.99        34

    accuracy                           0.99       124
   macro avg       0.99      0.99      0.99       124
weighted avg       0.99      0.99      0.99       124

              precision    recall  f1-score   support

           0       0.91      0.95      0.93        21
           1       0.94      0.79      0.86        19
           2       0.88      1.00      0.93        14

    accuracy                           0.91        54
   macro avg       0.91      0.91      0.91        54
weighted avg       0.91      0.91      0.91        54



## Decision Tree

In [58]:
# Read the diabetes table from the notebook-relative CSV and display it.
diabetes = pd.read_csv(filepath_or_buffer="data/diabetes.csv")
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [59]:
# Summarise each column's dtype and non-null count.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [61]:
# Predictor columns used for the tree. Note that SkinThickness, although
# present in the CSV, is not part of this list.
feat_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

X = diabetes.loc[:, feat_cols]
y = diabetes['Outcome']

In [63]:
# Decision tree with default settings apart from a fixed seed.
model = DecisionTreeClassifier(random_state=11)
train_model(X=X, y=y, model=model)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       351
           1       1.00      1.00      1.00       186

    accuracy                           1.00       537
   macro avg       1.00      1.00      1.00       537
weighted avg       1.00      1.00      1.00       537

              precision    recall  f1-score   support

           0       0.75      0.79      0.77       149
           1       0.58      0.54      0.56        82

    accuracy                           0.70       231
   macro avg       0.67      0.66      0.66       231
weighted avg       0.69      0.70      0.69       231



In [86]:
pip uninstall graphviz

^C
Note: you may need to restart the kernel to use updated packages.


In [87]:
!conda install python-graphviz -y

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [88]:
def visualize_tree(model, class_names, feature_names=None):
    """Render a fitted decision tree as a PNG image for inline display.

    Parameters
    ----------
    model : fitted tree estimator accepted by ``tree.export_graphviz``.
    class_names : class labels forwarded to ``tree.export_graphviz``.
    feature_names : optional feature labels for the split nodes. When omitted,
        the notebook-level ``feat_cols`` list is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    IPython.display.Image built from the PNG bytes produced by pydotplus.

    Raises
    ------
    pydotplus ``InvocationException`` from ``create_png`` when the GraphViz
    executables cannot be found.
    """
    if feature_names is None:
        feature_names = feat_cols

    # With out_file=None, export_graphviz returns the DOT source as a string,
    # so no intermediate StringIO buffer is needed.
    dot_source = tree.export_graphviz(
        model,
        out_file=None,
        class_names=class_names,
        feature_names=feature_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = pydotplus.graph_from_dot_data(dot_source)

    return Image(graph.create_png())

In [89]:
# Draw the tree currently bound to `model`, labelling the classes "0" and "1".
graph = visualize_tree(model, class_names=["0", "1"])

InvocationException: GraphViz's executables not found

## Random Forest

In [97]:
iris = load_iris()

# The first four data columns under short names, plus the target as `species`.
df_iris = pd.DataFrame(
    iris.data[:, :4],
    columns=["sepal length", "sepal width", "petal length", "petal width"],
).assign(species=iris.target)
df_iris

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [99]:
# Seed the forest so the reports and feature importances are the same on every
# run. The seed matches the one used for the decision tree in this notebook.
model = RandomForestClassifier(random_state=11)

train_model(df_iris.drop(columns='species'), df_iris['species'], model)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        34

    accuracy                           1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.94      0.88      0.91        17
           2       0.88      0.94      0.91        16

    accuracy                           0.93        45
   macro avg       0.94      0.94      0.94        45
weighted avg       0.93      0.93      0.93        45



In [106]:
# Feature importances of the model currently bound to `model`, largest first,
# labelled with the dataset's own feature names.
iris_feat_importance = pd.Series(
    data=model.feature_importances_,
    index=iris.feature_names,
).sort_values(ascending=False)
iris_feat_importance

petal length (cm)    0.544669
petal width (cm)     0.339429
sepal length (cm)    0.100908
sepal width (cm)     0.014994
dtype: float64

## Support Vector Machines