In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [5]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv(filepath_or_buffer='../../data/diam.csv')

In [6]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo


In [7]:
# Count missing values per column.
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
target     0
dtype: int64

In [8]:
# One-hot encode the three categorical columns and append the resulting
# indicator columns to the right of the existing ones. The original
# categorical columns are still part of the frame after this step.
df = pd.concat(
    [df] + [pd.get_dummies(df[name], prefix=name) for name in ('color', 'cut', 'clarity')],
    axis=1,
)

In [9]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target,...,cut_Premium,cut_Very Good,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo,...,0,0,0,0,0,1,0,0,0,0
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo,...,1,0,0,0,1,0,0,0,0,0
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo,...,0,0,0,0,0,0,1,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo,...,1,0,0,0,0,0,0,1,0,0
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo,...,0,0,0,0,0,1,0,0,0,0


In [10]:
# Encode the string class labels as integers. Any label that is not a key
# of this mapping raises KeyError, so this cell cannot be re-run once the
# column already holds 0/1.
target = {
    'dorogo': 1,
    'dewevo': 0,
}
df['target'] = [target[label] for label in df['target']]

In [11]:
# Remove the three categorical columns from the frame, in place.
df.drop(columns=['cut', 'color', 'clarity'], inplace=True)

In [12]:
# Separate the feature matrix (every column except the label) from the label.
X = df.drop(columns='target')
y = df['target']

In [13]:
# Hold out 20% of the rows for testing. The split is seeded so that every
# score computed from it is reproducible on a fresh kernel; 17 matches the
# seed this notebook uses for the decision tree.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=17
)

In [14]:
# Baseline model: a seeded decision tree limited to depth 2, fitted on the
# training split. `fit` is the last expression so the estimator is displayed.
clf = DecisionTreeClassifier(
    random_state=17,
    max_depth=2,
)
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [15]:
# Class predictions of the fitted tree for the held-out rows.
prediction = clf.predict(X=X_test)

In [17]:
from io import StringIO

import pydotplus
from ipywidgets import Image
from sklearn.tree import export_graphviz

# Write the fitted tree as Graphviz DOT text into an in-memory buffer,
# convert it to PNG bytes with pydotplus and show it as an image widget.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=list(X_train.columns),
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xf7\x00\x00\x01g\x08\x06\x00\x00\x00"\xf1fe\x00\…

In [18]:
# Share of held-out rows classified correctly. Arguments follow the
# documented (y_true, y_pred) order; accuracy itself is symmetric, but the
# order matters for other metrics that share this signature.
accuracy_score(y_test, prediction)

0.9508713385242863

In [28]:
# Search grid: tree depth 1..9 crossed with minimum leaf size 1..4
# (9 * 4 = 36 candidate settings).
tree_params = {
    'max_depth': [*range(1, 10)],
    'min_samples_leaf': [*range(1, 5)],
}

# Evaluate every candidate with 5-fold cross-validation on the training
# split, using all available cores.
tree_grid = GridSearchCV(
    estimator=clf,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [20]:
# Parameter combination the grid search selected as best.
tree_grid.best_params_

{'max_depth': 9, 'min_samples_leaf': 3}

In [21]:
# Best cross-validation score found by the grid search; no `scoring` was
# passed to GridSearchCV, so the estimator's default scorer applies.
tree_grid.best_score_

0.9719132369299222

In [22]:
# Class predictions for the held-out rows from the fitted grid search.
# Note that this rebinds `prediction`.
prediction = tree_grid.predict(X=X_test)

In [32]:
# Share of held-out rows classified correctly. Arguments follow the
# documented (y_true, y_pred) order; accuracy itself is symmetric, but the
# order matters for other metrics that share this signature.
accuracy_score(y_test, prediction)

0.9701520207638117