In [2]:
# implementing decision trees with scikit-learn

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv('/home/anshul/MMLSL/chapter08/ad.data', header=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
explanatory_variable_columns = set(df.columns.values)

# get the feature set collected
explanatory_variable_columns.remove(len(df.columns.values) - 1)

# get the response variable collected
response_variable_column = df[len(df.columns.values) - 1]

In [7]:
# creating X
X = df[list(explanatory_variable_columns)].copy()

## !! --- CRICITCA STEP --- !! ##
# this is how the missing values within the dataset are replaced.

X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)

y = [1 if e == 'ad.' else 0 for e in response_variable_column]

In [8]:
# split to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')

In [10]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    4.6s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('clf', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__max_depth': (150, 155, 160), 'clf__min_samples_split': (2, 3), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=1)

In [11]:
print('Best Score: {0:.3f}'.format(grid_search.best_score_))
print('\nBest Parameter Set: ')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('{}: {}'.format(param_name, best_parameters[param_name]))

Best Score: 0.881

Best Parameter Set: 
clf__max_depth: 160
clf__min_samples_leaf: 3
clf__min_samples_split: 3


In [12]:
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.97      0.98      0.98       705
          1       0.90      0.83      0.86       115

avg / total       0.96      0.96      0.96       820

