In [36]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

###sklearn
from sklearn import (
    ensemble,
    feature_selection,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    tree,
    pipeline
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, OneHotEncoder
from sklearn.tree import export_graphviz
from sklearn_evaluation import plot

# from pandas_profiling import ProfileReport


sns.set(rc={"axes.facecolor": "white", "figure.facecolor": "white"})

In [2]:
sylhet_raw = pd.read_csv("../data/sylhet-dataset.csv")
sylhet_copy = sylhet_raw.copy(deep=True)
col_names = [x.lower().replace(" ", "_") for x in sylhet_copy.columns]
sylhet_copy.set_axis(col_names, axis=1, inplace=True)

age_binned = pd.qcut(sylhet_copy["age"], q=6)
sylhet_copy["age"] = age_binned

y = sylhet_copy["class"].apply(lambda z: 0 if z == "Negative" else 1)
# y = y.to_numpy()

In [3]:
sylhet_copy.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,"(35.0, 42.0]",Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,"(54.0, 60.0]",Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,"(35.0, 42.0]",Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,"(42.0, 47.5]",Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,"(54.0, 60.0]",Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [4]:
sylhet_copy = sylhet_copy.loc[:, :"obesity"]

## Pipeline Method

In [5]:
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    sylhet_copy, y, random_state=42, train_size=0.8, test_size=0.2, stratify=y
)

In [6]:
column_trans = ColumnTransformer(
    [("one_hot", OneHotEncoder(drop="if_binary"), list(sylhet_copy.columns.values),)],
    remainder="drop",
)

X = column_trans.fit_transform(sylhet_copy)

### Numpy array version of the data

In [7]:
clf1 = naive_bayes.BernoulliNB()
clf2 = naive_bayes.GaussianNB()
clf3 = linear_model.LogisticRegression(random_state=42)
clf4 = tree.DecisionTreeClassifier(random_state=42)
clf5 = ensemble.RandomForestClassifier(random_state=42)

In [8]:
param1 = {}
param2 = {}
param3 = {}
param4 = {}
param5 = {}

param1["classifier"] = [clf1]
param2["classifier"] = [clf2]
param3["classifier"] = [clf3]
param4["classifier"] = [clf4]
param5["classifier"] = [clf5]

In [9]:
# param2['classifier__n_estimators'] = [1, 25, 50, 100]
# param2['classifier__criterion'] = ['gini', 'entropy']
# param2['classifier__max_features'] = ['sqrt', 'log2']
# param2['classifier__max_depth'] = [3, 5]

In [10]:
pipeline1 = Pipeline([("column_trans", column_trans), ("classifier", clf1)])
pipeline2 = Pipeline([("column_trans", column_trans), ("classifier", clf2)])
pipeline3 = Pipeline([("column_trans", column_trans), ("classifier", clf3)])
pipeline4 = Pipeline([("column_trans", column_trans), ("classifier", clf4)])
pipeline5 = Pipeline([("column_trans", column_trans), ("classifier", clf5)])

params = [param1, param2, param3, param4, param5]

In [11]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
grid1 = GridSearchCV(
    estimator=pipeline1, param_grid=param1, scoring="f1_macro", cv=skf
)
grid2 = GridSearchCV(
    estimator=pipeline2, param_grid=param2, scoring="f1_macro", cv=skf
)
grid3 = GridSearchCV(
    estimator=pipeline3, param_grid=param3, scoring="f1_macro", cv=skf
)
grid4 = GridSearchCV(
    estimator=pipeline4, param_grid=param4, scoring="f1_macro", cv=skf
)
grid5 = GridSearchCV(
    estimator=pipeline5, param_grid=param5, scoring="f1_macro", cv=skf
)

In [13]:
new_df = pd.DataFrame(columns=['precision','recall','fscore','support'])
grids = [grid1,grid2,grid3,grid4,grid5]
for pipe in grids:
    pipe.fit(sylhet_copy,y)
    y_pred = pipe.predict(sylhet_copy)
    class_stats = sklearn.metrics.precision_recall_fscore_support(y, y_pred, labels=[0, 1], pos_label=1)
    
    my_df = pd.DataFrame(class_stats).T

    my_df = my_df.rename(columns={0:'precision',1:'recall',2:'fscore',3:'support'})

    my_df['model'] = pipe.best_estimator_[-1].__class__.__name__
    
    new_df = pd.concat([new_df,my_df],ignore_index=True)
    cw = sklearn.metrics.precision_recall_fscore_support(y, y_pred, labels=[0, 1], average='binary')

In [14]:
grid1.__class__.__name__

'GridSearchCV'

In [15]:
pipe.best_estimator_[-1].__class__.__name__

'RandomForestClassifier'

In [16]:
class_stats

(array([1.        , 0.99071207]),
 array([0.985, 1.   ]),
 array([0.99244332, 0.99533437]),
 array([200, 320]))

In [17]:
my_df = pd.DataFrame(class_stats).T

my_df = my_df.rename(columns={0:'precision',1:'recall',2:'fscore',3:'support'})

my_df['model'] = pipe.best_estimator_[-1].__class__.__name__

my_df

Unnamed: 0,precision,recall,fscore,support,model
0,1.0,0.985,0.992443,200.0,RandomForestClassifier
1,0.990712,1.0,0.995334,320.0,RandomForestClassifier


In [18]:
new_df

Unnamed: 0,precision,recall,fscore,support,model
0,0.796537,0.92,0.853828,200.0,BernoulliNB
1,0.944637,0.853125,0.896552,320.0,BernoulliNB
2,0.818182,0.855,0.836186,200.0,GaussianNB
3,0.906752,0.88125,0.893819,320.0,GaussianNB
4,0.912195,0.935,0.923457,200.0,LogisticRegression
5,0.95873,0.94375,0.951181,320.0,LogisticRegression
6,1.0,0.985,0.992443,200.0,DecisionTreeClassifier
7,0.990712,1.0,0.995334,320.0,DecisionTreeClassifier
8,1.0,0.985,0.992443,200.0,RandomForestClassifier
9,0.990712,1.0,0.995334,320.0,RandomForestClassifier


In [19]:
# results_conf2 = pd.DataFrame(
#     index=["negative", "positive", "model"],
#     columns=["precision", "recall", "f1", "support"],
# )

# model_compare.loc[row_index, "Precision"] = prec
# model_compare.loc[row_index, "Recall"] = recall
# model_compare.loc[row_index, "F1"] = f1

NameError: name 'prec' is not defined

In [20]:
columns = ['negative','positive']

pd.DataFrame(class_stats).T

Unnamed: 0,0,1,2,3
0,1.0,0.985,0.992443,200.0
1,0.990712,1.0,0.995334,320.0


In [21]:
pd.DataFrame(class_stats).T

Unnamed: 0,0,1,2,3
0,1.0,0.985,0.992443,200.0
1,0.990712,1.0,0.995334,320.0


In [22]:
grid_dict = {0: 'BNB', 1: 'GNB', 
             2: 'LR', 3: 'DT', 
             4: 'RF'}

In [23]:
for i, model in enumerate(grids):
    print('{} Test Accuracy: {}'.format(grid_dict[i],
    model.score(sylhet_copy,y)))
    print('{} Best Params: {}'.format(grid_dict[i],          model.best_params_))

BNB Test Accuracy: 0.8751900152012161
BNB Best Params: {'classifier': BernoulliNB()}
GNB Test Accuracy: 0.8650025767303811
GNB Best Params: {'classifier': GaussianNB()}
LR Test Accuracy: 0.9373189462428309
LR Best Params: {'classifier': LogisticRegression(random_state=42)}
DT Test Accuracy: 0.9938888475384984
DT Best Params: {'classifier': DecisionTreeClassifier(random_state=42)}
RF Test Accuracy: 0.9938888475384984
RF Best Params: {'classifier': RandomForestClassifier(random_state=42)}


In [24]:
%%time
# Train the grid search model
gs = GridSearchCV(
    model,
    param_grid=,
    cv=skf,
    n_jobs=-1,
    scoring="f1_macro",
    return_train_score=False,
).fit(X, y)

TypeError: 'NoneType' object is not iterable

In [None]:
y_pred = gs.best_estimator_.predict(sylhet_copy)

In [None]:
class_stats = sklearn.metrics.precision_recall_fscore_support(
    y, y_pred, labels=[0, 1], pos_label=1
)

In [None]:
scoring = ["accuracy", "precision", "recall", "f1_macro"]
scores = model_selection.cross_validate(
    clf1, X, y, scoring=scoring, cv=skf, return_train_score=True, return_estimator=False
)

In [25]:
from sklearn.model_selection import cross_val_score

### Train-test split
80/20

In [37]:
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    X, y, random_state=42, train_size=0.8, test_size=0.2
)

In [38]:
np.bincount(train1_y), np.bincount(test1_y)

(array([167, 249]), array([33, 71]))

In [39]:
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    X, y, random_state=42, train_size=0.8, test_size=0.2, stratify=y
)

In [40]:
np.bincount(train1_y), np.bincount(test1_y)
np.bincount(train1_y), np.bincount(test1_y)

(array([160, 256]), array([40, 64]))

In [41]:
cv_split = model_selection.ShuffleSplit(
    n_splits=5, train_size=0.8, test_size=0.2, random_state=42
)

In [42]:
for train, test in cv_split.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )

train -  [167 249]   |   test -  [33 71]
train -  [155 261]   |   test -  [45 59]
train -  [159 257]   |   test -  [41 63]
train -  [158 258]   |   test -  [42 62]
train -  [167 249]   |   test -  [33 71]


In [43]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [44]:
for train, test in skf.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )

train -  [160 256]   |   test -  [40 64]
train -  [160 256]   |   test -  [40 64]
train -  [160 256]   |   test -  [40 64]
train -  [160 256]   |   test -  [40 64]
train -  [160 256]   |   test -  [40 64]


In [60]:
%%time
# Train the grid search model
gs = GridSearchCV(
    pipeline, params, cv=skf, n_jobs=-1, scoring="recall", return_train_score=False,
).fit(sylhet_copy, y)

TypeError: estimator should be an estimator implementing 'fit' method, <module 'sklearn.pipeline' from '/Users/c92680/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py'> was passed

In [46]:
gs.estimator

NameError: name 'gs' is not defined

In [47]:
gs.cv_results_.keys()

NameError: name 'gs' is not defined

In [None]:
grid_scores = gs.cv_results_

In [None]:
pd.DataFrame(grid_scores)

In [None]:
grid_scores

In [61]:
my_models = [clf1, clf2, clf3, clf4, clf5]

In [62]:
[alg.__class__.__name__ for alg in my_models]

['BernoulliNB',
 'GaussianNB',
 'LogisticRegression',
 'DecisionTreeClassifier',
 'RandomForestClassifier']

In [63]:
grid_scores["mean_train_score"]
grid_scores["mean_test_score"]

NameError: name 'grid_scores' is not defined

In [64]:
for train, test in cv_split.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )

train -  [179 289]   |   test -  [21 31]
train -  [178 290]   |   test -  [22 30]
train -  [185 283]   |   test -  [15 37]
train -  [179 289]   |   test -  [21 31]
train -  [180 288]   |   test -  [20 32]
train -  [186 282]   |   test -  [14 38]
train -  [180 288]   |   test -  [20 32]
train -  [181 287]   |   test -  [19 33]
train -  [187 281]   |   test -  [13 39]
train -  [174 294]   |   test -  [26 26]


In [48]:
sklearn.metrics.precision_recall_fscore_support(y, y_pred, average="binary")

(0.9907120743034056, 1.0, 0.995334370139969, None)

In [65]:
%%time
# Train the grid search model
gs = GridSearchCV(
    pipeline,
    params,
    cv=cv_split,
    n_jobs=-1,
    scoring="f1_macro",
    return_train_score=True,
).fit(sylhet_copy, y)

TypeError: estimator should be an estimator implementing 'fit' method, <module 'sklearn.pipeline' from '/Users/c92680/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py'> was passed

In [66]:
column_trans = ColumnTransformer(
    [("one_hot", OneHotEncoder(drop="if_binary"), list(sylhet_copy.columns.values),)],
    remainder="drop",
)

X = column_trans.fit_transform(sylhet_copy)

In [67]:
clf1 = naive_bayes.BernoulliNB()
clf2 = naive_bayes.GaussianNB()
clf3 = linear_model.LogisticRegression()
clf4 = tree.DecisionTreeClassifier(random_state=42)
clf5 = ensemble.RandomForestClassifier(random_state=42)

In [68]:
my_models = [clf1, clf2, clf3, clf4, clf5]

In [69]:
def getScores(estimator, X, y):
    yPred = estimator.predict(X)
    prec, recall, f1, support = sklearn.metrics.precision_recall_fscore_support(
        y, y_pred, average="binary"
    )


from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

cv_split = model_selection.ShuffleSplit(n_splits=10, random_state=42)

# scoring = {'acc': 'accuracy',
#            'prec_macro': 'precision_macro',
#            'rec_micro': 'recall_macro',
#            'f1': 'f1_macro'
#           }

scoring = ["accuracy", "precision", "recall", "f1_macro"]
scores = cross_validate(
    clf1, X, y, scoring=scoring, cv=cv_split, return_train_score=False
)

scoring = ["accuracy", "precision", "recall", "f1_macro"]
scores = cross_validate(clf1, X, y, scoring=scoring, cv=5, return_train_score=True)

In [70]:
score_cols = [
    "Model Name",
    "Precision",
    "Recall",
    "F1-macro",
    "F1-weighted",
    "Train Accuracy",
    "Test Accuracy",
    "CV-Method",
]
model_compare = pd.DataFrame(columns=score_cols)

In [71]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [72]:
params

[{'classifier': [BernoulliNB()]},
 {'classifier': [GaussianNB()]},
 {'classifier': [LogisticRegression(random_state=42)]},
 {'classifier': [DecisionTreeClassifier(random_state=42)]},
 {'classifier': [RandomForestClassifier(random_state=42)]}]

In [75]:
row_index = 0
for alg in my_models:
    MLA_name = alg.__class__.__name__
    model_compare.loc[row_index, "Model Name"] = MLA_name
    model_compare.loc[row_index, "Model Parameters"] = str(alg.get_params())
    scoring = ["accuracy", "precision", "recall", "f1_macro", "f1_weighted"]
    # scores = cross_validate(alg, X,y, scoring=scoring, cv=skf, return_train_score=True)

    gs = GridSearchCV(
        pipeline,
        param_grid=params[3],
        cv=skf,
        n_jobs=-1,
        scoring="f1_macro",
        return_train_score=False,
    ).fit(sylhet_copy, y)

    scores2 = pd.DataFrame(scores).mean()
    model_compare.loc[row_index, "Precision"] = scores2["test_precision"]
    model_compare.loc[row_index, "Recall"] = scores2["test_recall"]
    model_compare.loc[row_index, "F1-macro"] = scores2["test_f1_macro"]
    model_compare.loc[row_index, "F1-weighted"] = scores2["test_f1_weighted"]
    model_compare.loc[row_index, "Train Accuracy"] = scores2["train_accuracy"]
    model_compare.loc[row_index, "Test Accuracy"] = scores2["test_accuracy"]
    model_compare.loc[row_index, "CV-Method"] = "StratifiedKFold"

    # class_stats = sklearn.metrics.precision_recall_fscore_support(
    # y, y_pred, labels=[0, 1], pos_label=1)
    row_index += 1

TypeError: estimator should be an estimator implementing 'fit' method, <module 'sklearn.pipeline' from '/Users/c92680/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py'> was passed

In [None]:
model_compare

In [None]:
for train, test in skf.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )

In [None]:
cv_split = model_selection.ShuffleSplit(
    n_splits=10, random_state=42, train_size=0.8, test_size=0.2
)

In [None]:
for train, test in cv_split.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )

In [76]:
kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

In [77]:
for alg in my_models:
    MLA_name = alg.__class__.__name__
    model_compare.loc[row_index, "Model Name"] = MLA_name
    model_compare.loc[row_index, "Model Parameters"] = str(alg.get_params())
    scoring = ["accuracy", "precision", "recall", "f1_macro", "f1_weighted"]
    scores = cross_validate(
        alg, X, y, scoring=scoring, cv=kfold, return_train_score=True
    )
    scores2 = pd.DataFrame(scores).mean()
    model_compare.loc[row_index, "Precision"] = scores2["test_precision"]
    model_compare.loc[row_index, "Recall"] = scores2["test_recall"]
    model_compare.loc[row_index, "F1-macro"] = scores2["test_f1_macro"]
    model_compare.loc[row_index, "F1-weighted"] = scores2["test_f1_weighted"]
    model_compare.loc[row_index, "Train Accuracy"] = scores2["train_accuracy"]
    model_compare.loc[row_index, "Test Accuracy"] = scores2["test_accuracy"]
    model_compare.loc[row_index, "CV-Method"] = "KFold"
    row_index += 1

In [78]:
model_compare.sort_values("Model Name")

Unnamed: 0,Model Name,Precision,Recall,F1-macro,F1-weighted,Train Accuracy,Test Accuracy,CV-Method,Model Parameters
0,BernoulliNB,0.938271,0.852023,0.866484,0.875056,0.878846,0.873077,KFold,"{'alpha': 1.0, 'binarize': 0.0, 'class_prior':..."
3,DecisionTreeClassifier,0.962196,0.962824,0.950518,0.953904,0.994231,0.953846,KFold,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit..."
1,GaussianNB,0.90011,0.885403,0.861522,0.869904,0.881838,0.869231,KFold,"{'priors': None, 'var_smoothing': 1e-09}"
2,LogisticRegression,0.945194,0.920816,0.914172,0.919536,0.945085,0.919231,KFold,"{'C': 1.0, 'class_weight': None, 'dual': False..."
4,RandomForestClassifier,0.974924,0.987336,0.975087,0.976886,0.994231,0.976923,KFold,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."


In [79]:
param = model_compare.loc[model_compare["Model Name"] == "DecisionTreeClassifier"][
    "Model Parameters"
]

In [80]:
dict(param)

{3: "{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}"}

In [81]:
model = clf1.fit(X, y)

In [82]:
sklearn.metrics.precision_recall_fscore_support(
    y, model.predict(X), labels=[0, 1], pos_label=1
)

(array([0.7965368 , 0.94463668]),
 array([0.92    , 0.853125]),
 array([0.85382831, 0.89655172]),
 array([200, 320]))

In [83]:
sklearn.metrics.precision_recall_fscore_support(y, model.predict(X), average="binary")

(0.9446366782006921, 0.853125, 0.896551724137931, None)

In [None]:
def getScores(estimator, x, y):
    yPred = estimator.predict(x)
    return (
        accuracy_score(y, yPred),
        precision_score(y, yPred, pos_label=3, average="macro"),
        recall_score(y, yPred, pos_label=3, average="macro"),
    )

In [None]:
def my_scorer(estimator, x, y):
    a, p, r = getScores(estimator, x, y)
    print a, p, r
    return a + p + r


for model, name in zip(models, names):
    print name
    start = time.time()
    m = cross_val_score(model, iris.data, iris.target, scoring=my_scorer, cv=10).mean()
    print "\nSum:", m, "\n\n"
    print "time", time.time() - start, "\n\n"

In [None]:
prec, recall, f1, support = sklearn.metrics.precision_recall_fscore_support(
    y, y_pred, average="binary"
)

In [None]:
gs.cv_results_["params"]

In [None]:
gs.cv_results_["mean_train_score"]

In [None]:
(
    gs.cv_results_["split0_train_score"]
    + gs.cv_results_["split1_train_score"]
    + gs.cv_results_["split2_train_score"]
) / 3

In [None]:
gs.cv_results_["mean_train_score"]
gs.cv_results_["mean_test_score"]

In [None]:
gs.classes_

In [None]:
gs.get_params()

In [None]:
cv_split = model_selection.ShuffleSplit(
    n_splits=5, train_size=0.8, test_size=0.2, random_state=42
)

In [None]:
for train, test in cv_split.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )
for train, test in cv_split.split(X, y):
    print(
        "train -  {t1} |   test - {t2} ".format(
            t1=np.round(np.bincount(y[train]) / np.bincount(y[train]).sum(), 2),
            t2=np.round(np.bincount(y[test]) / np.bincount(y[test]).sum(), 2),
        )
    )

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
for train, test in skf.split(X, y):
    print(
        "train -  {}   |   test -  {}".format(
            np.bincount(y[train]), np.bincount(y[test])
        )
    )

In [None]:
%%time
# Train the grid search model
gs = GridSearchCV(
    pipeline, params, cv=skf, n_jobs=-1, scoring="f1_macro", return_train_score=True,
).fit(sylhet_copy, y)

In [None]:
gs.cv_results_

In [None]:
cv_results["train_score"]

In [None]:
gs.cv_results_

In [None]:
# Best performing model and its corresponding hyperparameters
gs.best_params_

In [None]:
gs.best_score_

In [None]:
gs.cv_results_

In [None]:
gs.score(test1_x, test1_y)

In [None]:
# Test data performance
print("Test Precision:", precision_score(gs.predict(X_test), y_test))
print("Test Recall:", recall_score(gs.predict(X_test), y_test))
print("Test ROC AUC Score:", roc_auc_score(gs.predict(X_test), y_test))

In [None]:
dt = Pipeline([("column_trans", column_trans),])

In [None]:
search = GridSearchCV

In [None]:
dt.fit(train1_x, train1_y)

In [None]:
pipeline = Pipeline([("classifier", clf1)])
params = [param1, param2]

In [None]:
dt2 = Pipeline(
    [
        ("column_trans", column_trans),
        (
            "classifier",
            tree.DecisionTreeClassifier(
                random_state=42, max_depth=3, min_samples_leaf=10, min_samples_split=10
            ),
        ),
    ]
)

dt2.fit(X_train, y_train)

feature_names = dt2[:-1].get_feature_names_out()
mdi_importances2 = pd.Series(
    dt2[-1].feature_importances_, index=feature_names
).sort_values(ascending=True)

In [None]:
# Parameters of pipelines can be set using '__' separated parameter names:
param_grid = {
    "classifier__max_depth": [1, 3, 5],
    "classifier__min_samples_leaf": [5, 10, 15, 20],
    "classifier__min_samples_split": [5, 10, 15, 20],
}
search = GridSearchCV(dt, param_grid, n_jobs=2)
search.fit(sylhet_copy, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

## Testing age variable

In [None]:
age_binned = pd.qcut(sylhet_copy["age"], q=6)
sylhet_copy["age_binned"] = age_binned
sylhet_copy["age"] = sylhet_copy["age"].astype("float")

In [None]:
print(list(sylhet_copy.columns.values)[1:])
column_trans = ColumnTransformer(
    [
        (
            "one_hot",
            OneHotEncoder(drop="if_binary"),
            list(sylhet_copy.columns.values)[1:],
        )
    ],
    remainder="drop",
)
column_trans.fit(sylhet_copy)

In [None]:
column_trans.get_feature_names_out()

In [None]:
X = column_trans.transform(sylhet_copy)

In [None]:
print(list(sylhet_copy.columns.values)[1:])
column_trans_c = ColumnTransformer(
    [
        (
            "one_hot",
            OneHotEncoder(drop="if_binary"),
            list(sylhet_copy.columns.values)[1:-1],
        )
    ],
    remainder="passthrough",
)
column_trans_c.fit(sylhet_copy)
column_trans_c.get_feature_names_out()

In [None]:
X_c = column_trans_c.transform(sylhet_copy)

In [None]:
X_c[0]

In [None]:
X_c = X_c[:, :-1]

In [None]:
X_c[0]

In [None]:
MLA = [
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    linear_model.LogisticRegression(),
    tree.DecisionTreeClassifier(),
    ensemble.RandomForestClassifier(),
]

In [None]:
model_compare = pd.DataFrame(columns=score_cols)

results_conf = pd.DataFrame(
    columns=[
        "true_negative",
        "false_positive",
        "false_negative",
        "true_positive",
        "model",
        "experiment",
    ]
)
results = pd.DataFrame(
    index=["negative", "positive", "model"],
    columns=["precision", "recall", "fbeta", "support", "experiment"],
)

for X in [X, X_c]:
    train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
        X, y, random_state=42, train_size=0.8, test_size=0.2, stratify=y
    )

    score_cols = [
        "Model Name",
        "Precision",
        "Recall",
        "F1",
        "Train Accuracy",
        "Test Accuracy",
    ]

    row_index = 0
    for alg in MLA:
        print(alg.__class__.__name__)
        MLA_name = alg.__class__.__name__
        model = alg.fit(train1_x, train1_y)
        y_pred = model.predict(test1_x)
        print(sklearn.metrics.confusion_matrix(test1_y, y_pred), "\n")
        flattened = sklearn.metrics.confusion_matrix(test1_y, y_pred).ravel()
        confusion = pd.DataFrame(
            [flattened],
            columns=[
                "true_negative",
                "false_positive",
                "false_negative",
                "true_positive",
            ],
        )
        confusion["model"] = MLA_name
        confusion["experiment"] = row_index
        results_conf = pd.concat([results_conf, confusion])
        prec, recall, f1, support = sklearn.metrics.precision_recall_fscore_support(
            test1_y, y_pred, average="binary"
        )
        class_stats = sklearn.metrics.precision_recall_fscore_support(
            test1_y, y_pred, labels=[0, 1], pos_label=1
        )
        class_stats = np.array(class_stats).T
        stats = pd.DataFrame(
            class_stats,
            index=["negative", "positive"],
            columns=["precision", "recall", "fbeta", "support"],
        )
        stats["model"] = MLA_name
        stats["experiment"] = row_index
        results = pd.concat([results, stats], ignore_index=False)
        model_compare.loc[row_index, "Model Name"] = MLA_name
        model_compare.loc[row_index, "Precision"] = prec
        model_compare.loc[row_index, "Recall"] = recall
        model_compare.loc[row_index, "F1"] = f1
        model_compare.loc[row_index, "Train Accuracy"] = model.score(train1_x, train1_y)
        model_compare.loc[row_index, "Test Accuracy"] = model.score(test1_x, test1_y)

        row_index += 1
    results = results.dropna()
    results_conf = results_conf.dropna()
    print(model_compare)

In [None]:
results

In [None]:
# result_conf

In [None]:
model_compare