In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, accuracy_score

In [2]:
df = pd.read_csv('../data_clean/AA_Supervised_dataset.csv')
df.head()

Unnamed: 0,afspraak_Arbeidsmarkt,afspraak_Bedrijfsbeheer,afspraak_Duurzaamheid,afspraak_Familiebedrijfsmanagement,afspraak_Financieel,afspraak_Groeien en Netwerking,afspraak_Internationaal Ondernemen,afspraak_Lidmaatschap,afspraak_Logistiek en Transport,afspraak_Plato & Bryo,...,4,5,0_campagne_type,1_campagne_type,2_campagne_type,3_campagne_type,4_campagne_type,5_campagne_type,Online,Offline
0,0,0,0,0,0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
2,0,0,0,0,0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
4,0,0,0,0,0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 54 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   afspraak_Arbeidsmarkt               300000 non-null  int64  
 1   afspraak_Bedrijfsbeheer             300000 non-null  int64  
 2   afspraak_Duurzaamheid               300000 non-null  int64  
 3   afspraak_Familiebedrijfsmanagement  300000 non-null  int64  
 4   afspraak_Financieel                 300000 non-null  int64  
 5   afspraak_Groeien en Netwerking      300000 non-null  int64  
 6   afspraak_Internationaal Ondernemen  300000 non-null  int64  
 7   afspraak_Lidmaatschap               300000 non-null  int64  
 8   afspraak_Logistiek en Transport     300000 non-null  int64  
 9   afspraak_Plato & Bryo               300000 non-null  int64  
 10  afspraak_Technologie en Innovatie   300000 non-null  int64  
 11  afspraak_Welt             

In [5]:
# opsplitting in X en y
X = df.drop('ingeschreven', axis=1)
y = df['ingeschreven']

In [7]:
# opsplitting in train en test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=df['ingeschreven'])
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((210000, 53), (90000, 53), (210000,), (90000,))

In [8]:
best_models = []

def models_gs(model):
    try:
        grid_search = GridSearchCV(model["model"], param_grid=model["param_grid"], cv=5, scoring="accuracy", return_train_score=True, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    
        best_model = grid_search.best_estimator_.named_steps[model["name"].lower()]
        return (best_model, grid_search.best_score_)
    
    except Exception as e:
        print(f"{model['name']} failed with error: {e}")

In [9]:
non_ensemble_models_param_grid = [
   {
      "name":"linearsvc",
      "model": LinearSVC(random_state=42),
      "param_grid": {
         "linearsvc__C":[0.1, 0.5, 1, 2.5, 5, 10, 20, 50, 75, 100, 125, 150, 200, 250, 500, 1000],
         "linearsvc__penalty": ["l2"],
         "linearsvc__loss": ["loss", "log_loss", "hinge", "squared_hinge"],
      }
   },
   {
      "name":"sgdclassifier",
      "model": SGDClassifier(loss="log", random_state=42), # loss="log" because we want to use the predict_proba method, this is not possible with hinge loss
      "param_grid":{
         "sgdclassifier__alpha":[0.001, 0.01, 0.1, 0.5, 1],
         "sgdclassifier__penalty":["l2"],
         "sgdclassifier__max_iter": [1000]
      }
   },
   {
      "name":"logisticregression",
      "model": LogisticRegression(random_state=42, max_iter=1000),
      "param_grid":{
         "logisticregression__C":[0.1, 0.5, 1, 2.5, 5, 10, 20, 50, 75, 100, 125, 150, 200, 250, 500, 1000],
         "logisticregression__penalty":["l2"]
      }
   },
   {
      "name":"decisiontreeclassifier",
      "model": DecisionTreeClassifier(random_state=42),
      "param_grid":{
         "decisiontreeclassifier__max_depth":[5, 10, 15, 20, 25, 30],
         "decisiontreeclassifier__min_samples_split":[2, 5, 10, 15],
         "decisiontreeclassifier__min_samples_leaf":[2, 5, 10, 15],
      }
   },
]

In [10]:
for model in non_ensemble_models_param_grid:
  print(f'Starting {model["name"]}...')
  gs = models_gs(model)
  print(gs)
  best_models.append({
      "name": model["name"],
      "model": gs[0],
      "gs_score": gs[1],
  })

Starting linearsvc...
linearsvc failed with error: Invalid parameter 'linearsvc' for estimator LinearSVC(random_state=42). Valid parameters are: ['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'].
None


TypeError: 'NoneType' object is not subscriptable

In [None]:
for model in best_models:
  cv_score = cross_val_score(model["model"], X_train, y_train, cv=5, scoring="accuracy", n_jobs=-1)
  print(f'{model["name"]} - cross validation scores: {np.mean(cv_score)}')

In [None]:
rfc = {
      "name":"randomforestclassifier",
      "model": RandomForestClassifier(random_state=42, n_jobs=-1),
      "param_grid":{
         "randomforestclassifier__n_estimators":[50, 75, 100, 125, 150, 175, 200],
         "randomforestclassifier__min_samples_split":[2, 5, 10],
         "randomforestclassifier__min_samples_leaf":[2, 5, 10],
      }
    }

In [None]:
print(f"Starting {rfc['name']}...")
gs = models_gs(rfc)
print(gs)
best_models.append({
    "name": f"{rfc['name']}",
    "model": gs[0],
    "gs_score": gs[1],
})

In [None]:
voting_clfs = [
    {
        "name": "votingclassifier",
        "diff": "hard",
        "model": VotingClassifier(estimators=[
            ('linsvc', best_models[0]['model']),
            ('sgd', best_models[1]['model']),
            ('lr', best_models[2]['model']),
            ('dt', best_models[3]['model']),
            ('rf', best_models[4]['model']),
        ], voting='hard', n_jobs=-1),
        "param_grid": {
            'votingclassifier__weights': [
                [1, 1, 1, 1, 1],
                [1, 1, 1, 1, 2], 
                [1, 1, 1, 2, 2], 
                [1, 1, 2, 2, 2], 
                [1, 2, 2, 2, 2],
                [2, 2, 2, 2, 2]
            ]
        }
    },
    {
        "name": "votingclassifier",
        "diff": "soft",
        "model": VotingClassifier(estimators=[
            ('sgd', best_models[1]['model']),
            ('lr', best_models[2]['model']),
            ('dt', best_models[3]['model']),
            ('rf', best_models[4]['model']),
        ], voting='soft', n_jobs=-1),
        "param_grid": {
            'votingclassifier__weights': [
                [1, 1, 1, 1], 
                [1, 1, 1, 2], 
                [1, 1, 2, 2], 
                [1, 2, 2, 2], 
                [2, 2, 2, 2]
            ]
        }
    }
]

In [None]:
for model in voting_clfs:
    print(f"Starting {model['diff']}-voting model...")
    gs = models_gs({
        "name": model["name"],
        "model": model["model"],
        "param_grid": model["param_grid"]
    })
    print(gs)
    best_models.append({
        "name": f"voting_{model['diff']}",
        "model": gs[0],
        "gs_score": gs[1],
    })

In [None]:
bagging_clf = {
    "name": 'BaggingClassifier',
    "model": BaggingClassifier(random_state=42, n_jobs=-1, oob_score=True),
    "param_grid": {
        "baggingclassifier__estimator": [best_models[3]["model"]],
        "baggingclassifier__n_estimators": [10, 50, 100, 150],
        "baggingclassifier__bootstrap": [True, False],
        "baggingclassifier__max_samples": [0.5, 1.0, 2.0, 5.0]
    }
}

In [None]:
print(f"Starting {bagging_clf['name']}...")
gs = models_gs(bagging_clf)
print(gs)
print(f"OOB score: {gs[0].oob_score_}")
best_models.append({
    "name": f"{bagging_clf['name']}",
    "model": gs[0],
    "gs_score": gs[1],
})

In [None]:
adaboost_clf = {
    "name": 'AdaBoostClassifier',
    "model": AdaBoostClassifier(random_state=42),
    "param_grid": {
        "adaboostclassifier__base_estimator": [best_models[3]["model"]],
        "adaboostclassifier__n_estimators": [10, 50, 100, 150],
        "adaboostclassifier__learning_rate": [0.1, 0.5, 1.0, 2.0],
        "adaboostclassifier__algorithm": ['SAMME.R']
    }
}

In [None]:
print(f"Starting {adaboost_clf['name']}...")
gs = models_gs(adaboost_clf)
print(gs)
best_models.append({
    "name": f"{adaboost_clf['name']}",
    "model": gs[0],
    "gs_score": gs[1],
})

In [None]:
gradient_boosting_clf = {
    "name": 'GradientBoostingClassifier',
    "model": GradientBoostingClassifier(random_state=42),
    "param_grid": {
        "gradientboostingclassifier__n_estimators": [10, 50, 100, 150],
        "gradientboostingclassifier__learning_rate": [0.1, 0.5, 1.0, 2.0],
        "gradientboostingclassifier__min_samples_split": [2, 5, 10],
    }
}

In [None]:
print(f"Starting {gradient_boosting_clf['name']}...")
gs = models_gs(gradient_boosting_clf)
print(gs)
best_models.append({
    "name": f"{gradient_boosting_clf['name']}",
    "model": gs[0],
    "gs_score": gs[1],
})

In [None]:
estimators = [
    ('sgd', best_models[1]['model']),
    ('lr', best_models[2]['model']),
    ('dt', best_models[3]['model']),
    ('rf', best_models[4]['model']),
]

final_estimators = [
    ('sgd', SGDClassifier(random_state=42, loss='log')),
    ('lr', LogisticRegression(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
]

stacking_clfs = [
    {
        "name": 'StackingClassifier',
        "diff": final_estimators[0][0],
        "model": StackingClassifier(estimators=estimators, final_estimator=final_estimators[0][1], n_jobs=-1),
        "param_grid": {}
    },
    {
        "name": 'StackingClassifier',
        "diff": final_estimators[1][0],
        "model": StackingClassifier(estimators=estimators, final_estimator=final_estimators[1][1], n_jobs=-1),
        "param_grid": {}
    },
    {
        "name": 'StackingClassifier',
        "diff": final_estimators[2][0],
        "model": StackingClassifier(estimators=estimators, final_estimator=final_estimators[2][1], n_jobs=-1),
        "param_grid": {}
    },
    {
        "name": 'StackingClassifier',
        "diff": final_estimators[3][0],
        "model": StackingClassifier(estimators=estimators, final_estimator=final_estimators[3][1], n_jobs=-1),
        "param_grid": {}
    }
]

In [None]:
for i in stacking_clfs:
    print(f"Starting {i['name']}_{i['diff']}...")
    gs = models_gs(i)
    print(gs)
    best_models.append({
        "name": f"{i['name']}_{i['diff']}",
        "model": gs[0],
        "gs_score": gs[1],
    })

## Dataframe for later comparison

In [None]:
model_comparison = pd.DataFrame(columns=["Name", "gs_score", "cv_score", "precision", "recall", "f1", "roc_auc"])

In [None]:
for model in best_models:
    scores = cross_val_score(model["model"], X_train, y_train, cv=5, scoring="accuracy", n_jobs=-1)
    y_train_pred = cross_val_predict(model["model"], X_train, y_train, cv=5, n_jobs=-1)

    precision = precision_score(y_train, y_train_pred)
    recall = recall_score(y_train, y_train_pred)
    f1 = f1_score(y_train, y_train_pred)
    roc_auc = roc_auc_score(y_train, y_train_pred)

    model_comparison = model_comparison.append({
        "Name": model["name"],
        "gs_score": model["gs_score"],
        "cv_score": scores.mean(),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }, ignore_index=True)

In [None]:
model_comparison

In [None]:
model_comparison.sort_values(by="precision", ascending=False)

In [None]:
model_comparison_test = pd.DataFrame(columns=["Name", "accuracy", "precision", "recall", "f1", "roc_auc"])

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
  plt.figure(figsize=(6, 5))
  plt.plot(fpr, tpr, linewidth=2, label=label)
  plt.plot([0, 1], [0, 1], 'k--', label="Random classifier's ROC curve")
  plt.title(f"{label} - Test")
  plt.legend(loc="lower right", fontsize=13)
  plt.xlabel("True Positive Rate (Recall)")
  plt.ylabel("False Positive Rate (Fall-Out)")
  plt.grid(True)
  plt.axis([0, 1, 0, 1])
  plt.figure(figsize=(10,5))

In [None]:
for model in best_models:
    X_test_scaled = col_transformer.fit_transform(X_test)
    y_test_pred = model["model"].predict(X_test_scaled)

    plt.figure()
    ConfusionMatrixDisplay.from_predictions(y_test, y_test_pred, normalize="true", values_format='.0%')
    plt.title(f"{model['name']} - Test")
    plt.show()

    score = accuracy_score(y_test, y_test_pred) 
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, y_test_pred)

    fpr, tpr, thresholds = roc_curve(y_test, y_test_pred)
    plt.figure()
    plot_roc_curve(fpr, tpr, label=model["name"])
    plt.show()
    
    print(f"{model['name']} - Test")
    print(f"Accuracy: {score.mean()}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")
    print(f"ROC AUC: {roc_auc}")
    print(75 * "*")

    model_comparison_test = model_comparison_test.append({
        "Name": model["name"],
        "accuracy": score,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }, ignore_index=True)

In [None]:
model_comparison_test

In [None]:
# import pickle as pkl

# stacking_sgd = best_models[10]
# filename = f'models/weblogs_{stacking_sgd["name"]}.pkl'
# with open(filename, 'wb') as file:
#     pkl.dump(stacking_sgd["model"], file)