# Compare ML models
---------


### Author Information
**Author:** PJ Gibson  
**Email:** Peter.Gibson@doh.wa.gov  
**Github:**   https://github.com/DOH-PJG1303

### Project Information
**Created Date:** 2023-05-23  
**Last Updated:** 2023-05-23  
**Version:** 1  

### Description
This notebook introduces newcomers to Python to comparing machine learning techniques in the context of record linkage.

### Notes

## 1. Import libs

In [None]:
# Data Analysis Libs
import pandas as pd
import numpy as np

# ML Libs / functions
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

## 2. Data Prep

### 2.1 Read in training data

Multi-index data with the following columns:
* fname
* lname
* dob
* phone
* add
* label

In [None]:
# Load the labelled training pairs; the first two CSV columns form the row MultiIndex
df_training = pd.read_csv(
    filepath_or_buffer='Data/synthetic_training_data.csv',
    index_col=[0, 1],
)

### 2.2 Test Train Split

In [None]:
# Separate the target column (y) from the feature columns (X)
y = df_training['label']
X = df_training.drop(columns='label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Model Work

### 3.1 Define models and param search spaces

In [None]:
# Candidate classifiers to compare.
# Each entry is (display name, unfitted estimator, hyper-parameter grid for GridSearchCV).
models = [
    (
        'Logistic Regression',
        LogisticRegression(),
        {
            'solver': ['liblinear'],
            'penalty': ['l1', 'l2'],
            'C': [0.1, 1, 10],
        },
    ),
    # Empty grid: Naive Bayes is fitted once with its default settings
    (
        'Naive Bayes',
        GaussianNB(),
        {},
    ),
    (
        'Random Forest',
        RandomForestClassifier(),
        {
            'n_estimators': [10, 100, 1000],
            'max_depth': [None, 10, 20],
        },
    ),
    (
        'Gradient Boosting',
        GradientBoostingClassifier(),
        {
            'n_estimators': [10, 100, 1000],
            'learning_rate': [0.1, 0.01, 0.001],
        },
    ),
    # NOTE(review): scikit-learn documents MLPClassifier's `learning_rate` as only
    # used when solver='sgd'; MLPClassifier() here keeps the default solver, so this
    # grid dimension may only multiply the number of fits -- confirm before trimming.
    (
        'MLP',
        MLPClassifier(),
        {
            'hidden_layer_sizes': [(50,), (100,)],
            'activation': ['relu', 'tanh', 'logistic'],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
        },
    ),
]

### 3.2 Train, Test, Evaluate


#### NOTE: the cell below took nearly 3 hours to run

In [None]:
# Column order of the results table (one row per model type)
result_columns = [
    'model_type', 'model_parameters',
    'num_true_positives', 'num_false_positives',
    'num_true_negatives', 'num_false_negatives',
    'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc',
]

# Start with an empty table so `results` exists even if `models` is empty
results = pd.DataFrame(columns=result_columns)

# One dict per evaluated model. Rows are collected in a plain list because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
result_rows = []

# For each model
for name, model, params in models:

    # Perform a grid search with 5-fold cross-validation on the training split
    grid = GridSearchCV(model, params, cv=5)
    grid.fit(X_train, y_train)

    # Get the best estimator found by the search
    best_model = grid.best_estimator_

    # Make predictions on the held-out test split
    y_pred = best_model.predict(X_test)
    # Column 1 is taken as the positive-class score
    # NOTE(review): assumes binary labels with the positive class listed second -- confirm
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    # The 4-way unpacking requires a 2x2 confusion matrix (two classes present)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    # Record this model's row
    result_rows.append({
        'model_type': name,
        'model_parameters': grid.best_params_,
        'num_true_positives': tp,
        'num_false_positives': fp,
        'num_true_negatives': tn,
        'num_false_negatives': fn,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
    })

    # Rebuild the table every iteration (cheap: one row per model) so partial
    # results remain available if this long-running cell is interrupted.
    results = pd.DataFrame(result_rows, columns=result_columns)

    print(f'{name} training complete!')

results

## 4. Save

In [None]:
# Persist the comparison table; the positional row index is not written out
results.to_csv(
    path_or_buf='./Data/model_comparisons.csv',
    index=False,
)