# Model Selection
---

In [1]:
import joblib as jl
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Compute accuracy, precision, recall and AUPRC for a single fitted model
def compute_metrics(model, data):
    """Evaluate a fitted binary classifier on a feature/label matrix.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``. If it also exposes
        ``predict_proba`` or ``decision_function``, that output is used
        as the ranking score for the AUPRC.
    data
        2-D array; every column but the last is passed to the model as
        features, and the last column is used as the true labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, auprc)``.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.metrics`` has been loaded as an attribute.
    from sklearn import metrics as sk_metrics

    features, y_true = data[:, :-1], data[:, -1]
    y_pred = model.predict(features)

    # Average precision summarises the whole precision-recall curve, so it
    # needs continuous scores. Feeding it hard 0/1 predictions reduces the
    # curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        # NOTE(review): assumes the positive class is the second entry of
        # ``model.classes_`` (true for 0/1 labels) -- confirm.
        y_score = model.predict_proba(features)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(features)
    else:
        # No score available: fall back to the hard predictions.
        y_score = y_pred

    accuracy = sk_metrics.accuracy_score(y_true, y_pred)
    precision = sk_metrics.precision_score(y_true, y_pred)
    recall = sk_metrics.recall_score(y_true, y_pred)
    auprc = sk_metrics.average_precision_score(y_true, y_score)
    return accuracy, precision, recall, auprc

### Models With Only Symptoms One Hot

In [None]:
# Deserialize the three classifiers stored under the models directory
path = '../prediction_model/models/'
names = ['log_reg_classsic', 'random_forest_classic', 'MLP_classic']
log_reg, random_forest, MLP = [jl.load(f'{path}{stem}.joblib') for stem in names]

# Open the symptom feature archive
path = '../prediction_model/features/'
names = ['symptoms']
symptoms = np.load(f'{path}{names[0]}.npz')

# Put the 'y' array after the 'X' columns so the labels end up last
data = np.concatenate([symptoms[key] for key in ('X', 'y')], axis=1)

In [None]:
# Build a table with one row per model and one column per metric
metrics = ['accuracy', 'precision', 'recall', 'AUPRC']

# `names` holds the feature file names at this point (it was reassigned when
# the data was loaded). Rebind it to the model labels so the table index and
# the x axis of the plots that follow describe the same three rows.
names = ['log_reg', 'random_forest', 'MLP']
df = pd.DataFrame(index=pd.Index(names, name='model'), columns=metrics, dtype=float)

df.loc['log_reg', metrics] = compute_metrics(log_reg, data)
df.loc['random_forest', metrics] = compute_metrics(random_forest, data)
df.loc['MLP', metrics] = compute_metrics(MLP, data)

df

In [None]:
# Take the x labels from the table itself so they always line up with the
# plotted values; rows for which no metric was computed are skipped.
scores = df.dropna(how='all').astype(float)
labels = scores.index.tolist()

# plot the accuracies
fig, ax = plt.subplots()
ax.bar(labels, scores['accuracy'])
ax.set_ylabel('accuracy')
ax.set_title('Accuracy of the models')
plt.show()

# plot precision and recall, using two axis
fig, ax = plt.subplots()
ax.plot(labels, scores['precision'], color='red')
ax.set_ylabel('precision', color='red')
ax.tick_params(axis='y', labelcolor='red')
ax.set_title('Precision and recall of the models')
# second y axis sharing the same x axis, so recall gets its own scale
ax2 = ax.twinx()
ax2.plot(labels, scores['recall'], color='blue')
ax2.set_ylabel('recall', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')
plt.show()

# plot AUPRC
fig, ax = plt.subplots()
ax.bar(labels, scores['AUPRC'])
ax.set_ylabel('AUPRC')
ax.set_title('AUPRC of the models')
plt.show()

### Models With New Features

In [None]:
# Deserialize the three "mix" classifiers stored under the models directory
path = '../prediction_model/models/'
names = ['log_reg_mix', 'random_forest_mix', 'MLP_mix']
log_reg, random_forest, MLP = [jl.load(f'{path}{stem}.joblib') for stem in names]

# Open the three feature archives
path = '../prediction_model/features/'
names = ['betweenness', 'community_count', 'community_size']
betweenness, community_count, community_size = [
    np.load(f'{path}{stem}.npz') for stem in names
]

# Build one evaluation matrix per model; the 'y' array of community_count
# goes last in each. The MLP matrix leaves out the betweenness features.
data_log_reg = np.concatenate(
    [betweenness['X'], community_count['X'], community_size['X'], community_count['y']],
    axis=1,
)
data_random_forest = np.concatenate(
    [betweenness['X'], community_count['X'], community_size['X'], community_count['y']],
    axis=1,
)
data_MLP = np.concatenate(
    [community_count['X'], community_size['X'], community_count['y']],
    axis=1,
)


In [None]:
# Build a table with one row per model and one column per metric
metrics = ['accuracy', 'precision', 'recall', 'AUPRC']

# `names` holds the feature file names at this point (it was reassigned when
# the data was loaded). Rebind it to the model labels so the table index and
# the x axis of the plots that follow describe the same three rows.
names = ['log_reg', 'random_forest', 'MLP']
df = pd.DataFrame(index=pd.Index(names, name='model'), columns=metrics, dtype=float)

df.loc['log_reg', metrics] = compute_metrics(log_reg, data_log_reg)
df.loc['random_forest', metrics] = compute_metrics(random_forest, data_random_forest)
df.loc['MLP', metrics] = compute_metrics(MLP, data_MLP)

df

In [None]:
# Take the x labels from the table itself so they always line up with the
# plotted values; rows for which no metric was computed are skipped.
scores = df.dropna(how='all').astype(float)
labels = scores.index.tolist()

# plot the accuracies
fig, ax = plt.subplots()
ax.bar(labels, scores['accuracy'])
ax.set_ylabel('accuracy')
ax.set_title('Accuracy of the models')
plt.show()

# plot precision and recall, using two axis
fig, ax = plt.subplots()
ax.plot(labels, scores['precision'], color='red')
ax.set_ylabel('precision', color='red')
ax.tick_params(axis='y', labelcolor='red')
ax.set_title('Precision and recall of the models')
# second y axis sharing the same x axis, so recall gets its own scale
ax2 = ax.twinx()
ax2.plot(labels, scores['recall'], color='blue')
ax2.set_ylabel('recall', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')
plt.show()

# plot AUPRC
fig, ax = plt.subplots()
ax.bar(labels, scores['AUPRC'])
ax.set_ylabel('AUPRC')
ax.set_title('AUPRC of the models')
plt.show()


### Compare the two best models

In [None]:
# Deserialize the two classifiers being compared
path = '../prediction_model/models/'
names = ['log_reg_classsic', 'random_forest_mix']
classic, mix = [jl.load(f'{path}{stem}.joblib') for stem in names]

# Open the four feature archives
path = '../prediction_model/features/'
names = ['symptoms', 'betweenness', 'community_count', 'community_size']
symptoms, betweenness, community_count, community_size = [
    np.load(f'{path}{stem}.npz') for stem in names
]

# One evaluation matrix per model, with the 'y' array placed last
data_classic = np.concatenate([symptoms[key] for key in ('X', 'y')], axis=1)
data_mix = np.concatenate(
    [betweenness['X'], community_count['X'], community_size['X'], community_count['y']],
    axis=1,
)

In [None]:
# Build a table with one row per compared model and one column per metric
metrics = ['accuracy', 'precision', 'recall', 'AUPRC']

# `names` holds the feature file names at this point (it was reassigned when
# the data was loaded). Rebind it to the labels of the two compared models so
# the table index and the x axis of the plots that follow agree.
names = ['log_reg_classic', 'random_forest_mix']
df = pd.DataFrame(index=pd.Index(names, name='model'), columns=metrics, dtype=float)

# Evaluate the two models loaded for this comparison (`classic` and `mix`),
# each on its own feature matrix, rather than the variables left over from
# the previous section.
df.loc['log_reg_classic', metrics] = compute_metrics(classic, data_classic)
df.loc['random_forest_mix', metrics] = compute_metrics(mix, data_mix)

df

In [None]:
# Take the x labels from the table itself so they always line up with the
# plotted values; rows for which no metric was computed are skipped.
scores = df.dropna(how='all').astype(float)
labels = scores.index.tolist()

# plot the accuracies
fig, ax = plt.subplots()
ax.bar(labels, scores['accuracy'])
ax.set_ylabel('accuracy')
ax.set_title('Accuracy of the models')
plt.show()

# plot precision and recall, using two axis
fig, ax = plt.subplots()
ax.plot(labels, scores['precision'], color='red')
ax.set_ylabel('precision', color='red')
ax.tick_params(axis='y', labelcolor='red')
ax.set_title('Precision and recall of the models')
# second y axis sharing the same x axis, so recall gets its own scale
ax2 = ax.twinx()
ax2.plot(labels, scores['recall'], color='blue')
ax2.set_ylabel('recall', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')
plt.show()

# plot AUPRC
fig, ax = plt.subplots()
ax.bar(labels, scores['AUPRC'])
ax.set_ylabel('AUPRC')
ax.set_title('AUPRC of the models')
plt.show()