# Script: GMM_ML
<h4><span style="color:blue">Juan José Hoyos Urcué</span></h4>

### Load Python Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import MinMaxScaler

# Synthetic dataset; used further down as the training set.
fake_name = "fake_gmm.xlsx"
fake = pd.read_excel(fake_name)

# Original dataset; used further down as the test set.
# NOTE(review): the path suggests this is the preprocessed data without augmentation — confirm.
original = pd.read_excel("../../without_data_augmentation/preprocesado.xlsx")

### Synthetic dataset shape

In [2]:
# Report the dimensions of the synthetic dataset; `rows` and `cols` remain
# bound for any later cell that wants them.
rows, cols = fake.shape
print("Fake generated dataset has {} rows and {} columns.".format(*fake.shape))

Fake generated dataset has 30 rows and 10 columns.


### Synthetic Dataset - Target Variable Distribution

In [3]:
# Class counts of the target column in the synthetic dataset.
fake["cure_or_fail"].value_counts()
# 0 -> cure
# 1 -> fail

0    16
1    14
Name: cure_or_fail, dtype: int64

In [4]:
# Train on the synthetic dataset, evaluate on the original one.
# The target column is dropped by keyword: passing the axis positionally
# (`drop('cure_or_fail', 1)`) is rejected by pandas >= 2.0.
y_train = fake["cure_or_fail"]
X_train = fake.drop(columns="cure_or_fail")

y_test = original["cure_or_fail"]
X_test = original.drop(columns="cure_or_fail")

In [5]:
# Machine Learning Libraries

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Evaluation libraries

from sklearn import metrics
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [6]:

# One [model name, precision, recall, F1] row is appended per build_model call.
metricas = []


def build_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Precision, recall and F1 are computed from the test-set predictions,
    appended to the module-level ``metricas`` list as
    ``[class name, precision, recall, f1]`` and then printed. Every call adds
    a new row, so re-running a model cell records that model again.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place (no copy is made).
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features passed to ``predict`` and the labels the predictions are
        scored against.

    Returns
    -------
    tuple
        ``(predictions, fitted_model)``, where ``fitted_model`` is the same
        object that was passed in as ``model``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Compute every score before touching `metricas`, so a failing metric
    # leaves no partial row behind.
    scores = [
        precision_score(y_test, predictions),
        recall_score(y_test, predictions),
        f1_score(y_test, predictions),
    ]
    metricas.append([type(model).__name__, *scores])

    for label, value in zip(("Precision: ", "Recall:    ", "F1score:   "), scores):
        print(label, value)

    return predictions, model

### Logistic Regression

In [7]:
# L1-penalised logistic regression trained on the synthetic data.
# build_model returns a (predictions, fitted model) pair, so log_reg_pred
# holds that tuple rather than a bare prediction array.
log_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=5,
    fit_intercept=True,
    max_iter=50,
    tol=0.001,
)
log_reg_pred = build_model(log_reg, X_train, y_train, X_test, y_test)

Precision:  0.5
Recall:     0.5714285714285714
F1score:    0.5333333333333333


### k-nearest neighbors (k=2)

In [8]:
# k-NN with two neighbours and distance-weighted votes.
# build_model returns a (predictions, fitted model) pair, which is what
# knn_pred ends up holding.
knn = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
    algorithm='ball_tree',
)
knn_pred = build_model(knn, X_train, y_train, X_test, y_test)

Precision:  0.5
Recall:     0.5714285714285714
F1score:    0.5333333333333333


### Random Forest

In [9]:
# Random forest trained on the synthetic data.
# random_state fixes the bootstrap samples and per-split feature draws so that
# Restart & Run All yields the same forest (and the same scores) every time;
# without it the reported metrics change from run to run.
rf = RandomForestClassifier(criterion='gini', max_depth=31, max_features='sqrt',
                            max_leaf_nodes=11, min_samples_leaf=3,
                            min_samples_split=6, random_state=42)

# build_model returns a (predictions, fitted model) pair; rf_pred holds that tuple.
rf_pred = build_model(rf, X_train, y_train, X_test, y_test)

Precision:  0.375
Recall:     0.42857142857142855
F1score:    0.39999999999999997


### SVM (Support Vector Machines)

In [10]:
# Linear-kernel support vector classifier trained on the synthetic data.
# NOTE(review): gamma is documented as the coefficient for the 'rbf', 'poly'
# and 'sigmoid' kernels, so with kernel='linear' it presumably has no effect —
# confirm before tuning it.
svm = SVC(
    kernel='linear',
    C=10,
    gamma=0.001,
)
# build_model returns a (predictions, fitted model) pair; svm_pred holds that tuple.
svm_pred = build_model(svm, X_train, y_train, X_test, y_test)

Precision:  0.42857142857142855
Recall:     0.42857142857142855
F1score:    0.42857142857142855


### MLP (Multilayer Perceptron)

In [11]:
# Three hidden layers of nine tanh units, trained with Adam.
# random_state fixes the weight initialisation and mini-batch sampling so the
# network (and its scores) are reproducible across Restart & Run All; without
# it every run trains a different model.
mlp = MLPClassifier(activation='tanh', alpha=0.0001,
                    hidden_layer_sizes=(9, 9, 9),
                    learning_rate='constant', solver='adam', max_iter=500,
                    random_state=42)

# build_model returns a (predictions, fitted model) pair; mlp_pred holds that tuple.
mlp_pred = build_model(mlp, X_train, y_train, X_test, y_test)

Precision:  0.42857142857142855
Recall:     0.42857142857142855
F1score:    0.42857142857142855


### Save Results

In [12]:
# Build the summary table in a single step from the rows collected in
# `metricas` (each row is [model name, precision, recall, F1]).
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration; constructing it once avoids both problems.
result_table = pd.DataFrame(metricas, columns=['Model', 'Precision', 'Recall', 'F1Score'])

# Persist the metrics; the ../results directory must already exist.
result_table.to_excel("../results/fake_gmm.xlsx", index=False)