In [1]:
import pandas as pd 
from joblib import load, dump

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, f1_score, recall_score, matthews_corrcoef 

In [4]:
def get_metrics(y_true=None, y_predict=None, name_model=""):
    row =[
        name_model,
        precision_score(y_true=y_true, y_pred=y_predict),
        accuracy_score(y_true=y_true, y_pred=y_predict),
        recall_score(y_true=y_true, y_pred=y_predict),
        f1_score(y_true=y_true, y_pred=y_predict),
        matthews_corrcoef(y_true=y_true, y_pred=y_predict),
    ]
    return row

In [5]:
df_train = pd.read_csv("../../results/4_train_dataset.csv")
df_test_mix = pd.read_csv("../../results/4_testing_mix.csv")
df_test_neg = pd.read_csv("../../results/4_testing_negatives.csv")

In [6]:
scaler = load("../../results/scaler_process.joblib")

In [7]:
response = df_train['Outcome'].values
df_values = df_train.drop(columns=['Outcome']).values

In [8]:
X_train, X_val, y_train, y_val = train_test_split(df_values, response, test_size=.2, random_state=42)

In [9]:
knn_class = KNeighborsClassifier()
knn_class.fit(X_train, y_train)

In [10]:
dt_class = DecisionTreeClassifier()
dt_class.fit(X_train, y_train)

In [11]:
rf_class = RandomForestClassifier()
rf_class.fit(X_train, y_train)

In [12]:
svc_class = SVC()
svc_class.fit(X_train, y_train)

In [13]:
gnv_class = GaussianNB()
gnv_class.fit(X_train, y_train)

In [14]:
sgd_class = SGDClassifier()
sgd_class.fit(X_train, y_train)

In [15]:
knn_predict_val = knn_class.predict(X_val)
dt_predict_val = dt_class.predict(X_val)
rf_predict_val = rf_class.predict(X_val)
svc_predict_val = svc_class.predict(X_val)
gnv_predict_val = gnv_class.predict(X_val)
sgd_predict_val = sgd_class.predict(X_val)


In [16]:
matrix_data = [
    get_metrics(y_true=y_val, y_predict=knn_predict_val, name_model="KNN"),
    get_metrics(y_true=y_val, y_predict=dt_predict_val, name_model="DT"),
    get_metrics(y_true=y_val, y_predict=rf_predict_val, name_model="RF"),
    get_metrics(y_true=y_val, y_predict=svc_predict_val, name_model="SVC"),
    get_metrics(y_true=y_val, y_predict=gnv_predict_val, name_model="GNV"),
    get_metrics(y_true=y_val, y_predict=sgd_predict_val, name_model="SGD")
]
df_performance_val = pd.DataFrame(data = matrix_data, columns=['name_model', 'precision_score', 'accuracy_score', 'recall_score', 'f1_score', 'matthews_corrcoef'])
df_performance_val

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef
0,KNN,0.710526,0.742857,0.794118,0.75,0.490167
1,DT,0.617647,0.628571,0.617647,0.617647,0.256536
2,RF,0.764706,0.771429,0.764706,0.764706,0.542484
3,SVC,0.702703,0.728571,0.764706,0.732394,0.459714
4,GNV,0.757576,0.757143,0.735294,0.746269,0.513702
5,SGD,0.571429,0.628571,0.941176,0.711111,0.342997


In [17]:
df_test_mix['Outcome'].value_counts()


Outcome
1    25
0    14
Name: count, dtype: int64

In [18]:
response_test_mix = df_test_mix["Outcome"]
df_test_mix_to_standarize = df_test_mix[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]


In [21]:
scaler_x_mix = scaler.transform(df_test_mix_to_standarize.values)
df_scaler_mix = pd.DataFrame(data = scaler_x_mix, columns= df_test_mix_to_standarize.columns)
x_test_mix = df_scaler_mix
x_test_mix.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.0,0.51269,0.615385,0.346939,0.40714,0.42,0.211587,0.318182
1,0.384615,0.395939,0.461538,0.419111,0.40714,0.674,0.549118,0.378788
2,0.692308,0.771574,0.75,0.693878,0.872449,0.684,0.74979,0.5
3,0.230769,0.563452,0.596154,0.419111,0.40714,0.452,0.119228,0.318182
4,0.0,0.604061,0.664476,0.419111,0.40714,0.648,0.118388,0.363636


In [23]:
knn_predict_test_mix = knn_class.predict(x_test_mix.values)
dt_predict_test_mix= dt_class.predict(x_test_mix.values)
rf_predict_test_mix= rf_class.predict(x_test_mix.values)
svc_predict_test_mix = svc_class.predict(x_test_mix.values)
gnv_predict_test_mix = gnv_class.predict(x_test_mix.values)
sgd_predict_test_mix = sgd_class.predict(x_test_mix.values)

In [24]:
matrix_data1 = [
    get_metrics(y_true=response_test_mix, y_predict=knn_predict_test_mix, name_model="KNN"),
    get_metrics(y_true=response_test_mix, y_predict=dt_predict_test_mix, name_model="DT"),
    get_metrics(y_true=response_test_mix, y_predict=rf_predict_test_mix,name_model="RF"),
    get_metrics(y_true=response_test_mix, y_predict=svc_predict_test_mix, name_model="SVC"),
    get_metrics(y_true=response_test_mix, y_predict=gnv_predict_test_mix, name_model="GNV"),
    get_metrics(y_true=response_test_mix, y_predict=sgd_predict_test_mix, name_model="SGD")
]
df_performance_test_mix = pd.DataFrame(data = matrix_data1, columns=['name_model', 'precision_score', 'accuracy_score', 'recall_score', 'f1_score', 'matthews_corrcoef'])
df_performance_test_mix

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef
0,KNN,0.0,0.358974,0.0,0.0,0.0
1,DT,0.641026,0.641026,1.0,0.78125,0.0
2,RF,0.0,0.358974,0.0,0.0,0.0
3,SVC,0.0,0.358974,0.0,0.0,0.0
4,GNV,0.0,0.358974,0.0,0.0,0.0
5,SGD,0.736842,0.589744,0.56,0.636364,0.194685


In [25]:
response_negative = df_test_neg["Outcome"]
df_negative_to_standarize = df_test_neg[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]

scaler_x_neg = scaler.transform(df_negative_to_standarize.values)

df_scaler_neg = pd.DataFrame(data = scaler_x_neg, columns= df_negative_to_standarize.columns)
x_test_neg = df_scaler_neg
x_test_neg.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.384615,0.588832,0.711538,0.419111,0.40714,0.512,0.168766,0.454545
1,0.307692,0.558376,0.634615,0.419111,0.40714,0.638,0.395466,0.439394
2,0.076923,0.598985,0.557692,0.734694,0.479592,0.666,0.219144,0.348485
3,0.230769,0.451777,0.711538,0.326531,0.433673,0.608,0.462636,0.575758
4,0.0,0.593909,0.769231,0.632653,0.270408,0.904,0.074727,0.363636


In [27]:
knn_predict_negative = knn_class.predict(x_test_neg.values)
dt_predict_negative= dt_class.predict(x_test_neg.values)
rf_predict_negative= rf_class.predict(x_test_neg.values)
svc_predict_negative = svc_class.predict(x_test_neg.values)
gnv_predict_negative = gnv_class.predict(x_test_neg.values)
sgd_predict_negative = sgd_class.predict(x_test_neg.values)

matrix_data1 = [
    get_metrics(y_true=response_negative, y_predict=knn_predict_negative, name_model="KNN"),
    get_metrics(y_true=response_negative, y_predict=dt_predict_negative, name_model="DT"),
    get_metrics(y_true=response_negative, y_predict=rf_predict_negative,name_model="RF"),
    get_metrics(y_true=response_negative, y_predict=svc_predict_negative, name_model="SVC"),
    get_metrics(y_true=response_negative, y_predict=gnv_predict_negative, name_model="GNV"),
    get_metrics(y_true=response_negative, y_predict=sgd_predict_negative, name_model="SGD")
]
df_performance_negative = pd.DataFrame(data = matrix_data1, columns=['name_model', 'precision_score', 'accuracy_score', 'recall_score', 'f1_score', 'matthews_corrcoef'])
df_performance_negative

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef
0,KNN,0.0,1.0,0.0,0.0,0.0
1,DT,0.0,0.0,0.0,0.0,0.0
2,RF,0.0,1.0,0.0,0.0,0.0
3,SVC,0.0,1.0,0.0,0.0,0.0
4,GNV,0.0,1.0,0.0,0.0,0.0
5,SGD,0.0,0.755365,0.0,0.0,0.0


In [28]:
df_performance_negative['status'] = 'Testing negative'
df_performance_test_mix['status'] = 'Testing mix'
df_performance_val['status'] = 'validacion'

df_perfomances = pd.concat([df_performance_negative, df_performance_test_mix, df_performance_val], axis=0)
df_perfomances

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef,status
0,KNN,0.0,1.0,0.0,0.0,0.0,Testing negative
1,DT,0.0,0.0,0.0,0.0,0.0,Testing negative
2,RF,0.0,1.0,0.0,0.0,0.0,Testing negative
3,SVC,0.0,1.0,0.0,0.0,0.0,Testing negative
4,GNV,0.0,1.0,0.0,0.0,0.0,Testing negative
5,SGD,0.0,0.755365,0.0,0.0,0.0,Testing negative
0,KNN,0.0,0.358974,0.0,0.0,0.0,Testing mix
1,DT,0.641026,0.641026,1.0,0.78125,0.0,Testing mix
2,RF,0.0,0.358974,0.0,0.0,0.0,Testing mix
3,SVC,0.0,0.358974,0.0,0.0,0.0,Testing mix


In [29]:
df_perfomances.sort_values(by='matthews_corrcoef', ascending=False)

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef,status
2,RF,0.764706,0.771429,0.764706,0.764706,0.542484,validacion
4,GNV,0.757576,0.757143,0.735294,0.746269,0.513702,validacion
0,KNN,0.710526,0.742857,0.794118,0.75,0.490167,validacion
3,SVC,0.702703,0.728571,0.764706,0.732394,0.459714,validacion
5,SGD,0.571429,0.628571,0.941176,0.711111,0.342997,validacion
1,DT,0.617647,0.628571,0.617647,0.617647,0.256536,validacion
5,SGD,0.736842,0.589744,0.56,0.636364,0.194685,Testing mix
0,KNN,0.0,1.0,0.0,0.0,0.0,Testing negative
1,DT,0.0,0.0,0.0,0.0,0.0,Testing negative
2,RF,0.0,1.0,0.0,0.0,0.0,Testing negative
