# Logistic Regression

## Imports

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics as mt
from sklearn import linear_model as lm
import numpy as np


## Load datasets

In [3]:
x_train = pd.read_csv('./X_training.csv')
x_test = pd.read_csv('./X_test.csv')
x_val = pd.read_csv('./X_validation.csv')
y_train = pd.read_csv('./y_training.csv')
y_test = pd.read_csv('./y_test.csv')
y_val = pd.read_csv('./y_validation.csv')


## Model training

### training dataset

In [13]:
# model

logistic_model = lm.LogisticRegression(solver= 'liblinear')

# fit
logistic_model.fit(x_train, y_train)

#predict
y_pred_train = logistic_model.predict_proba(x_train)[:,1]

# Curva precision x recall para escolhe melhor threshold
precision, recall, thresholds = mt.precision_recall_curve(y_train, y_pred_train)

distances = np.abs(recall-precision) # Em que precision = recall
best_th = thresholds[np.argmin(distances)]
y_pred_train = (y_pred_train >= best_th).astype(int)
#print(f'Melhor threshold para precision = recall de: {best_th:.2f}')
acc = mt.accuracy_score(y_train, y_pred_train)
precision = mt.precision_score(y_train, y_pred_train)
recall = mt.recall_score(y_train, y_pred_train)
f1 = mt.f1_score(y_train, y_pred_train)
print(f'acurácia: {acc:.3f}')
print(f'precision: {precision:.3f}')
print(f'recall: {recall:.3f}')
print(f'f1-score: {f1:.3f}')

  y = column_or_1d(y, warn=True)


acurácia: 0.807
precision: 0.778
recall: 0.778
f1-score: 0.778


### Validation dataset

In [28]:
# model

logistic_model = lm.LogisticRegression(max_iter=5000)

# fit
logistic_model.fit(x_train, y_train)

#predict
y_pred_val = logistic_model.predict_proba(x_val)[:,1]

# Curva precision x recall para escolhe melhor threshold
precision_val, recall_val, thresholds_val = mt.precision_recall_curve(y_val, y_pred_val)

distances_val = np.abs(recall_val-precision_val) # Em que precision = recall
best_th_val = thresholds_val[np.argmin(distances_val)]
y_pred_val = (y_pred_val >= best_th_val).astype(int)
print(f'Melhor threshold para precision = recall de: {best_th_val:.2f}')
acc_val = mt.accuracy_score(y_val, y_pred_val)
precision_val = mt.precision_score(y_val, y_pred_val)
recall_val = mt.recall_score(y_val, y_pred_val)
f1_val = mt.f1_score(y_val, y_pred_val)
print(f'acurácia: {acc_val:.3f}')
print(f'precision: {precision_val:.3f}')
print(f'recall: {recall_val:.3f}')
print(f'f1-score: {f1_val:.3f}')


  y = column_or_1d(y, warn=True)


Melhor threshold para precision = recall de: 0.45
acurácia: 0.872
precision: 0.852
recall: 0.852
f1-score: 0.852


### Test dataset

In [27]:
# model
logistic_model = lm.LogisticRegression(max_iter= 5000)
# fit
logistic_model.fit(np.concatenate((x_train,x_val)),
              np.concatenate((y_train,y_val)))

#predict
y_pred_test = logistic_model.predict_proba(x_test)[:,1]
#y_pred_test = logistic_model.predict(x_test)

# Curva precision x recall para escolhe melhor threshold
precision_test, recall_test, thresholds_test = mt.precision_recall_curve(y_test, y_pred_test)
distances_test = np.abs(recall_test-precision_test) # Em que precision = recall
best_th_test = thresholds_test[np.argmin(distances_test)]
y_pred_test = (y_pred_test >= best_th_test).astype(int)
acc_test = mt.accuracy_score(y_test, y_pred_test)
precision_test = mt.precision_score(y_test, y_pred_test)
recall_test = mt.recall_score(y_test, y_pred_test)
f1_test = mt.f1_score(y_test, y_pred_test)
#print(f'Melhor threshold para precision = recall de: {best_th_test:.2f}')
print(f'acurácia: {acc_test:.3f}')
print(f'precision: {precision_test:.3f}')
print(f'recall: {recall_test:.3f}')
print(f'f1-score: {f1_test:.3f}')

  y = column_or_1d(y, warn=True)


acurácia: 0.869
precision: 0.851
recall: 0.851
f1-score: 0.851


