In [38]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
# Apply seaborn's default theme so every figure drawn below shares one look.
sns.set_theme()

## Cargar dataset

In [None]:
# Load the dataset from disk and preview its first rows.
# If the CSV turns out to be semicolon-delimited, pass sep=';' to read_csv.
df = pd.read_csv('../data/audicion.csv')
df.head(5)

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Count how many rows carry each distinct value of test_result.
df['test_result'].value_counts()

## Análisis Exploratorio de Datos (EDA)

In [None]:
# Bar chart of the number of rows per test_result value, colored by that value.
sns.countplot(
    x='test_result',
    hue='test_result',
    data=df,
)

In [None]:
# Box plots of the DataFrame's columns, each drawn in its own subplot.
df.plot.box(subplots=True, figsize=(12, 4))

In [None]:
# Violin plot of the age distribution for each test_result value.
sns.violinplot(
    x='test_result',
    y='age',
    hue='test_result',
    data=df,
)

In [None]:
# Same age-by-test_result violin plot, but with split=True so each violin is
# drawn as a half instead of a mirrored shape.
sns.violinplot(
    x='test_result',
    y='age',
    hue='test_result',
    split=True,
    data=df,
)

In [None]:
# Scatter plot of age (x axis) against physical_score (y axis).
# Equivalent call passing the Series directly instead of column names:
#   sns.scatterplot(x=df["age"], y=df["physical_score"])
sns.scatterplot(
    x="age",
    y="physical_score",
    data=df,
)

In [None]:
# Scatter plot of age against physical_score, with points colored by test_result.
# A larger figure is created first so the plot is drawn at 12x8 inches.
# Global alternative for the size: sns.set(rc={"figure.figsize": (12, 8)})
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x="age",
    y="physical_score",
    hue='test_result',
    data=df,
)

In [None]:
# Pairwise relationship grid with a regression fit in each panel,
# colored by test_result.
sns.pairplot(
    df,
    kind='reg',
    hue='test_result',
)

In [None]:
# Heat map of the pairwise column correlations, rounded to two decimals
# and annotated with the values.
sns.heatmap(
    data=df.corr().round(decimals=2),
    annot=True,
    cmap='viridis',
)

## Modelado

In [50]:
# Build the feature matrix (every column except test_result) and the
# target vector (the test_result column).
y = df['test_result']
X = df.drop(columns='test_result')

In [51]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

In [52]:
# Fit a logistic-regression classifier on the training split, then predict
# class labels for the held-out test split.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

## Evaluación modelo

In [None]:
# Accuracy: fraction of test samples whose predicted label matches the true one.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the confusion matrix for the test split and show the raw array.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Plot the confusion matrix of the test-split predictions with readable labels.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# Pin the class order explicitly so the matrix rows/columns always line up with
# display_labels below (index 0 -> 'No apto', index 1 -> 'Apto'), even if one
# class happened to be absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No apto', 'Apto'])
disp.plot()
plt.show()

In [None]:
# Report the main binary-classification metrics on the held-out test split.
# accuracy_score is imported here as well so the cell does not depend on an
# earlier cell having been run.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# AUC measures how well the model ranks samples, so it must be computed from
# continuous scores (predicted probability of the positive class, label 1),
# not from the thresholded 0/1 predictions; with hard labels it degenerates
# to balanced accuracy.
y_score = model.predict_proba(X_test)[:, 1]

print("accuracy: ", accuracy_score(y_test, y_pred))
print("precision: ", precision_score(y_test, y_pred))
print("recall (Sensitivity): ", recall_score(y_test, y_pred))
# Specificity is the recall of the negative class, hence pos_label=0.
print("recall (Specificity): ", recall_score(y_test, y_pred, pos_label=0))
print("F1-score: ", f1_score(y_test, y_pred))
print("AUC: ", roc_auc_score(y_test, y_score))

In [None]:
# Plot the ROC curve using scikit-learn's display helper.
from sklearn.metrics import RocCurveDisplay, auc, roc_curve

# The curve is traced by sweeping the decision threshold, so roc_curve needs
# continuous scores (probability of the positive class, label 1). Feeding it
# the hard 0/1 predictions would yield a single operating point joined to the
# corners instead of a real curve.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Named roc_display rather than display so that IPython's built-in display()
# function is not shadowed for the rest of the notebook.
# NOTE(review): recent scikit-learn releases rename `estimator_name` to `name`;
# confirm against the installed version.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Logistic Regression')

roc_display.plot()

In [None]:
# Draw the ROC curve by hand with matplotlib for full control over styling.
from sklearn.metrics import roc_curve

# Use the predicted probability of the positive class (label 1) so the curve
# has one point per threshold; hard 0/1 predictions would collapse it to a
# single operating point.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, linewidth=5)
# Dashed diagonal: the expected performance of a random classifier.
plt.plot([0,1], [0,1], 'k--')
plt.title('ROC curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.show()