In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras import layers, models, callbacks

In [None]:
# DATASET LOADING AND EXPLORATION
df = pd.read_csv("student_exam_scores.csv")

print("Shape del dataset:", df.shape)
print("Columnas:", list(df.columns))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Missing-value count per column.
print(df.isnull().sum())

# The median exam score is used as the pass/fail cut-off; splitting at the
# median keeps the two classes roughly balanced.
mediana_score = df['exam_score'].median()
print(f"\nMediana de exam_score: {mediana_score:.2f}")

# Binary target: 1 when the score is at or above the median, 0 otherwise.
df['aprobado'] = (df['exam_score'] >= mediana_score).astype(int)

print(df['aprobado'].value_counts())
print(f"Porcentaje de aprobados: {df['aprobado'].mean()*100:.1f}%")



Shape del dataset: (200, 6)
Columnas: ['student_id', 'hours_studied', 'sleep_hours', 'attendance_percent', 'previous_scores', 'exam_score']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   student_id          200 non-null    object 
 1   hours_studied       200 non-null    float64
 2   sleep_hours         200 non-null    float64
 3   attendance_percent  200 non-null    float64
 4   previous_scores     200 non-null    int64  
 5   exam_score          200 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 9.5+ KB
None
student_id            0
hours_studied         0
sleep_hours           0
attendance_percent    0
previous_scores       0
exam_score            0
dtype: int64

Mediana de exam_score: 34.05
aprobado
0    100
1    100
Name: count, dtype: int64
Porcentaje de aprobados: 50.0%


In [None]:
# DATA PREPARATION

# Feature matrix (the four numeric predictors) and the binary target vector.
feature_columns = ['hours_studied', 'sleep_hours', 'attendance_percent', 'previous_scores']
X = df[feature_columns]
y = df['aprobado'].to_numpy()

print(f"\nShape de X: {X.shape}")
print(f"Shape de y: {y.shape}")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions so no test information leaks
# into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nX_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")




Shape de X: (200, 4)
Shape de y: (200,)

X_train shape: (160, 4)
X_test shape: (40, 4)
y_train shape: (160,)
y_test shape: (40,)

X_train_scaled shape: (160, 4)
X_test_scaled shape: (40, 4)


In [None]:
# MODEL CONSTRUCTION
def build_student_model(input_dim: int) -> tf.keras.Model:
    """Build and compile the binary pass/fail classifier.

    The network is a small feed-forward stack:
    Dense(16, relu) -> Dropout(0.3) -> Dense(8, relu) -> Dropout(0.3)
    -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Sequential model compiled with the Adam optimizer, binary
        cross-entropy loss, and the accuracy and AUC (named "auc") metrics.
    """
    network = models.Sequential()
    network.add(layers.Input(shape=(input_dim,)))

    # 16 units: picked relative to the number of inputs using the
    # powers-of-two rule of thumb; kept small to avoid overfitting.
    network.add(layers.Dense(16, activation="relu", name="hidden_layer_1"))
    network.add(layers.Dropout(0.3, name="dropout_1"))

    # 8 further units in the second hidden layer, only to refine the
    # representation.
    network.add(layers.Dense(8, activation="relu", name="hidden_layer_2"))
    # 30% dropout keeps the model from memorizing the dataset; this rate was
    # chosen because the dataset is small (200 rows).
    network.add(layers.Dropout(0.3, name="dropout_2"))

    # Single sigmoid unit: outputs the predicted probability of the positive class.
    network.add(layers.Dense(1, activation="sigmoid", name="output_layer"))

    tracked_metrics = ["accuracy", tf.keras.metrics.AUC(name="auc")]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )

    return network

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so weight initialization, dropout masks and batch shuffling
# repeat across "Restart & Run All" runs (same value as the random_state
# passed to train_test_split). Bit-exact results on GPU may additionally
# require deterministic ops -- not enabled here.
tf.keras.utils.set_random_seed(42)

model = build_student_model(X_train_scaled.shape[1])
model.summary()

In [None]:
# Training callbacks. All three monitor the validation AUC, which must be
# MAXIMIZED, so every callback states mode="max" explicitly.
callbacks_list = [
    # Stop when val_auc has not improved for 15 epochs and roll the model
    # back to the weights of the best epoch.
    callbacks.EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=15,
        restore_best_weights=True,
        verbose=1
    ),
    # Persist the best model (by val_auc) seen so far to disk.
    callbacks.ModelCheckpoint(
        filepath="best_student_model.keras",
        monitor="val_auc",
        mode="max",
        save_best_only=True,
        verbose=1
    ),
    # Halve the learning rate after 8 epochs without val_auc improvement.
    # mode="max" is required here: in the default "auto" mode this callback
    # only treats metric names containing "acc" as higher-is-better, so
    # "val_auc" would be handled as a quantity to minimize and the learning
    # rate would be cut whenever the AUC fails to *decrease*.
    callbacks.ReduceLROnPlateau(
        monitor="val_auc",
        mode="max",
        factor=0.5,
        patience=8,
        min_lr=1e-6,
        verbose=1
    )
]

In [None]:
# Train the model. validation_split=0.2 holds out the final 20% of the
# training rows as the validation set whose metrics the callbacks monitor.
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    batch_size=16,
    epochs=100,
    callbacks=callbacks_list,
    verbose=1
)

# evaluate() returns the loss followed by the compiled metrics in the order
# they were declared (accuracy, auc).
test_loss, test_accuracy, test_auc = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Pérdida en prueba: {test_loss:.4f}")
print(f"Precisión en prueba: {test_accuracy:.4f}")
print(f"AUC en prueba: {test_auc:.4f}")

# predict() returns an (n_samples, 1) column. Flatten it so the probabilities
# and the hard labels are 1-D like y_test; otherwise an element-wise
# comparison such as `y_pred == y_test` would silently broadcast to an
# (n, n) matrix.
y_pred_prob = model.predict(X_test_scaled, verbose=0).ravel()
# Classify as positive when the predicted probability exceeds 0.5.
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred, target_names=['Reprobado', 'Aprobado']))

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# AUC recomputed by scikit-learn from the raw probabilities; it can differ
# slightly from the Keras "auc" metric above, which is a thresholded
# approximation.
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"\nAUC: {roc_auc:.4f}")


Epoch 1/100
[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m14s[0m 2s/step - accuracy: 0.5000 - auc: 0.4417 - loss: 0.6992
Epoch 1: val_auc improved from -inf to 0.39792, saving model to best_student_model.keras
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 90ms/step - accuracy: 0.4855 - auc: 0.4971 - loss: 0.6980 - val_accuracy: 0.3125 - val_auc: 0.3979 - val_loss: 0.7141 - learning_rate: 0.0010
Epoch 2/100
[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 23ms/step - accuracy: 0.7500 - auc: 0.7143 - loss: 0.6127
Epoch 2: val_auc improved from 0.39792 to 0.40000, saving model to best_student_model.keras
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5798 - auc: 0.5816 - loss: 0.6708 - val_accuracy: 0.3750 - val_auc: 0.4000 - val_loss: 0.7080 - learning_rate: 0.0010
Epoch 3/100
[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 181ms/step - accuracy: 0.4375 - auc: 0.2812 - loss: 0.7462
Epoch 3: val_au