In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [6]:
# Load the student records from the CSV file (path is relative to the working directory).
STUDENT_INFO_CSV = "data/student_info.csv"
data = pd.read_csv(STUDENT_INFO_CSV)

## Preprocesamiento

In [7]:
# Data cleaning: remove the student_id, name and lunch_type columns.
columns_to_remove = ["student_id", "name", "lunch_type"]
data = data.drop(columns=columns_to_remove)

In [8]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,gender,age,grade_level,math_score,reading_score,writing_score,attendance_rate,parent_education,study_hours,internet_access,extra_activities,final_result
0,Other,17,10,74,61,90,94.660002,Master's,4.120192,Yes,Yes,Fail
1,Male,17,12,99,70,91,93.173227,Bachelor's,2.886505,No,No,Pass
2,Other,17,9,59,60,99,98.631098,PhD,1.909926,No,No,Fail
3,Other,17,12,70,88,69,96.41962,PhD,1.66474,No,No,Pass
4,Male,15,9,85,77,94,91.332105,PhD,2.330918,Yes,No,Pass


In [9]:
# Convert the object (text) columns to the pandas "category" dtype in a single
# astype call driven by a column -> dtype mapping.
category_dtypes = {
    column: "category"
    for column in ("gender", "parent_education", "internet_access", "extra_activities")
}
data = data.astype(category_dtypes)

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to one-hot encode.
cols_categoricas = ["gender"]

# Create the encoder. sparse_output=False returns a dense array that can be
# wrapped directly in a DataFrame; handle_unknown="ignore" encodes categories
# not seen during fit as all zeros instead of raising.
onehotencoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit the encoder and produce one indicator column per category.
encoded_array = onehotencoder.fit_transform(data[cols_categoricas])

# Wrap the array in a DataFrame named after the generated features.
# Reusing data.index keeps every encoded row attached to the row it came from:
# pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index would
# silently misalign (and introduce NaN rows) whenever data is not indexed 0..n-1.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehotencoder.get_feature_names_out(cols_categoricas),
    index=data.index,
)

# Append the indicator columns to the main frame, column-wise.
data = pd.concat([data, encoded_df], axis=1)

In [11]:
# Drop the columns that have already been transformed.
data = data.drop(columns=cols_categoricas)

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order for the ordinal encoding: each value is encoded as
# its position in this list (0 for the first entry, 3 for the last).
categoricas_educativas = ["High School", "Bachelor's", "Master's", "PhD"]

encoder = OrdinalEncoder(categories=[categoricas_educativas])

# Encode parent_education into a new numeric column.
# index=data.index keeps the encoded rows aligned with their source rows:
# pd.concat(axis=1) joins on index labels, so a default 0..n-1 index would
# produce misaligned / NaN rows whenever data is not indexed 0..n-1.
data_categoria_ordenada = pd.DataFrame(
    encoder.fit_transform(data[["parent_education"]]),
    columns=["parent_education_trans"],
    index=data.index,
)
data = pd.concat([data, data_categoria_ordenada], axis=1)


In [13]:
# Drop the original parent_education column.
data = data.drop(columns=["parent_education"])

In [14]:
# Convert the target (final_result) and the two Yes/No columns
# (internet_access, extra_activities) to numeric 0/1 codes.
binary_encodings = {
    "final_result": {'Fail': 0, 'Pass': 1},
    "internet_access": {"Yes": 1, "No": 0},
    "extra_activities": {"Yes": 1, "No": 0},
}
for column_name, encoding in binary_encodings.items():
    data[column_name] = data[column_name].map(encoding)

In [15]:
# Binary-encode the target (final_result) and the two Yes/No feature columns.
#
# This cell repeats the mapping already applied by the preceding cell.
# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping the already-encoded 0/1 values a second time would replace all three
# columns (including the target) with NaN. Each column is therefore mapped only
# while it still contains the raw string labels, which makes the cell safe to
# run after the preceding one and safe to re-run.
binary_label_maps = {
    "final_result": {'Fail': 0, 'Pass': 1},
    "internet_access": {"Yes": 1, "No": 0},
    "extra_activities": {"Yes": 1, "No": 0},
}
for col_name, label_map in binary_label_maps.items():
    # True only if at least one raw label ("Fail"/"Pass" or "Yes"/"No") remains.
    holds_raw_labels = data[col_name].isin(list(label_map)).any()
    if holds_raw_labels:
        data[col_name] = data[col_name].map(label_map)

## Entrenamiento y evaluacion

In [None]:
# Split the frame into the feature matrix X and the target y.
X = data.drop(columns="final_result")
# Select the target with single brackets so y is a 1-D Series. scikit-learn
# estimators expect y with shape (n_samples,); a one-column DataFrame (double
# brackets) is a column vector and triggers a DataConversionWarning in fit().
y = data["final_result"]

In [None]:
# Split the data set into a training part and a test part.
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.2  # proportion of the rows held out for testing
SPLIT_SEED = 42      # fixed seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Standardize the feature values.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only; the fitted scaler is then
# applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)
print(X_train_scaler)

In [None]:
# Train a logistic regression classifier on the standardized features.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# to_numpy().ravel() flattens the target to shape (n_samples,) whether y_train
# is a Series or a one-column DataFrame, which avoids the DataConversionWarning
# scikit-learn emits for column-vector targets.
model.fit(X_train_scaler, y_train.to_numpy().ravel())
# Predict on the scaled test set: the model was fitted on the scaler's output,
# so passing the raw X_test would apply the learned coefficients to features
# on a different scale and give unreliable predictions.
y_pred = model.predict(X_test_scaler)

In [None]:
# Evaluate model performance (accuracy) on the training and the test split.
from sklearn.metrics import accuracy_score

# Use the standardized matrices: the model was trained on X_train_scaler, so
# predictions made from the unscaled X_train / X_test would not be meaningful.
y_pred_tr = model.predict(X_train_scaler)
y_pred_ts = model.predict(X_test_scaler)

acc_train = accuracy_score(y_train, y_pred_tr)
acc_test = accuracy_score(y_test, y_pred_ts)
print(f"Exactitud de entrenamiento: {acc_train:.4f}")
print(f"Exactitud de prueba: {acc_test:.4f}")