# LIBRERIAS

# Datos
Los datos están [acá](https://archive.ics.uci.edu/dataset/320/student+performance)

# Variables para trabajar

* studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
*  failures - number of past class failures (numeric: n if 1<=n<3, else 4)
* famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
* freetime - free time after school (numeric: from 1 - very low to 5 - very high)
* goout - going out with friends (numeric: from 1 - very low to 5 - very high)
* Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
* Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
* health - current health status (numeric: from 1 - very bad to 5 - very good)
* absences - number of school absences (numeric: from 0 to 93)

### Cargar la base

### Obtener las características y los datos del dataset

# EJERCICIO #1

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from ucimlrepo import fetch_ucirepo

# 1. Cargar los datos
student_performance = fetch_ucirepo(id=320) 
X = student_performance.data.features  # Características
y = student_performance.data.targets   # Variables objetivo (notas)

# 2. Definir columnas categóricas y numéricas
categorical_cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
                    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet','romantic']
numeric_cols = ['studytime', 'failures', 'absences']

# 3. Codificación de variables categóricas
encoder = OneHotEncoder(drop='first', sparse_output=False)  # drop='first' para evitar multicolinealidad
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))

# 4. Normalización de variables numéricas
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols)

# 5. Unir las columnas codificadas y normalizadas con las demás columnas no categóricas
X_preprocessed = pd.concat([X_encoded, X_scaled, X.drop(columns=categorical_cols + numeric_cols)], axis=1)

# Verificar si quedan columnas de tipo object (categóricas no convertidas)
print(X_preprocessed.dtypes)

# 6. Separar las variables objetivo
y1 = y['G1']
y2 = y['G2']
y3 = y['G3']

# 7. Función para entrenar y evaluar regresión lineal
def regresion_lineal(X, y_target):
    X_train, X_test, y_train, y_test = train_test_split(X, y_target, test_size=0.2, random_state=42)
    
    # Crear el modelo de regresión lineal
    modelo = LinearRegression()
    modelo.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = modelo.predict(X_test)
    
    # Métricas de evaluación
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Resultados para {y_target.name}:")
    print(f"MSE: {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")
    print("-" * 30)

# 8. Realizar las regresiones para G1, G2 y G3
regresion_lineal(X_preprocessed, y1)
regresion_lineal(X_preprocessed, y2)
regresion_lineal(X_preprocessed, y3)



school_MS            float64
sex_M                float64
address_U            float64
famsize_LE3          float64
Pstatus_T            float64
Mjob_health          float64
Mjob_other           float64
Mjob_services        float64
Mjob_teacher         float64
Fjob_health          float64
Fjob_other           float64
Fjob_services        float64
Fjob_teacher         float64
reason_home          float64
reason_other         float64
reason_reputation    float64
guardian_mother      float64
guardian_other       float64
schoolsup_yes        float64
famsup_yes           float64
paid_yes             float64
activities_yes       float64
nursery_yes          float64
higher_yes           float64
internet_yes         float64
romantic_yes         float64
studytime            float64
failures             float64
absences             float64
age                    int64
Medu                   int64
Fedu                   int64
traveltime             int64
famrel                 int64
freetime      

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from ucimlrepo import fetch_ucirepo

# 1. Cargar los datos
student_performance = fetch_ucirepo(id=320)
X = student_performance.data.features  # Características
y = student_performance.data.targets   # Variables objetivo (notas)

# 2. Definir columnas categóricas y numéricas
categorical_cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
                    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
numeric_cols = ['studytime', 'failures', 'absences']

# 3. Codificación de variables categóricas
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))

# 4. Normalización de variables numéricas
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols)

# 5. Unir las columnas codificadas y normalizadas
X_preprocessed = pd.concat([X_encoded, X_scaled], axis=1)

# 6. Separar las variables objetivo
y1 = y['G1']
y2 = y['G2']
y3 = y['G3']

# 7. Función para entrenar y evaluar regresión lineal con Ridge
def regresion_lineal(X, y_target, alpha=1.0):
    X_train, X_test, y_train, y_test = train_test_split(X, y_target, test_size=0.2, random_state=42)
    
    # Modelo de Ridge
    model = Ridge(alpha=alpha)
    
    # Entrenar el modelo
    model.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = model.predict(X_test)
    
    # Métricas de evaluación
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Resultados para {y_target.name}:")
    print(f"MSE: {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")
    print("-" * 30)

# 8. Realizar las regresiones para G1, G2 y G3
regresion_lineal(X_preprocessed, y1, alpha=1.0)
regresion_lineal(X_preprocessed, y2, alpha=1.0)
regresion_lineal(X_preprocessed, y3, alpha=1.0)




Resultados para G1:
MSE: 7.03
R2 Score: 0.18
------------------------------
Resultados para G2:
MSE: 7.67
R2 Score: 0.15
------------------------------
Resultados para G3:
MSE: 8.11
R2 Score: 0.17
------------------------------


In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from ucimlrepo import fetch_ucirepo

# 1. Cargar los datos
student_performance = fetch_ucirepo(id=320)
X = student_performance.data.features  # Características
y = student_performance.data.targets   # Variables objetivo (notas)

# 2. Definir columnas categóricas y numéricas
categorical_cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
                    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
numeric_cols = ['studytime', 'failures', 'absences']

# 3. Codificación de variables categóricas
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))

# 4. Normalización de variables numéricas
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols)

# 5. Unir las columnas codificadas y normalizadas
X_preprocessed = pd.concat([X_encoded, X_scaled], axis=1)

# 6. Separar las variables objetivo
y1 = y['G1']
y2 = y['G2']
y3 = y['G3']

# 7. Función para entrenar y evaluar el modelo de Random Forest
def random_forest_regressor(X, y_target):
    X_train, X_test, y_train, y_test = train_test_split(X, y_target, test_size=0.2, random_state=42)
    
    # Crear el modelo de Random Forest
    model = RandomForestRegressor(n_estimators=100, random_state=42)  # Puedes ajustar n_estimators según sea necesario
    
    # Entrenar el modelo
    model.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = model.predict(X_test)
    
    # Métricas de evaluación
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Resultados para {y_target.name}:")
    print(f"MSE: {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")
    print("-" * 30)

# 8. Realizar las regresiones para G1, G2 y G3
random_forest_regressor(X_preprocessed, y1)
random_forest_regressor(X_preprocessed, y2)
random_forest_regressor(X_preprocessed, y3)


Resultados para G1:
MSE: 6.99
R2 Score: 0.19
------------------------------
Resultados para G2:
MSE: 7.36
R2 Score: 0.18
------------------------------
Resultados para G3:
MSE: 8.50
R2 Score: 0.13
------------------------------


In [32]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


student_performance = fetch_ucirepo(id=320)


X = student_performance.data.features
y = student_performance.data.targets


X_dummies = pd.get_dummies(X, drop_first=True)


G1 = y['G1']
G2 = y['G2']
G3 = y['G3']

def fit_and_evaluate_model(target_variable, feature_data):
    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_variable, test_size=0.3, random_state=42)
    
    model = Lasso(alpha=0.1) 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return model, mse, r2

# Store results
results = {}
for g, target in zip(['G1', 'G2', 'G3'], [G1, G2, G3]):
    model, mse, r2 = fit_and_evaluate_model(target, X_dummies)
    results[g] = {'model': model, 'MSE': mse, 'R²': r2}
    print(f"{g} Model - MSE: {mse:.2f}, R²: {r2:.2f}")

# Optionally, try a Random Forest model as well
def fit_random_forest(target_variable, feature_data):
    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_variable, test_size=0.3, random_state=42)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return model, mse, r2

for g, target in zip(['G1', 'G2', 'G3'], [G1, G2, G3]):
    model, mse, r2 = fit_random_forest(target, X_dummies)
    results[g] = {'model': model, 'MSE': mse, 'R²': r2}
    print(f"{g} Random Forest Model - MSE: {mse:.2f}, R²: {r2:.2f}")



G1 Model - MSE: 6.41, R²: 0.23
G2 Model - MSE: 7.79, R²: 0.21
G3 Model - MSE: 8.32, R²: 0.25
G1 Random Forest Model - MSE: 6.68, R²: 0.20
G2 Random Forest Model - MSE: 7.75, R²: 0.21
G3 Random Forest Model - MSE: 8.24, R²: 0.26


In [33]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the dataset
student_performance = fetch_ucirepo(id=320)

# Prepare the features and targets
X = student_performance.data.features
y = student_performance.data.targets

# Create dummy variables
X_dummies = pd.get_dummies(X, drop_first=True)

# Define the target variables
G1 = y['G1']
G2 = y['G2']
G3 = y['G3']

# Calculate the correlation matrix
correlation_matrix = pd.concat([X_dummies, y], axis=1).corr()

# Print the correlation matrix as text
print("Matriz de Correlación:")
print(correlation_matrix.to_string())

def fit_and_evaluate_model(target_variable, feature_data):
    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_variable, test_size=0.3, random_state=42)
    
    model = Lasso(alpha=0.1) 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return model, mse, r2

# Store results for Lasso model
results = {}
for g, target in zip(['G1', 'G2', 'G3'], [G1, G2, G3]):
    model, mse, r2 = fit_and_evaluate_model(target, X_dummies)
    results[g] = {'model': model, 'MSE': mse, 'R²': r2}
    print(f"{g} Model - MSE: {mse:.2f}, R²: {r2:.2f}")

# Optionally, try a Random Forest model as well
def fit_random_forest(target_variable, feature_data):
    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_variable, test_size=0.3, random_state=42)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return model, mse, r2

for g, target in zip(['G1', 'G2', 'G3'], [G1, G2, G3]):
    model, mse, r2 = fit_random_forest(target, X_dummies)
    results[g] = {'model': model, 'MSE': mse, 'R²': r2}
    print(f"{g} Random Forest Model - MSE: {mse:.2f}, R²: {r2:.2f}")



Matriz de Correlación:
                        age      Medu      Fedu  traveltime  studytime  failures    famrel  freetime     goout      Dalc      Walc    health  absences  school_MS     sex_M  address_U  famsize_LE3  Pstatus_T  Mjob_health  Mjob_other  Mjob_services  Mjob_teacher  Fjob_health  Fjob_other  Fjob_services  Fjob_teacher  reason_home  reason_other  reason_reputation  guardian_mother  guardian_other  schoolsup_yes  famsup_yes  paid_yes  activities_yes  nursery_yes  higher_yes  internet_yes  romantic_yes        G1        G2        G3
age                1.000000 -0.107832 -0.121050    0.034490  -0.008415  0.319968 -0.020559 -0.004910  0.112805  0.134768  0.086357 -0.008750  0.149998   0.087170 -0.043662  -0.025848    -0.002470  -0.005631    -0.100237    0.038776      -0.034880     -0.046692    -0.103504    0.058406      -0.024570     -0.054154    -0.014716     -0.006385          -0.016565        -0.048726        0.330353      -0.167841   -0.101894 -0.005458       -0.054279 

In [40]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Cargar el dataset de estudiantes
student_performance = fetch_ucirepo(id=320)

# Obtener características y objetivos
X = student_performance.data.features
y = student_performance.data.targets

# Convertir variables categóricas en variables dummy
X_dummies = pd.get_dummies(X, drop_first=True)

# Definir las variables objetivo
G1 = y['G1']
G2 = y['G2']
G3 = y['G3']
print(X_dummies.columns)
# Eliminar columnas irrelevantes
columns_to_drop = ['school_MS', 'sex_M', 'address', 'famsize', 'Pstatus', 'reason', 
                   'guardian', 'school_sup', 'famsup', 'paid', 'activities', 
                   'nursery', 'higher', 'internet', 'romantic']

X_dummies_cleaned = X_dummies.drop(columns=columns_to_drop, errors='ignore')

display(X_dummies_cleaned)
# Almacenar resultados
results = {}

# Entrenar y evaluar el modelo Lasso
for g, target in zip(['G1', 'G2', 'G3'], [G1, G2, G3]):
    X_train, X_test, y_train, y_test = train_test_split(X_dummies_cleaned, target, test_size=0.3, random_state=42)
    
    model = Lasso(alpha=0.1) 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[g] = {'model': model, 'MSE': mse, 'R²': r2}
    print(f"{g} Model - MSE: {mse:.2f}, R²: {r2:.2f}")

# Entrenar y evaluar el modelo Random Forest
for g, target in zip(['G1', 'G2', 'G3'], [G1, G2, G3]):
    X_train, X_test, y_train, y_test = train_test_split(X_dummies_cleaned, target, test_size=0.3, random_state=42)
    
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[g] = {'model': model, 'MSE': mse, 'R²': r2}
    print(f"{g} Random Forest Model - MSE: {mse:.2f}, R²: {r2:.2f}")



Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'school_MS',
       'sex_M', 'address_U', 'famsize_LE3', 'Pstatus_T', 'Mjob_health',
       'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_home',
       'reason_other', 'reason_reputation', 'guardian_mother',
       'guardian_other', 'schoolsup_yes', 'famsup_yes', 'paid_yes',
       'activities_yes', 'nursery_yes', 'higher_yes', 'internet_yes',
       'romantic_yes'],
      dtype='object')


Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,True,False,True,False,False,False,True,True,False,False
1,17,1,1,1,2,0,5,3,3,1,...,False,False,False,True,False,False,False,True,True,False
2,15,1,1,1,2,0,4,3,2,2,...,True,False,True,False,False,False,True,True,True,False
3,15,4,2,1,3,0,3,2,2,1,...,True,False,False,True,False,True,True,True,True,True
4,16,3,3,1,2,0,4,3,2,1,...,False,False,False,True,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,19,2,3,1,3,1,5,4,2,1,...,True,False,False,False,False,True,False,True,True,False
645,18,3,1,1,2,0,4,3,4,1,...,True,False,False,True,False,False,True,True,True,False
646,18,1,1,2,2,0,1,1,1,1,...,True,False,False,False,False,True,True,True,False,False
647,17,3,1,2,1,0,2,4,5,3,...,True,False,False,False,False,False,False,True,True,False


G1 Model - MSE: 6.61, R²: 0.21
G2 Model - MSE: 7.99, R²: 0.19
G3 Model - MSE: 8.46, R²: 0.23
G1 Random Forest Model - MSE: 6.70, R²: 0.20
G2 Random Forest Model - MSE: 7.68, R²: 0.22
G3 Random Forest Model - MSE: 8.74, R²: 0.21


In [None]:
all_combinations = []
for i in range(7, len(X_train.columns) + 1):
    all_combinations += list(combinations(X_train.columns, i))

model_results = {}
for combination in all_combinations:
    combination_name = ', '.join(combination)
    X_train_subset = X_train[list(combination)]
    X_test_subset = X_test[list(combination)]
    model_results[combination_name] = {}
    for model_name, model in models.items():
        reg_pipeline = Pipeline([
            ("numerical_imputer", SimpleImputer(strategy='mean')),
            ("model", model)
        ])
        reg_pipeline.fit(X_train_subset, y_train)
        y_pred = reg_pipeline.predict(X_test_subset)
        test_score = r2_score(y_test, y_pred)
        model_results[combination_name][model_name] = test_score
results = pd.DataFrame(model_results)
results.index.name = 'Modelo'
results.columns.name = 'Combinación de Características'
results = results.stack().reset_index()
results.columns = ['Modelo', 'Combinación de Características', 'Puntaje R^2']
results.sort_values('Puntaje R^2', ascending=False).head(15)