# DAMO510 Module 05: In-Class Activity
This notebook follows the steps outlined in the instructions for working with categorical variables and predictive modeling.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

In [None]:
# Read the semicolon-separated CSV into a DataFrame and preview it.
csv_path = '/mnt/data/student-por.csv'
df_a = pd.read_csv(csv_path, sep=';')
df_a.head()

In [None]:
# Clean a copy of the raw frame (df_a itself is left untouched):
# first remove duplicated rows, then any row with a missing value.
df_b = (
    df_a.copy()
    .drop_duplicates()
    .dropna()
)
# Show (rows, columns) remaining after cleaning.
df_b.shape

In [None]:
# G3 is the prediction target. G1 and G2 are removed from the feature
# matrix together with it so they cannot leak into the model.
target_col = 'G3'
leaky_cols = ['G1', 'G2']

y = df_b[target_col]
X = df_b.drop(columns=[target_col, *leaky_cols])

In [None]:
# Hold out 20% of the rows for testing with a fixed seed. The features are
# still un-encoded at this point; encoders are fitted after the split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("Train/test shapes:", X_train.shape, X_test.shape)

In [None]:
# Group the feature columns by how they are treated during encoding.

# Passed through unchanged as plain numeric columns.
numeric_vars = ['age', 'absences']

# Treated as ordered levels and encoded with an ordinal encoder.
ordinal_vars = [
    'Medu', 'Fedu',
    'traveltime', 'studytime', 'failures',
    'famrel', 'freetime', 'goout',
    'Dalc', 'Walc', 'health',
]

# Treated as unordered categories and one-hot encoded.
nominal_vars = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]


In [None]:
# Encoding categorical variables
#
# Both encoders are fitted on the training split only and then applied to
# the training and test splits, so the test rows never influence the encoding.


def _to_frame(encoded, columns, index):
    """Wrap encoder output in a DataFrame that keeps the original row index.

    Sparse results are converted to a dense array first; dense results are
    used as they are.
    """
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    return pd.DataFrame(encoded, columns=columns, index=index)


# The dense/sparse switch of OneHotEncoder was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so it is not passed at all; the
# default output is densified in _to_frame instead.
# drop='first' removes one level per nominal variable: the regression below
# adds an intercept, and a full set of dummies plus an intercept is perfectly
# collinear.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
ohe = OneHotEncoder(drop='first', handle_unknown='ignore')
ohe.fit(X_train[nominal_vars])
nominal_columns = ohe.get_feature_names_out(nominal_vars)
X_train_nom = _to_frame(
    ohe.transform(X_train[nominal_vars]), nominal_columns, X_train.index
)
X_test_nom = _to_frame(
    ohe.transform(X_test[nominal_vars]), nominal_columns, X_test.index
)

# NOTE(review): with its default settings OrdinalEncoder raises on a level
# that was not present in the training split -- confirm that is acceptable.
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[ordinal_vars])
X_train_ord = _to_frame(
    ord_enc.transform(X_train[ordinal_vars]), ordinal_vars, X_train.index
)
X_test_ord = _to_frame(
    ord_enc.transform(X_test[ordinal_vars]), ordinal_vars, X_test.index
)

# Numeric columns are used unchanged.
X_train_num = X_train[numeric_vars]
X_test_num = X_test[numeric_vars]

# Combine all features (column-wise, aligned on the shared row index)
X_train_final = pd.concat([X_train_nom, X_train_ord, X_train_num], axis=1)
X_test_final = pd.concat([X_test_nom, X_test_ord, X_test_num], axis=1)

print("Final train/test shapes:", X_train_final.shape, X_test_final.shape)

In [None]:
# Correlation analysis
#
# df_b still contains the un-encoded string columns, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0. Restricting the frame to its
# numeric columns first works on every pandas version.
numeric_df = df_b.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Correlation of every numeric column with the target G3, strongest
# positive first.
corr_with_target = corr_matrix['G3'].sort_values(ascending=False)
print(corr_with_target)

In [None]:
# Linear regression with statsmodels
#
# has_constant='add' forces the intercept column to be added. With the
# default ('skip'), add_constant leaves the frame unchanged when it already
# holds a constant column -- e.g. a dummy that is all zeros in the test
# split -- and predict() would then fail on mismatched columns.
X_train_design = sm.add_constant(X_train_final, has_constant='add')
X_test_design = sm.add_constant(X_test_final, has_constant='add')

model = sm.OLS(y_train, X_train_design).fit()
preds = model.predict(X_test_design)

# Hold-out metrics computed on the test split.
r2 = r2_score(y_test, preds)
n, k = X_test_final.shape  # test rows, predictors (intercept excluded)
residual_dof = n - k - 1
# Adjusted R2 is undefined without positive residual degrees of freedom.
adj_r2 = 1 - (1 - r2) * (n - 1) / residual_dof if residual_dof > 0 else np.nan
# Take the square root explicitly: the `squared=False` shortcut of
# mean_squared_error is not available in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"R2: {r2:.4f}")
print(f"Adjusted R2: {adj_r2:.4f}")
print(f"RMSE: {rmse:.4f}")

print(model.summary())

In [None]:
# ANOVA test
#
# anova_lm expects a model fitted through the formula interface; the
# array-based `model` built with sm.OLS(...) carries no term information for
# it to test. An equivalent formula model is therefore fitted on the
# training split: nominal variables enter as categorical terms C(...),
# ordinal and numeric variables enter as plain numeric terms.
anova_train_df = pd.concat([X_train, y_train], axis=1)
anova_terms = (
    [f"C({col})" for col in nominal_vars] + ordinal_vars + numeric_vars
)
anova_formula = f"{y_train.name} ~ " + " + ".join(anova_terms)
anova_model = sm.formula.ols(anova_formula, data=anova_train_df).fit()

# Type II sums of squares: each term is tested after all the others.
anova_results = sm.stats.anova_lm(anova_model, typ=2)
print(anova_results)