# Modeling Pipeline
This notebook covers:
- Preprocessing
- Baseline model
- Model experiments
- Hyperparameter tuning
- Evaluation
- Submission file generation

In [29]:
# Add xgboost dependency for gradient boosting
!pip install xgboost

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Raw CSV locations in the project's GitHub repository.
TRAIN_URL = "https://raw.githubusercontent.com/Arthur-IO/Data-Science-Final-Proj/master/data/train.csv"
TEST_URL = "https://raw.githubusercontent.com/Arthur-IO/Data-Science-Final-Proj/master/data/test.csv"
SUB_URL  = "https://raw.githubusercontent.com/Arthur-IO/Data-Science-Final-Proj/master/data/sample_submission.csv"

# Download the training data, the test data, and the sample submission, in that order.
train, test, sample_sub = (
    pd.read_csv(csv_url) for csv_url in (TRAIN_URL, TEST_URL, SUB_URL)
)

# Separate the label column from the feature columns.
target = "Target"
y = train[target]
X = train.drop(columns=target)

# Turn the class labels into integer codes; `le` is kept so the codes can be
# mapped back to the original labels with `le.inverse_transform`.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the feature columns by dtype: object columns are handled as
# categories, every other column as a number.
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(exclude="object").columns

# Numerical branch: fill missing values with the median, then standardize.
numerical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both pipelines so the model can use all features.
preprocess = ColumnTransformer(transformers=[
    ("num", numerical_pipe, numerical_cols),
    ("cat", categorical_pipe, categorical_cols),
])

# Hold out 20% of the rows for validation, stratified on the encoded target;
# the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)




# Baseline Model: Logistic Regression


In [30]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the validation split. Each split is transformed exactly once
# and the result is reused, instead of re-running the transform per call.
X_train_prepared = preprocess.fit_transform(X_train)
X_valid_prepared = preprocess.transform(X_valid)

# Baseline model: logistic regression with balanced class weights.
logreg = LogisticRegression(max_iter=1000, class_weight="balanced")
logreg.fit(X_train_prepared, y_train)

# Hard class predictions for the report, class probabilities for ROC-AUC.
y_pred = logreg.predict(X_valid_prepared)
y_proba = logreg.predict_proba(X_valid_prepared)

print(classification_report(y_valid, y_pred))

# calculate ROC-AUC score (one-vs-rest) to measure how well the model separates classes
print("ROC-AUC (ovr):", roc_auc_score(y_valid, y_proba, multi_class="ovr"))

              precision    recall  f1-score   support

           0       0.91      0.78      0.84      5059
           1       0.55      0.72      0.62      2988
           2       0.89      0.85      0.87      7257

    accuracy                           0.80     15304
   macro avg       0.78      0.79      0.78     15304
weighted avg       0.83      0.80      0.81     15304

ROC-AUC (ovr): 0.9222451077417014


# Model Experiments: Random Forest and XGBoost


In [31]:
# Transform each split once and share the resulting matrices across all
# models. Refitting the preprocessing per model and re-transforming the
# validation split on every loop iteration would only repeat identical work.
X_train_prepared = preprocess.fit_transform(X_train)
X_valid_prepared = preprocess.transform(X_valid)

rf = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
rf.fit(X_train_prepared, y_train)

# XGBClassifier is already imported in the first cell, so no re-import here.
# NOTE(review): unlike LogReg and RandomForest, this model is trained without
# any class weighting - confirm that is intended when comparing scores.
xgb = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
xgb.fit(X_train_prepared, y_train)

# Compare all fitted models on the same validation matrix.
# `logreg` is the baseline fitted in the previous cell.
models = {"LogReg": logreg, "RandomForest": rf, "XGB": xgb}
for name, model in models.items():
    y_proba = model.predict_proba(X_valid_prepared)
    print(name, "ROC-AUC (ovr):", roc_auc_score(y_valid, y_proba, multi_class="ovr"))

LogReg ROC-AUC (ovr): 0.9222451077417014
RandomForest ROC-AUC (ovr): 0.9307112225198733
XGB ROC-AUC (ovr): 0.9360948137212549


# Quick Checkpoint Summary

## Preprocessing
- Numerical features: median imputation + standard scaling
- Categorical features: most frequent imputation + one-hot encoding
- 80/20 train/validation split, stratified on the target

## Model Results
- **Logistic Regression**: 80% accuracy, 0.922 ROC-AUC (one-vs-rest)
- **Random Forest**: 0.931 ROC-AUC (one-vs-rest)
- **XGBoost**: 0.936 ROC-AUC (one-vs-rest) — best so far

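Tree-based models outperform the linear baseline on this dataset, and XGBoost is currently the best model. Class 1 (Enrolled) is the hardest to predict correctly: the logistic regression baseline reaches an F1 of only 0.62 on it, versus 0.84 and 0.87 on the other two classes.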