In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.sklearn
import joblib

In [1]:
# 1. Load data
def load_data(path):
    """Read a CSV into a pandas DataFrame.

    Parameters
    ----------
    path : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv``; all parsing options are the
        pandas defaults.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(path)
    return frame

In [None]:
# 2. Split data
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Separate features from the target and make a train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : label
        Column of ``df`` to predict; every other column is used as a feature.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` exactly as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df`` (raised by pandas).
    """
    features = df.drop(columns=target_column)
    target = df[target_column]
    return train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

In [None]:
# 3. Train models
def train_models(X_train, y_train, X_test, y_test, random_state=42):
    """Fit the candidate regressors and score each one with R².

    Parameters
    ----------
    X_train, y_train
        Data every candidate is fitted on.
    X_test, y_test
        Data every candidate is scored on with ``r2_score``.
    random_state : int or None, default 42
        Seed given to the stochastic candidates (decision tree, random forest,
        XGBoost) so that repeated runs produce the same scores and therefore
        the same model selection. ``LinearRegression`` takes no seed.

    Returns
    -------
    dict
        Maps each model name to a ``(r2_score, fitted_model)`` tuple.
    """
    models = {
        "LinearRegression": LinearRegression(),
        "DecisionTree": DecisionTreeRegressor(random_state=random_state),
        "RandomForest": RandomForestRegressor(random_state=random_state),
        "XGBoost": XGBRegressor(random_state=random_state),
    }

    scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        score = r2_score(y_test, model.predict(X_test))
        scores[name] = (score, model)

    return scores

In [None]:
# 4. Select best model
def get_best_model(scores_dict):
    """Pick the entry with the highest score.

    Parameters
    ----------
    scores_dict : dict
        Maps model name to a ``(score, model)`` tuple.

    Returns
    -------
    tuple
        ``(best_model_name, best_model)``. On ties the first entry in
        dictionary order wins.

    Raises
    ------
    ValueError
        If ``scores_dict`` is empty.
    """
    if not scores_dict:
        raise ValueError("scores_dict is empty: no model to select")

    def _rank(name):
        score = scores_dict[name][0]
        # NaN is the only value that is not equal to itself. Every comparison
        # against NaN is False, so a plain max() would keep a NaN entry that
        # happens to come first; rank it below every real score instead.
        return float("-inf") if score != score else score

    best_model_name = max(scores_dict, key=_rank)
    return best_model_name, scores_dict[best_model_name][1]

In [None]:
# 5. Tune best model using Optuna
def tune_model(best_model_name, X_train, y_train, X_test, y_test,
               n_trials=20, random_state=42):
    """Tune a RandomForest or XGBoost regressor with Optuna and refit it.

    Parameters
    ----------
    best_model_name : str
        ``"RandomForest"`` or ``"XGBoost"``. Any other name is not tuned.
    X_train, y_train
        Data every trial model and the final model are fitted on.
    X_test, y_test
        Data each trial is scored on with ``r2_score``.
    n_trials : int, default 20
        Number of Optuna trials.
    random_state : int or None, default 42
        Seed for the Optuna sampler and for every model built here, so the
        refitted model reproduces the best trial.

    Returns
    -------
    tuple
        ``(best_model, best_params)``. For an unsupported ``best_model_name``
        this is ``(None, {})`` and no study is run.
    """
    # Bail out before creating a study: there is nothing to search for other
    # model types, and running trials that all return a constant is wasted work.
    if best_model_name not in ("RandomForest", "XGBoost"):
        return None, {}

    if best_model_name == "RandomForest":
        estimator_cls = RandomForestRegressor
    else:
        estimator_cls = XGBRegressor

    def objective(trial):
        # Suggest in a fixed order; learning_rate only exists for XGBoost.
        params = {"n_estimators": trial.suggest_int("n_estimators", 50, 300)}
        if best_model_name == "XGBoost":
            params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3)
        params["max_depth"] = trial.suggest_int("max_depth", 2, 20)

        model = estimator_cls(random_state=random_state, **params)
        model.fit(X_train, y_train)
        # NOTE(review): trials are scored on the data passed as X_test/y_test.
        # If that is also the final hold-out set, the reported score will be
        # optimistic; consider passing a separate validation split.
        return r2_score(y_test, model.predict(X_test))

    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params

    # Retrain with the best params. The estimator is never truth-tested here:
    # a freshly constructed ensemble has not been fitted yet, and its length
    # (used by bool()) is defined in terms of its fitted sub-estimators.
    best_model = estimator_cls(random_state=random_state, **best_params)
    best_model.fit(X_train, y_train)
    return best_model, best_params

In [None]:
# 6. MLflow logging
def log_with_mlflow(model, model_name, score, params):
    """Record a model, its parameters and its score in a new MLflow run.

    Parameters
    ----------
    model
        Fitted estimator, logged through ``mlflow.sklearn.log_model``.
    model_name : str
        Logged as the ``model_name`` param and passed as the second argument
        to ``log_model``.
    score : float
        Logged as the ``r2_score`` metric.
    params : dict
        Extra parameters logged with ``mlflow.log_params``.
    """
    with mlflow.start_run():
        mlflow.log_param(key="model_name", value=model_name)
        mlflow.log_params(params)
        mlflow.log_metric(key="r2_score", value=score)
        mlflow.sklearn.log_model(model, model_name)

In [None]:
# 7. Save model
def save_model(model, filename="best_model.pkl"):
    """Persist ``model`` to disk with joblib.

    Parameters
    ----------
    model
        Object to serialize.
    filename : str, default "best_model.pkl"
        Destination path handed to ``joblib.dump``.
    """
    joblib.dump(value=model, filename=filename)

In [None]:
# Full pipeline runner
def run_pipeline(data_path, target_column):
    """Run the whole workflow: load, split, train, select, tune, log, save.

    The candidate with the highest R² is selected. If it is a RandomForest or
    XGBoost model it is tuned, and the tuned model replaces the baseline only
    when it scores at least as well; otherwise the untuned baseline is kept.
    The retained model is logged to MLflow and saved to disk.

    Parameters
    ----------
    data_path : str
        CSV file to load.
    target_column : str
        Name of the column to predict.

    Notes
    -----
    The same test split is used for model selection, for tuning and for the
    printed/logged score, so that score is an optimistic estimate.
    """
    df = load_data(data_path)
    X_train, X_test, y_train, y_test = split_data(df, target_column)

    model_scores = train_models(X_train, y_train, X_test, y_test)
    best_model_name, best_model = get_best_model(model_scores)
    best_score = model_scores[best_model_name][0]

    print(f"Best model before tuning: {best_model_name} - R²: {best_score:.4f}")

    # Start from the untuned winner; tuning may or may not improve on it.
    final_model, final_score, final_params = best_model, best_score, {}

    if best_model_name in ["RandomForest", "XGBoost"]:
        tuned_model, best_params = tune_model(best_model_name, X_train, y_train, X_test, y_test)
        tuned_preds = tuned_model.predict(X_test)
        tuned_score = r2_score(y_test, tuned_preds)
        print(f"Tuned {best_model_name} R²: {tuned_score:.4f}")

        # The search space is bounded (e.g. max_depth <= 20), so the tuned
        # model can score below the default configuration. Keep the better one.
        if tuned_score >= best_score:
            final_model, final_score, final_params = tuned_model, tuned_score, best_params
        else:
            print(f"Tuning did not improve {best_model_name}; keeping the untuned model.")

    log_with_mlflow(final_model, best_model_name, final_score, final_params)
    save_model(final_model)

In [None]:
# Execute the full pipeline on the cleaned student dataset.
# NOTE(review): "target_column_name" looks like a placeholder — confirm it
# matches a real column in the CSV before running.
run_pipeline("cleaned_student_data.csv", "target_column_name")