The objective of this model is to predict each student's GradeClass.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow
import pickle

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
data = pd.read_csv('data/Student_performance_data.csv')
# Preview the first rows as the cell output.
data.head()

# Exploratory Data Analysis


First of all, I will split the data into three sets: training, validation and test.

In [None]:
# Feature matrix: every column except GPA and the target GradeClass.
X = data.drop(columns=['GPA', 'GradeClass'])
# Target kept as a single-column DataFrame (list selector), not a Series.
y = data.loc[:, ['GradeClass']]

In [None]:
# Hold out 30% of the rows (later divided into validation and test); the
# split is stratified on the target and seeded for reproducibility.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Split the 30% hold-out in half: validation and test each get 15% of the
# original rows, again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test)

Notes about the columns:

- The StudentID will be removed to avoid biases in the grade calculation.
- Age, Gender and Ethnicity will be dropped to avoid bias, but they must still be considered in the subsequent evaluation in order to comply with Responsible AI principles.

- StudyTimeWeekly and Absences need to be scaled. I will check if they are normally distributed.


Extra considerations:

- For the preprocessing, I will expect a dictionary with all the columns present in the original dataframe, even though several of them will be dropped.
- No missing values are expected.


## Preprocessing

Conclusions for the features :

- ParentalEducation and ParentalSupport are ordinal categorical variables and will be transformed with a MinMaxScaler.
- Since StudyTimeWeekly and Absences look closer to a uniform distribution than to a normal one, I will use the MinMaxScaler for both of them.

I will convert GradeClass through OneHotEncoding

In [None]:
# Show the column names of the raw frame.
data.columns

In [None]:
# Name of the MLflow experiment used by this notebook.
experiment_name = 'student-performance'
# Point the MLflow client at the tracking server on this local address and
# select the experiment for subsequent runs.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment(experiment_name)

In [None]:
# IDs of every experiment whose name matches `experiment_name`. The result is
# kept as a list because it is passed as `experiment_ids` to search_runs later.
# The variable is reused instead of repeating the literal so that renaming the
# experiment only has to be done in one place.
experiment_id = [experiment.experiment_id for experiment in mlflow.search_experiments()
                 if experiment.name == experiment_name]

In [None]:
from sklearn.preprocessing import MinMaxScaler


# Columns rescaled to [0, 1] (range learned from the training rows only).
minmax_cols = ['ParentalEducation', 'StudyTimeWeekly',
                'Absences', 'ParentalSupport']
sc = MinMaxScaler()

# The raw values are always read from `X` (which is never modified) through the
# index of each split. This keeps the cell idempotent: re-running it does not
# fit the scaler on already-scaled data and then pickle a useless scaler.
#
# Whole-column assignment (`frame[cols] = ...`) replaces the columns, so the
# integer columns become float cleanly. `frame.loc[:, cols] = float_array`
# would instead try to write into the integer columns in place, which recent
# pandas versions deprecate.
x_sc_train = sc.fit_transform(X.loc[X_train.index, minmax_cols])
X_train[minmax_cols] = x_sc_train

x_sc_val = sc.transform(X.loc[X_val.index, minmax_cols])
X_val[minmax_cols] = x_sc_val

x_sc_test = sc.transform(X.loc[X_test.index, minmax_cols])
X_test[minmax_cols] = x_sc_test

# Persist the fitted scaler; it is attached to each MLflow run as an artifact.
with open('minmax_scaler.bin', 'wb') as f_out:
    pickle.dump(sc, f_out)

We don't need this transformation for now because of the models we are going to use.


In [None]:
# Class distribution of the training target.
y_train.hist()

from sklearn.preprocessing import OneHotEncoder


# The one-hot targets are stored under their own names. y_train / y_val /
# y_test must remain single-column label frames: the randomized search calls
# `y_train.values.ravel()` and the classifiers below are fitted on one label
# per row, so overwriting the labels with a one-hot ndarray breaks those cells.
ohe = OneHotEncoder(sparse_output=False)
y_train_ohe = ohe.fit_transform(y_train)
y_val_ohe = ohe.transform(y_val)
y_test_ohe = ohe.transform(y_test)

In [None]:
# Count missing values per training column.
X_train.isna().sum()


In [None]:
# Distribution of StudyTimeWeekly over the full dataset.
data.StudyTimeWeekly.hist()

In [None]:
# Distribution of Absences over the full dataset.
data.Absences.hist()

In [None]:
# Categorical / ordinal columns to inspect.
cat_columns = [
    'ParentalEducation',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Sports',
    'Music',
    'Volunteering',
]

# One histogram per column, each drawn from a single-column frame.
for col in cat_columns:
    data.loc[:, [col]].hist()

In [None]:
# Distribution of ParentalEducation in the training split as it currently stands.
X_train.ParentalEducation.hist()

In [None]:
# Identifier and demographic columns excluded from the model inputs.
columns_to_drop = ['StudentID', 'Age', 'Gender', 'Ethnicity']


In [None]:
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_val = X_val.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# Model Training

In [None]:
import scipy.stats as stats

from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One search space per candidate estimator. The 'clf' entry holds the estimator
# instance (wrapped in a list); every 'clf__*' entry is a hyper-parameter
# distribution or list of values for that estimator.
param_dists = [
    # Support vector classifier: regularisation strength on a log scale.
    {
        'clf': [SVC()],
        'clf__C': stats.loguniform(1e-2, 1e3),
    },
    # Single decision tree.
    {
        'clf': [DecisionTreeClassifier()],
        'clf__criterion': ['gini', 'entropy'],
        'clf__splitter': ['best', 'random'],
        'clf__class_weight': ['balanced', None],
    },
    # Linear model trained with SGD (hinge loss, elastic-net penalty).
    {
        'clf': [SGDClassifier(loss="hinge", penalty="elasticnet", fit_intercept=True)],
        'clf__average': [True, False],
        'clf__l1_ratio': stats.uniform(0, 1),
        'clf__alpha': stats.loguniform(1e-2, 1e0),
    },
    # Random forest.
    {
        'clf': [RandomForestClassifier()],
        'clf__bootstrap': [True, False],
        'clf__max_depth': [10, 20, 30, None],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__min_samples_split': [2, 5, 10],
        'clf__n_estimators': stats.randint(20, 100),
    },
]

In [None]:
n_iter_search = 10  # parameter settings sampled per estimator
n_cv = 5            # cross-validation folds

# Maps estimator class name -> best pipeline and its scores.
results = {}

for param_dist in param_dists:
    print("-------------- " + str(param_dist) + " --------------")
    clf = param_dist['clf'][0]

    # Build a filtered copy instead of popping 'clf' from the shared dict:
    # mutating `param_dists` made a second run of this cell fail with
    # KeyError: 'clf'.
    search_params = {name: dist for name, dist in param_dist.items() if name != 'clf'}

    steps = [('clf', clf)]
    random_search = RandomizedSearchCV(
        Pipeline(steps), param_distributions=search_params,
        cv=n_cv, n_iter=n_iter_search, scoring='f1_macro',
        random_state=42,  # same seed as the data splits, for reproducible sampling
    )
    random_search.fit(X_train, y_train.values.ravel())

    # best_score_ is the mean cross-validated macro-F1 of the best setting
    # (computed on held-out folds of the training data), not a score on the
    # data the model was fitted on; the result key is kept for compatibility.
    cv_score = random_search.best_score_
    val_score = random_search.score(X_val, y_val.values.ravel())

    # Use the class *name* as key so the summary table has readable columns.
    estimator_name = type(random_search.best_estimator_.steps[0][1]).__name__
    results[estimator_name] = {'estimator': random_search.best_estimator_,
                               'mean_train_score': cv_score, 'val_score': val_score}


In [None]:
# Summary table: one column per entry of `results`.
pd.DataFrame(results)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score , f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
 
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Re-select the tracking server and experiment before the hyperopt runs,
# reusing `experiment_name` rather than repeating the literal.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment(experiment_name)

In [None]:
# Per-model hyperopt sub-spaces. Each carries a 'type' tag that objective()
# uses to pick the estimator class; the remaining keys are passed to its
# constructor as keyword arguments.
svm_space = {
    'type': 'svm',
    'C': hp.lognormal('SVM_C', 0, 1.0),
}

rf_space = {
    'type': 'rf',
    'max_depth': hp.randint('rf_max_depth', 5, 100),
    'criterion': hp.choice('rf_criterion', ['gini', 'entropy']),
}

dt_space = {
    'type': 'dt',
    'criterion': hp.choice('dt_criterion', ['gini', 'entropy']),
    'splitter': hp.choice('dt_splitter', ['best', 'random']),
    'class_weight': hp.choice('dt_class_weight', ['balanced', None]),
}

xgb_space = {
    'type': 'xgb',
    'learning_rate': hp.choice('xgb_learning_rate', [0.0005, 0.001, 0.01, 0.5, 1]),
    'max_depth': hp.choice('xgb_max_depth', range(3, 21, 3)),
    'gamma': hp.choice('xgb_gamma', [i / 10.0 for i in range(0, 5)]),
    'colsample_bytree': hp.choice('xgb_colsample_bytree', [i / 10.0 for i in range(3, 10)]),
    'reg_alpha': hp.choice('xgb_reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda': hp.choice('xgb_reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'seed': hp.choice('xgb_seed', [0, 7, 42]),
}

# Top-level choice between the four model families.
search_space = hp.choice('classifier_type', [svm_space, rf_space, dt_space, xgb_space])

In [None]:
def objective (params):
    """Train and evaluate one sampled configuration inside a nested MLflow run.

    `params` is one sample of `search_space`: a dict with a 'type' key naming
    the model family ('svm', 'rf', 'dt' or 'xgb') plus that estimator's
    constructor keyword arguments. The 'type' key is removed from `params`.

    The model is fitted on X_train / y_train and scored on X_val / y_val.
    Accuracy and macro-F1 are logged as MLflow metrics, plus micro-averaged
    one-vs-rest ROC AUC when the estimator exposes `predict_proba`. The fitted
    model is logged under artifact path "mlruns" and the pickled scaler file
    under "minmax_scaler".

    Returns a hyperopt result dict whose loss is the negated macro-F1, or 0 if
    the 'type' value is not one of the supported families.
    """
    estimators_by_type = {
        'svm': SVC,
        'rf': RandomForestClassifier,
        'dt': DecisionTreeClassifier,
        'xgb': XGBClassifier,
    }

    with mlflow.start_run(nested=True):
        print(params)
        mlflow.set_tag("model", params['type'])
        classifier_type = params.pop('type')

        estimator_cls = estimators_by_type.get(classifier_type)
        if estimator_cls is None:
            return 0
        clf = estimator_cls(**params)
        mlflow.log_params(params)

        clf.fit(X_train, y_train)

        # Validation metrics.
        val_pred = clf.predict(X_val)
        mlflow.log_metric("accuracy", accuracy_score(y_val, val_pred))
        f1 = f1_score(y_val, val_pred, average='macro')
        mlflow.log_metric("f1_score", f1)

        # ROC AUC needs class probabilities, which not every estimator provides.
        if hasattr(clf, 'predict_proba'):
            val_proba = clf.predict_proba(X_val)
            mlflow.log_metric(
                "roc_auc",
                roc_auc_score(y_val, val_proba, average='micro', multi_class='ovr'),
            )

        # Store the model and the fitted scaler with the run.
        mlflow.sklearn.log_model(sk_model=clf, artifact_path="mlruns")
        mlflow.log_artifact(local_path="minmax_scaler.bin", artifact_path="minmax_scaler")

    return {'loss': -f1, 'status': STATUS_OK}

In [None]:
# Tree-structured Parzen Estimator as the suggestion algorithm.
algo = tpe.suggest

# Parent MLflow run; every objective() call opens its own nested run inside it.
with mlflow.start_run(nested=True):
    best_result = fmin(fn=objective, space=search_space, algo=algo,
                       max_evals=32, trials=Trials())

In [None]:
import hyperopt


# Translate the best result returned by fmin back into the parameter values of search_space.
print(hyperopt.space_eval(search_space, best_result))

In [None]:
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

In [None]:
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")

# Only the top-ranked run is needed, so ask the server for a single result.
# NOTE(review): runs are ranked by accuracy here, while the hyperopt loss and
# the randomized search both optimise macro-F1 — confirm which is intended.
runs = client.search_runs(
  experiment_ids=experiment_id,
  run_view_type=ViewType.ACTIVE_ONLY,
  max_results=1,
  order_by=["metrics.accuracy DESC"]
)
if not runs:
    # Fail with a readable message instead of an IndexError on `[0]`.
    raise RuntimeError(
        f"No active runs found for experiment ids {experiment_id}; "
        "run the hyper-parameter search first."
    )
run = runs[0]
run

In [None]:
run_id = run.info.run_id

# objective() logs the model with artifact_path="mlruns", so the registry URI
# must point at that same sub-path; "models" does not exist inside the run.
mlflow.register_model(
    model_uri=f"runs:/{run_id}/mlruns",
    name=experiment_name
)

In [None]:
# Same artifact sub-path that objective() used when logging the model.
model_uri = f"runs:/{run_id}/mlruns"

model_src = RunsArtifactRepository.get_underlying_uri(model_uri)
filter_string = "run_id='{}'".format(run_id)
results = client.search_model_versions(filter_string)
# Pick the highest version explicitly rather than relying on the order in
# which the server returns versions (the run may have been registered twice).
model_version = max(results, key=lambda mv: int(mv.version)).version
model_version

In [None]:
# Move the selected model version to the "Production" stage, leaving any
# version already in that stage untouched (archive_existing_versions=False).
# NOTE(review): model stages are reported as deprecated in recent MLflow
# releases in favour of aliases — confirm against the installed version.
new_stage = "Production"
client.transition_model_version_stage(
    name=experiment_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)