# **STUDENT AI** - MATH MODEL CREATION (CLASSIFICATION)

## Objectives

Output a usable model for predicting student performance for use in a streamlit dashboard.

## Inputs

Main dataset, cleaned and engineered for model training.

## Outputs

Pipeline and .pkl file to use for predicting a student's math score based on the calculated best feature variables.


---

# Import required libraries

In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

### Pipeline
from sklearn.pipeline import Pipeline

### Feature Engineering
from feature_engine.encoding import OrdinalEncoder

### Feature Scaling
from sklearn.preprocessing import StandardScaler

### libraries for custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

### Feature Balancing
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

### Feature  Selection
from sklearn.feature_selection import SelectFromModel

### ML algorithms 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

### EqualFrequencyDiscretiser
from feature_engine.discretisation import EqualFrequencyDiscretiser

### packages for classification report and confusion matrix
from sklearn.metrics import make_scorer, recall_score

### Train test split
from sklearn.model_selection import train_test_split

### Packages for generating a classification report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix

### GridSearchCV
from sklearn.model_selection import GridSearchCV

print('All Libraries Loaded')

All Libraries Loaded


  from pandas import MultiIndex, Int64Index


# Change working directory

### Set the working directory to notebook parent folder
If the output does not match, click **'clear all outputs'** and then **'restart'** the notebook. 
Then run cells from top to bottom.

In [2]:
# Step up one level from the folder the notebook was started in, so that
# relative paths used later resolve against the parent directory.
os.chdir(os.path.dirname(os.getcwd()))

# Re-read the working directory after the change and report it for a visual check.
current_dir = os.getcwd()
print('If correct, Active Directory should read: /workspace/student-AI')
print(f"Active Directory: {current_dir}")

If correct, Active Directory should read: /workspace/student-AI
Active Directory: /workspace/student-AI


### Load cleaned dataset

In [None]:
# Load the cleaned dataset and keep only the columns listed below.
df_maths = (
    pd.read_csv('outputs/dataset/Expanded_data_with_more_features_clean.csv')
    .filter(['Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep', 'MathScore'])
)
df_maths

In [None]:
# Discretise MathScore with an equal-frequency discretiser using q=2,
# so the numeric score becomes a class label for classification.
efd = EqualFrequencyDiscretiser(q=2, variables=['MathScore'])
df_maths_efd = efd.fit_transform(df_maths)

# Show the ranges stored by the fitted discretiser.
print(f"* The classes represent the following ranges: \n{efd.binner_dict_} \n")
# Count rows per class and annotate the bars of the first container with their counts.
fig_maths_efd = sns.countplot(data=df_maths_efd, x='MathScore')
plt.bar_label(fig_maths_efd.containers[0])
plt.show()

In [None]:
# Preview the frame returned by efd.fit_transform.
df_maths_efd

### Split Data Set into Train and Test sets

In [None]:
 math_train_features, math_test_features, math_train_scores, math_test_scores = train_test_split(
    df_maths_efd.drop(['MathScore'], axis=1),
    df_maths_efd['MathScore'],
    test_size = 0.2,
    random_state = 7
)

print("* Train set:", math_train_features.shape, math_train_scores.shape, "\n* Test set:",  math_test_features.shape, math_test_scores.shape)

In [None]:
# Plot how many training rows fall in each target class to check class balance.
sns.set_style("whitegrid")
math_train_scores.value_counts().plot(kind='bar', title='Train Set Target Distribution')
plt.show()

In [None]:
# Feature columns kept for training; also passed as `variables` to the
# OrdinalEncoder built in PipelineOptimization.
feature_set = ['ParentEduc','EthnicGroup']

### Remove all other features...

In [None]:
# Reduce both splits to the columns named in feature_set.
math_train_features = math_train_features.filter(feature_set)
math_test_features = math_test_features.filter(feature_set)
# Preview the reduced test features.
math_test_features

In [None]:
def PipelineOptimization(model):
    """Return a two-step pipeline: ordinal encoding followed by ``model``.

    The encoder is applied to the columns listed in the module-level
    ``feature_set`` with ``encoding_method='arbitrary'``. The estimator is
    registered under the step name ``"model"``, which is why grid-search
    parameter names in this notebook carry the ``model__`` prefix.

    Parameters
    ----------
    model : estimator
        The classifier placed as the final pipeline step.

    Returns
    -------
    Pipeline
        An unfitted pipeline with steps ``"OrdinalCategoricalEncoder"`` and
        ``"model"``.
    """
    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=feature_set)
    steps = [
        ("OrdinalCategoricalEncoder", encoder),
        ("model", model),
    ]
    return Pipeline(steps)

### Define best algorithm and hyperparameters

In [None]:
# Estimator to tune, keyed by a display name.
selected_model = {
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
}

# Hyperparameter grid per estimator name; the "model__" prefix targets the
# pipeline step named "model".
selected_model_parameters = {
    "AdaBoostClassifier": {
        'model__n_estimators': [80],
        'model__learning_rate': [0.1],
    },
}

selected_model_parameters

### Final GridSearch CV

In [None]:
from sklearn.model_selection import GridSearchCV
class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate model and summarise the fold scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator. Each estimator is wrapped by
        ``PipelineOptimization`` before the search.
    parameters : dict
        Maps the same display names to the parameter grid handed to
        ``GridSearchCV`` for that model.

    Attributes
    ----------
    grid_searches : dict
        Filled by ``fit``: display name -> fitted ``GridSearchCV``.
    """

    def __init__(self, models, parameters):
        self.models = models
        self.parameters = parameters
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, x, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every model and store it in ``grid_searches``.

        ``x``, ``y`` are passed to ``GridSearchCV.fit``; ``cv``, ``n_jobs``,
        ``verbose`` and ``scoring`` are forwarded to the ``GridSearchCV``
        constructor.

        NOTE: ``refit`` is accepted for interface compatibility but is not
        forwarded, so ``GridSearchCV`` uses its own default for ``refit``.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            model = PipelineOptimization(self.models[key])

            parameters = self.parameters[key]
            grid_search = GridSearchCV(model, parameters, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring)
            grid_search.fit(x, y)
            self.grid_searches[key] = grid_search

    def score_summary(self, sort_by='mean_score'):
        """Build a table with one row per (model, parameter set).

        Each row holds the parameter values plus the minimum, maximum, mean
        and standard deviation of that parameter set's per-fold test scores.

        Parameters
        ----------
        sort_by : str
            Column used to sort the table in descending order.

        Returns
        -------
        tuple
            ``(summary_frame, grid_searches)``. The frame starts with the
            estimator and score columns, followed by the parameter columns.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No fitted grid searches found; call fit() before score_summary().")

        def row(key, scores, parameters):
            summary = {
                'estimator': key,
                'minimum_score': min(scores),
                'maximum_score': max(scores),
                'mean_score': np.mean(scores),
                'standard_deviation_score': np.std(scores),
            }
            return pd.Series({**parameters, **summary})

        rows = []
        for name, grid_search in self.grid_searches.items():
            cv_results = grid_search.cv_results_
            parameters = cv_results['params']

            # Use the fold count recorded by the fitted search rather than the
            # `cv` constructor argument, which may be None, a splitter object
            # or an iterable instead of an int.
            fold_scores = [
                cv_results[f"split{fold}_test_score"]
                for fold in range(grid_search.n_splits_)
            ]

            # Shape (n_parameter_sets, n_folds): one row of fold scores per parameter set.
            all_scores = np.column_stack(fold_scores)
            for params, scores in zip(parameters, all_scores):
                rows.append(row(name, scores, params))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'minimum_score', 'mean_score', 'maximum_score', 'standard_deviation_score']
        columns = columns + [column for column in df.columns if column not in columns]

        return df[columns], self.grid_searches

In [None]:
# Run the grid search for the selected model on the training split.
math_model = HyperparameterOptimizationSearch(models=selected_model, parameters=selected_model_parameters)
math_model.fit(math_train_features, math_train_scores,
           # Score each fold with recall restricted to class label 0.
           scoring = make_scorer(recall_score, labels=[0], average=None),
           # n_jobs=-1 and 5-fold cross-validation are forwarded to GridSearchCV.
           n_jobs=-1,cv=5)


### Custom Confusion Matrix Function from CI Customer Churn Course

In [None]:
def confusion_matrix_and_report(x, y, pipeline, label_map):
    """Print a labelled confusion matrix and a classification report.

    Parameters
    ----------
    x : features passed to ``pipeline.predict``.
    y : true target values for ``x``.
    pipeline : fitted estimator exposing ``predict``.
    label_map : list of str
        Display names for the classes, used for the matrix axes and as
        ``target_names`` of the classification report.
    """
    prediction = pipeline.predict(x)

    # Transposed so that rows are predictions and columns are actual classes,
    # matching the axis labels below.
    matrix = confusion_matrix(y_true=y, y_pred=prediction).T
    actual_labels = ["Actual " + label for label in label_map]
    prediction_labels = ["Prediction " + label for label in label_map]

    print('---  Confusion Matrix  ---')
    print(pd.DataFrame(matrix, columns=[actual_labels], index=[prediction_labels]))
    print("\n")

    print('---  Classification Report  ---')
    print(classification_report(y, prediction, target_names=label_map), "\n")

def clf_performance(math_train_features, math_train_scores, math_test_features, math_test_scores, pipeline, label_map):
    """Print the confusion matrix and classification report for both splits.

    The train split is reported first, then the test split, each under its
    own banner, using ``confusion_matrix_and_report``.
    """
    evaluation_sets = (
        ("#### Train Set #### \n", math_train_features, math_train_scores),
        ("#### Test Set ####\n", math_test_features, math_test_scores),
    )
    for banner, features, scores in evaluation_sets:
        print(banner)
        confusion_matrix_and_report(features, scores, pipeline, label_map)

In [None]:
# score_summary returns the score table (sorted by mean score, best first)
# together with the dict of fitted grid searches keyed by estimator name.
grid_search_summary_final, grid_search_pipelines_final = math_model.score_summary()
grid_search_summary_final

In [None]:
# Name of the top-ranked estimator: first row of the sorted summary,
# looked up by the 'estimator' column rather than by position.
final_model = grid_search_summary_final.iloc[0]['estimator']
final_model

In [None]:
# Take the best estimator found by the grid search for the chosen model name.
pipeline_clf_final = grid_search_pipelines_final[final_model].best_estimator_
pipeline_clf_final

In [None]:
# Number of leading pipeline steps that transform the features before the model.
data_cleaning_feat_eng_steps = 1

# Run only those leading steps on the training features to recover the
# column names the model step receives.
preprocessing_steps = pipeline_clf_final.steps[:data_cleaning_feat_eng_steps]
transformed_train_features = Pipeline(preprocessing_steps).transform(math_train_features)
columns_after_data_cleaning_feat_eng = transformed_train_features.columns

# Pair each column with the fitted model's importance, most important first.
df_feature_importance_final = pd.DataFrame(
    data={
        'Feature': columns_after_data_cleaning_feat_eng,
        'Importance': pipeline_clf_final['model'].feature_importances_,
    }
).sort_values(by='Importance', ascending=False)

# Feature names in descending order of importance.
best_features_final = df_feature_importance_final['Feature'].to_list()

# Most important features statement and plot
print(f"* These are the {len(best_features_final)} most important features in descending order. "
      f"The model was trained on them: \n{best_features_final}")

df_feature_importance_final.plot(kind='bar', x='Feature', y='Importance')
plt.show()

In [None]:
def confusion_matrix_and_report(x, y, pipeline, label_map):
    """Print a labelled confusion matrix and a classification report.

    Parameters
    ----------
    x : features passed to ``pipeline.predict``.
    y : true target values for ``x``.
    pipeline : fitted estimator exposing ``predict``.
    label_map : list of str
        Display names for the classes, used for the matrix axes and as
        ``target_names`` of the classification report.
    """
    prediction = pipeline.predict(x)

    # Transposed so that rows are predictions and columns are actual classes,
    # matching the axis labels below.
    matrix = confusion_matrix(y_true=y, y_pred=prediction).T
    actual_labels = ["Actual " + label for label in label_map]
    prediction_labels = ["Prediction " + label for label in label_map]

    print('---  Confusion Matrix  ---')
    print(pd.DataFrame(matrix, columns=[actual_labels], index=[prediction_labels]))
    print("\n")

    print('---  Classification Report  ---')
    print(classification_report(y, prediction, target_names=label_map), "\n")

def clf_performance(math_train_features, math_train_scores, math_test_features, math_test_scores, pipeline, label_map):
    """Print the confusion matrix and classification report for both splits.

    The train split is reported first, then the test split, each under its
    own banner, using ``confusion_matrix_and_report``.
    """
    evaluation_sets = (
        ("#### Train Set #### \n", math_train_features, math_train_scores),
        ("#### Test Set ####\n", math_test_features, math_test_scores),
    )
    for banner, features, scores in evaluation_sets:
        print(banner)
        confusion_matrix_and_report(features, scores, pipeline, label_map)

In [None]:
# Display names for the target classes, used for the confusion-matrix axes and
# as target_names in the classification report.
# NOTE(review): the order presumably follows the discretiser's class indices (0, then 1) — confirm.
label_map = ['might need assistance', 'will not need assistance']

In [None]:
# Report confusion matrices and classification reports for the train and test sets.
clf_performance(math_train_features, math_train_scores, math_test_features, math_test_scores, pipeline_clf_final, label_map )


# Analysis
This is a good verification that reducing the dataset to only the feature variables had no effect on the end result. I can go ahead and save the model for use in the dashboard.

## Create Final Model Files

#### Check variable content before saving

In [None]:
# math_train_features.head()
# math_train_scores.head()
# math_test_features.head()
# math_test_scores.head()

### Set Model Type / Version and destination

In [None]:
# Version tag and target name; both are used to build the output folder and file names.
version = "v1"
target = "math"

# With the values above this resolves to "outputs/models/math/v1".
file_path = f"outputs/models/{target}/{version}"
file_path

### Output files

In [None]:
# Names of every artefact this cell writes; also used to clear stale copies first.
file_names = [
    f"{target}-train-features.csv",
    f"{target}-train-scores.csv",
    f"{target}-test-features.csv",
    f"{target}-test-scores.csv",
    f"{target}-model.pkl",
    f"{target}-labels.pkl",
    f"{target}-feature-importance.png",
]

# Ensure the directory exists
os.makedirs(file_path, exist_ok=True)

# Remove files left over from a previous run so the folder only holds fresh outputs
print('*** Removing previous files  ***')
for name in file_names:
    file = os.path.join(file_path, name)
    if os.path.exists(file):
        os.remove(file)
        print(f"{file} removed")
    else:
        print(f"{file} does not exist")

print('')

print(f'*** Creating files in: {file_path} ***')

# Save the train/test splits as .csv files (the row index is not written)
csv_outputs = {
    f"{target}-train-features.csv": math_train_features,
    f"{target}-train-scores.csv": math_train_scores,
    f"{target}-test-features.csv": math_test_features,
    f"{target}-test-scores.csv": math_test_scores,
}
for name, data in csv_outputs.items():
    data.to_csv(os.path.join(file_path, name), index=False)
    print(f'{name} created')

# Save .pkl files: the fitted pipeline and the class display names.
# The printed names match the files actually written to disk.
pkl_outputs = {
    f"{target}-model.pkl": pipeline_clf_final,
    f"{target}-labels.pkl": label_map,
}
for name, value in pkl_outputs.items():
    joblib.dump(value=value, filename=os.path.join(file_path, name))
    print(f'{name} created')

# Save features plot image
df_feature_importance_final.plot(kind='bar', x='Feature', y='Importance')
plt.savefig(os.path.join(file_path, f"{target}-feature-importance.png"), bbox_inches='tight')
print(f'{target}-feature-importance.png created')

