# Student's performance analysis - trying to predict students' math performances through the data set

First, we load in the data and inspect it. 

In [23]:
import os

import numpy as np
import pandas as pd

data = pd.read_csv('Students Performance/StudentsPerformance.csv')

# Convert to categorical!
for i in range(data.shape[1]):
    if i not in [5,6,7]:
        data.iloc[:, i] = data.iloc[:,i].astype("category")

        
# Never mind. We should convert it with label encoders

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for i in range(data.shape[1]):
    if i not in [5,6,7]:
        data.iloc[:, i] = label_encoder.fit_transform(data.iloc[:,i])



data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75


So this is the data. What do we do with it now? We could try a logistic regression analysis, since we have many classes. The classes are mostly binary or tertiary, apart from parental level of education. Random Forest and XGBoost are also great options, so I will implement these with cross-validation. For each score, I will create one of each model mentioned above and compute their MSE scores. But first, we divide into training and test data. 

In [24]:
from sklearn.model_selection import train_test_split 

train, test = train_test_split(data, random_state = 123)

print(train.shape)
print(test.shape)

col_names = list(data.columns)
# [word for word in query if word not in stop_words]


(750, 8)
(250, 8)


## Predicting Math score

First, we predict the math score. 

In [25]:
#math_pred_feats = [x for i, x in enumerate(col_names) if i in (list(range(0,5)) + list(range(6,8)))]
math_pred_feats = [x for i, x in enumerate(col_names) if i in list(range(0,5))]
math_pred_label = col_names[5]

train_math = train[math_pred_feats]
y_train_math = train[math_pred_label]
#print(train_math.head())
test_math = test[math_pred_feats]
y_test_math = test[math_pred_label]
#print(test_math.head())



from xgboost import XGBRegressor
import xgboost as xgb


xgb_model = XGBRegressor(max_depth = 5,
                n_estimators=500,
                nthread=4,
                subsample=1.0,
                colsample_bytree=0.7,
                seed=1302)
xgb_params = xgb_model.get_xgb_params()

xgb_model.fit(train_math, y_train_math, verbose = True)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1302,
       silent=True, subsample=1.0)

Now, we have a model. Now, we use that to predict

In [26]:
preds = xgb_model.predict(test_math)
from sklearn.metrics import mean_squared_error

print(np.sqrt(mean_squared_error(y_test_math,preds)))

14.635693790916656


Quite a high MSE. Let's see if we can reduce this by performing a grid search. 

In [27]:
from sklearn.grid_search import GridSearchCV

params = {
    'colsample_bytree':[0.4,0.7],
    'gamma':[0,0.01,0.5,0.9],
    'min_child_weight':[1,3],
    'learning_rate':[0.01,0.1,1],
    'max_depth':[3,4,5],
    'n_estimators':[500],
    'reg_alpha':[1e-5, 0.1],
    'reg_lambda':[1e-5, 0.1],
    'subsample':[0.8]
}

grid_search = GridSearchCV(estimator = xgb_model, param_grid = params,n_jobs=1, iid=False,verbose=True, scoring = 'neg_mean_squared_error')
print("Fitting model...")
grid_search.fit(train_math, y_train_math)
print("Model fitted")
#print("Scores")
#print(grid_search.grid_scores_)
print("Best score: ")
print(grid_search.best_score_)
print("Best model: ")
print(grid_search.best_params_)

Fitting model...
Fitting 3 folds for each of 576 candidates, totalling 1728 fits


[Parallel(n_jobs=1)]: Done 1728 out of 1728 | elapsed: 11.7min finished


Model fitted
Scores
[mean: -179.50580, std: 15.88163, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'reg_alpha': 1e-05, 'reg_lambda': 1e-05, 'subsample': 0.8}, mean: -179.57240, std: 15.90126, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'reg_alpha': 1e-05, 'reg_lambda': 0.1, 'subsample': 0.8}, mean: -179.51171, std: 15.89261, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 1e-05, 'subsample': 0.8}, mean: -179.58358, std: 15.90825, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 0.8}, mean: -179.49748, std: 15.87985, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.0

In [28]:
print("Best score: ")
print(grid_search.best_score_)
print("Best model: ")
print(grid_search.best_params_)

Best score: 
-179.4907217389701
Best model: 
{'colsample_bytree': 0.4, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 1e-05, 'subsample': 0.8}


Now use the best model to fit, and check how much the MSE has been reduced. 

In [33]:
best_model = XGBRegressor(colsample_bytree=0.4,
    gamma=0.01,
    min_child_weight=3,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=500,
    reg_alpha=0.1,
    reg_lambda=1e-5,
    subsample=0.8, seed = 123)
best_model.fit(train_math, y_train_math)

y_preds = best_model.predict(test_math)
print(mean_squared_error(y_test_math,y_preds))

181.49997657896094


Now, let's try randomforest. For speed, let's just do a quick Randomized search. 

In [48]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()

from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    "max_depth":[3,4,5],
    "max_features":[3,4,5],
    "bootstrap":[True,False],
    "min_samples_split":[2,3,4,5,6]
}

rand_search = RandomizedSearchCV(rf_model, param_distributions = param_dist, n_jobs=1, cv = 5, verbose = True, n_iter=90)
rand_search.fit(train_math, y_train_math)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=90, n_jobs=1,
          param_distributions={'max_depth': [3, 4, 5], 'max_features': [3, 4, 5], 'bootstrap': [True, False], 'min_samples_split': [2, 3, 4, 5, 6]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=True)


[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    7.7s finished


In [54]:
print(rand_search.best_score_)
print(rand_search.best_params_)

0.18036090911335423
{'min_samples_split': 3, 'max_features': 5, 'max_depth': 3, 'bootstrap': True}


Now use the best model!

In [56]:
best_rf_model = RandomForestRegressor(min_samples_split=3,max_features=5,max_depth=3, bootstrap=True)
best_rf_model.fit(train_math,y_train_math)

rf_preds = best_rf_model.predict(test_math)
print(mean_squared_error(y_test_math,y_preds))

181.49997657896094


Not very good either. Better can be achieved. 

In [57]:
print(best_rf_model)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
