In [1]:
import pandas as pd
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn import metrics   #Additional scklearn functions
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import math
from IPython.display import display
import os


X_train = pd.read_csv("train_test_data/X_train.csv", index_col="uid").drop("Unnamed: 0", axis="columns")
X_test = pd.read_csv("train_test_data/X_test.csv", index_col="uid").drop("Unnamed: 0", axis="columns")
y_test = pd.read_csv("train_test_data/y_test.csv", index_col="uid").drop("Unnamed: 0", axis="columns")
y_train = pd.read_csv("train_test_data/y_train.csv", index_col="uid").drop("Unnamed: 0", axis="columns")

In [2]:
# We only consider the post scores
class_names = [col for col in y_test.columns if "post" in col]
# Use the wk 9 to 10 training data only
used_weeks = [str(wk) for wk in range(9,11)]
filtered_columns = X_train.columns[X_train.columns.str.endswith(tuple(used_weeks))]
# Select all wk 9 and 10 features
filtered_features = X_train[filtered_columns]
train_data = {"class": {}, "regr": {}}
test_data = {"class": {}, "regr": {}}
# Prepare training and test data for classification model and regression model respectively
for name in class_names:
    # Classification labels
    if "class" in name:
        target_type = "class"
    else:
        target_type = "regr"
    train_data[target_type][name] = {"x": {}, "y": {}}
    test_data[target_type][name] = {"x": {}, "y": {}}
    # Only consider data which has the post score being not null
    not_null_indices = np.logical_not(y_train.loc[:, name].isnull())
    train_data[target_type][name]["x"] = filtered_features[not_null_indices]
    train_data[target_type][name]["y"] = y_train[not_null_indices].loc[:, name]
    test_data[target_type][name]["x"] = X_test[filtered_columns]
    test_data[target_type][name]["y"] = y_test.loc[:, name]

In [3]:
# Function to find the best hyperparameters for each model
def find_best_model(rscv, x, y):
    best_model = rscv.fit(x, y)
    print("processing")
    # Print out the best parameters for the model
    print()
    print("Optimal parameters: {", end="")
    print("Learning rate: ", best_model.best_estimator_.get_params()["learning_rate"], end=", ")
    print("Gamma: ", best_model.best_estimator_.get_params()["gamma"], end=", ")
    print("Max depth: ", best_model.best_estimator_.get_params()["max_depth"], end=", ")
    print("Subsample: ", best_model.best_estimator_.get_params()["subsample"], end=", ")
    print("Max features at split: ", best_model.best_estimator_.get_params()["colsample_bytree"], end=", ")
    print("Min child weight: ",
          best_model.best_estimator_.get_params()["min_child_weight"], end=", ")
    print("Alpha: ", best_model.best_estimator_.get_params()["reg_alpha"], end=", ")
    print("Lambda: ", best_model.best_estimator_.get_params()["reg_lambda"], end=", ")
    print("Number of estimators (trees): ", best_model.best_estimator_.get_params()["n_estimators"], end="}\n\n")

    return best_model


In [4]:
def report_classifier_performance(model, x, y):
    predict = model.predict(x)
    predict_proba = model.predict_proba(x)[:, 1]
    print("Accuracy :", metrics.accuracy_score(y, predict))
    print("logloss:", metrics.log_loss(y, predict))
    print("AUC Score:", metrics.roc_auc_score(y, predict_proba))
    print()

    
# Classifier parameter grid
class_parameters = {"learning_rate": [0.1, 0.05, 0.01, 0.001], 
                    "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
                    "max_depth": [4, 5, 6, 7, 9],
                    "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
                    "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
                    "min_child_weight": [1, 3, 5, 7],
                    "n_estimators": [100, 250, 500, 1000, 1500, 2000]}

In [5]:
# Baseline classifier
baseline_classifier = XGBClassifier(objective='binary:logistic', scale_pos_weight=1)
xgb_rscv_classifier = RandomizedSearchCV(baseline_classifier, param_distributions=class_parameters,
                                         scoring="neg_log_loss", verbose=False, cv=5, random_state=2, n_jobs=4, n_iter=50)

# Find the optimal classifier and its performance on training and test set respectively
for name in train_data["class"]:
    print("Prediction of " + name)
    model = find_best_model(xgb_rscv_classifier, train_data["class"][name]["x"], train_data["class"][name]["y"])
    #model = XGBClassifier(objective="binary:logistic", learning_rate=0.01, gamma=1, max_depth=10, seed=27, scale_pos_weight=1,
    #                      subsample=0.5, colsample_bytree=1.0, min_child_weight=1, reg_alpha=0, reg_lambda=1, n_estimators=100)
    #model.fit(train_data["class"][name]["x"], train_data["class"][name]["y"])
    print("Performance on training set:")
    report_classifier_performance(model, train_data["class"][name]["x"], train_data["class"][name]["y"])
    print("Performance on test set:")
    report_classifier_performance(model, test_data["class"][name]["x"], test_data["class"][name]["y"])
    print("-------------------------------------------------------------------------------------------------\n")

Prediction of panas_pos_raw_class_post




processing

Optimal parameters: {Learning rate:  0.001, Gamma:  0.01, Max depth:  5, Subsample:  0.4, Max features at split:  0.8, Min child weight:  1, Alpha:  0, Lambda:  1, Number of estimators (trees):  1500}

Performance on training set:
Accuracy : 0.9655172413793104
logloss: 1.1909922894796794
AUC Score: 0.9952380952380953

Performance on test set:
Accuracy : 0.7777777777777778
logloss: 7.675372487472395
AUC Score: 0.8333333333333334

-------------------------------------------------------------------------------------------------

Prediction of panas_neg_raw_class_post




processing

Optimal parameters: {Learning rate:  0.001, Gamma:  2, Max depth:  5, Subsample:  0.7, Max features at split:  0.6, Min child weight:  1, Alpha:  0, Lambda:  1, Number of estimators (trees):  500}

Performance on training set:
Accuracy : 0.9
logloss: 3.4539309459864147
AUC Score: 0.9819004524886878

Performance on test set:
Accuracy : 0.4444444444444444
logloss: 19.188386796601534
AUC Score: 0.35000000000000003

-------------------------------------------------------------------------------------------------

Prediction of flourishing_scale_raw_class_post




processing

Optimal parameters: {Learning rate:  0.05, Gamma:  1, Max depth:  4, Subsample:  0.4, Max features at split:  0.8, Min child weight:  1, Alpha:  0, Lambda:  1, Number of estimators (trees):  1000}

Performance on training set:
Accuracy : 0.9642857142857143
logloss: 1.233527728389668
AUC Score: 1.0

Performance on test set:
Accuracy : 0.5555555555555556
logloss: 15.350833819103704
AUC Score: 0.6499999999999999

-------------------------------------------------------------------------------------------------

Prediction of panas_pos_imp_class_post




processing

Optimal parameters: {Learning rate:  0.05, Gamma:  1, Max depth:  9, Subsample:  0.2, Max features at split:  0.8, Min child weight:  1, Alpha:  0, Lambda:  1, Number of estimators (trees):  1000}

Performance on training set:
Accuracy : 0.9333333333333333
logloss: 2.302638399489392
AUC Score: 0.9955357142857143

Performance on test set:
Accuracy : 0.6666666666666666
logloss: 11.513103153288048
AUC Score: 0.7777777777777779

-------------------------------------------------------------------------------------------------

Prediction of panas_neg_imp_class_post


In [4]:
def report_regression_performance(model, x, y):
    predict = model.predict(x)
    print("RMSE :", math.sqrt(metrics.mean_squared_error(y, predict)))
    df = pd.DataFrame({'Actual': np.array(y).reshape(-1),
                       'Predicted': np.array(predict).reshape(-1)})
    display(df)


# Classifier parameter grid
regr_parameters = {"colsample_bytree": [0.3, 0.6, 0.8, 1.0],
                   "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
                   "min_child_weight": [1, 3, 5, 7],
                   "learning_rate": [0.1, 0.05, 0.01, 0.001],
                   "max_depth": [2, 4, 7, 10],
                   "n_estimators": [100, 250, 500, 1000, 1500, 2000],
                   'reg_alpha': [1e-5, 1e-2,  0.75],
                   'reg_lambda': [1e-5, 1e-2, 0.45],
                   "subsample": [0.2, 0.4, 0.5, 0.6, 0.7]}                                           

In [5]:
# Baseline regressor
baseline_regressor = XGBRegressor(objective="reg:squarederror")
xgb_rscv_regressor = RandomizedSearchCV(baseline_regressor, param_distributions=regr_parameters, 
                                        scoring="neg_mean_squared_error", iid=False, 
                                        cv=5, verbose=False, n_jobs=4, n_iter=50)

# Find the optimal classifier and its performance on training and test set respectively
for name in train_data["regr"]:
    print("Prediction of " + name)
    model = find_best_model(xgb_rscv_regressor, train_data["regr"][name]["x"], train_data["regr"][name]["y"])
    print("Performance on training set:")
    report_regression_performance(model, train_data["regr"][name]["x"], train_data["regr"][name]["y"])
    print("Performance on test set:")
    report_regression_performance(model, test_data["regr"][name]["x"], test_data["regr"][name]["y"])
    print("-------------------------------------------------------------------------------------------------\n")

Prediction of panas_pos_raw_post


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


processing

Optimal parameters: {Learning rate:  0.05, Gamma:  0.5, Max depth:  4, Subsample:  0.7, Max features at split:  0.3, Min child weight:  7, Alpha:  0.01, Lambda:  0.45, Number of estimators (trees):  100}

Performance on training set:
RMSE : 2.3006675958787994


Unnamed: 0,Actual,Predicted
0,31.0,31.879549
1,27.0,27.206074
2,28.0,29.08379
3,34.0,33.861973
4,32.0,32.215164
5,43.0,36.408585
6,36.0,34.397022
7,33.0,32.161957
8,17.0,20.840256
9,27.0,27.204685


Performance on test set:
RMSE : 5.632913053882458


Unnamed: 0,Actual,Predicted
0,25.0,27.333611
1,19.0,31.564211
2,33.0,29.877413
3,32.0,25.439102
4,29.0,31.865566
5,16.0,23.64431
6,26.0,26.522257
7,27.0,28.306042
8,28.0,27.083006


-------------------------------------------------------------------------------------------------

Prediction of panas_neg_raw_post


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


processing

Optimal parameters: {Learning rate:  0.1, Gamma:  0.01, Max depth:  4, Subsample:  0.6, Max features at split:  0.6, Min child weight:  1, Alpha:  0.01, Lambda:  0.01, Number of estimators (trees):  2000}

Performance on training set:
RMSE : 0.02940807049354463


Unnamed: 0,Actual,Predicted
0,24.0,23.947317
1,17.0,17.00555
2,13.0,12.987899
3,19.0,19.019075
4,12.0,12.022677
5,28.0,27.991869
6,12.0,12.009696
7,25.0,25.031002
8,33.0,32.989052
9,37.0,36.952763


Performance on test set:
RMSE : 9.812527853468195


Unnamed: 0,Actual,Predicted
0,20.0,26.502104
1,23.0,15.890419
2,17.0,31.627123
3,21.0,28.45372
4,15.0,13.785615
5,17.0,18.046316
6,39.0,17.690716
7,18.0,17.187283
8,22.0,15.150069


-------------------------------------------------------------------------------------------------

Prediction of flourishing_scale_raw_post


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


processing

Optimal parameters: {Learning rate:  0.01, Gamma:  2, Max depth:  2, Subsample:  0.4, Max features at split:  0.3, Min child weight:  7, Alpha:  0.01, Lambda:  1e-05, Number of estimators (trees):  2000}

Performance on training set:
RMSE : 5.530785636513608


Unnamed: 0,Actual,Predicted
0,45.0,47.054317
1,46.0,44.784611
2,44.0,46.402569
3,33.0,36.146645
4,47.0,47.316608
5,39.0,40.825245
6,53.0,46.421959
7,48.0,45.224552
8,41.0,42.807468
9,38.0,40.399616


Performance on test set:
RMSE : 7.024072606646081


Unnamed: 0,Actual,Predicted
0,31.0,37.527508
1,31.0,44.744766
2,50.0,43.52631
3,42.0,44.658203
4,48.0,46.874172
5,47.0,38.268341
6,37.0,45.02914
7,49.0,45.941948
8,41.0,44.494076


-------------------------------------------------------------------------------------------------

Prediction of panas_pos_imp_post


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


processing

Optimal parameters: {Learning rate:  0.05, Gamma:  1, Max depth:  7, Subsample:  0.4, Max features at split:  1.0, Min child weight:  1, Alpha:  0.01, Lambda:  1e-05, Number of estimators (trees):  2000}

Performance on training set:
RMSE : 0.28079316432878165


Unnamed: 0,Actual,Predicted
0,31.0,31.020729
1,27.0,27.192423
2,28.0,27.997889
3,34.0,34.166584
4,32.0,31.889477
5,43.0,42.614388
6,36.0,35.784153
7,33.0,32.783566
8,17.0,16.898182
9,27.0,27.046957


Performance on test set:
RMSE : 5.356261832250222


Unnamed: 0,Actual,Predicted
0,25.0,25.741905
1,19.0,32.573921
2,33.0,31.908583
3,32.0,26.999275
4,29.0,30.595886
5,16.0,21.486814
6,26.0,25.078253
7,27.0,30.088383
8,28.0,25.958933


-------------------------------------------------------------------------------------------------

Prediction of panas_neg_imp_post


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


processing

Optimal parameters: {Learning rate:  0.1, Gamma:  0.01, Max depth:  10, Subsample:  0.7, Max features at split:  0.3, Min child weight:  1, Alpha:  0.01, Lambda:  1e-05, Number of estimators (trees):  250}

Performance on training set:
RMSE : 0.03265488007271522


Unnamed: 0,Actual,Predicted
0,24.0,23.953562
1,17.0,16.974777
2,13.0,12.992293
3,19.0,19.064936
4,12.0,12.041485
5,28.0,27.985289
6,12.0,12.037071
7,25.0,25.00638
8,33.0,32.996155
9,37.0,36.930267


Performance on test set:
RMSE : 10.854650858578509


Unnamed: 0,Actual,Predicted
0,20.0,25.829168
1,23.0,17.350296
2,17.0,32.840187
3,21.0,29.54006
4,15.0,18.282812
5,17.0,17.976631
6,39.0,15.051043
7,18.0,17.123926
8,22.0,12.801208


-------------------------------------------------------------------------------------------------

Prediction of flourishing_scale_imp_post


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


processing

Optimal parameters: {Learning rate:  0.05, Gamma:  0.5, Max depth:  10, Subsample:  0.5, Max features at split:  0.3, Min child weight:  7, Alpha:  0.75, Lambda:  1e-05, Number of estimators (trees):  500}

Performance on training set:
RMSE : 2.309563156087042


Unnamed: 0,Actual,Predicted
0,45.0,46.579205
1,46.0,45.342564
2,44.0,44.611248
3,33.0,33.381367
4,47.0,47.16124
5,39.0,39.527966
6,53.0,50.251637
7,48.0,47.564167
8,41.0,41.864029
9,38.0,38.043617


Performance on test set:
RMSE : 6.842601943910931


Unnamed: 0,Actual,Predicted
0,31.0,37.425507
1,31.0,42.945602
2,50.0,46.617569
3,42.0,46.930843
4,48.0,46.114059
5,47.0,40.178093
6,37.0,47.012836
7,49.0,46.030804
8,41.0,47.517975


-------------------------------------------------------------------------------------------------

