In this notebook, we will use xgboost to train a binary classification model to predict y given the course data

In [94]:
# %conda install xgboost

In [95]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [96]:
df = pd.read_csv("../Data/final_dataset_Nov_12.csv")
courses = ['101',
 '104',
 '105',
 '140',
 '143',
 '145',
 '150',
 '151',
 '160',
 '165',
 '166',
 '201',
 '207',
 '240',
 '265',
 '266',
 '267',
 '301',
 '302',
 '304',
 '314',
 '317',
 '341',
 '342',
 '350',
 '365',
 '373',
 '385',
 '397',
 '414',
 '415',
 '435',
 '436',
 '497']

#loop through the semesters
for i in range(1, 9):
    # Create a new column for the semester
    df[f'SEM_{i}'] = df[courses].apply(lambda x: x.tolist().count(i) - x.tolist().count(-i), axis=1)

# Create cumulative sum columns for each semester. This will give us the total number of courses passed by the student up to that semester. In particular, the final column will give us the total number of courses passed by the student within 8 semesters.
df[[f'SEM_{i}_cumulative' for i in range(1, 9)]] = df[[f'SEM_{i}' for i in range(1, 9)]].cumsum(axis=1)


#define X to be df without the y column and STUDENT column
X = df.drop(columns=['y', 'STUDENT','GRAD_SEM','SEM_1_cumulative'])
y=df['y']

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y,
                                                    random_state=42)

In [98]:
#output for sem_i and sem_i_cumulative

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [100],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, 
                                   param_distributions=param_grid, 
                                   n_iter=500, 
                                   scoring='accuracy', 
                                   verbose=1, 
                                   n_jobs=-1)
random_search.fit(X_train, y_train)


# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)


Fitting 5 folds for each of 500 candidates, totalling 2500 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'subsample': 0.6, 'objective': 'binary:logistic', 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.2, 'lambda': 1.5, 'gamma': 0, 'eval_metric': 'logloss', 'colsample_bytree': 0.8, 'alpha': 0.2}
Best accuracy found:  0.6553189160915042


In [99]:
#output for sem_i

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [100],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, 
                                   param_distributions=param_grid, 
                                   n_iter=500, 
                                   scoring='accuracy', 
                                   verbose=1, 
                                   n_jobs=-1)
random_search.fit(X_train.drop(columns=[f'SEM_{i}_cumulative' for i in range(2,9)]), y_train)


# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)


Fitting 5 folds for each of 500 candidates, totalling 2500 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'subsample': 1.0, 'objective': 'binary:logistic', 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.2, 'lambda': 1.5, 'gamma': 0.2, 'eval_metric': 'logloss', 'colsample_bytree': 0.8, 'alpha': 0.1}
Best accuracy found:  0.6579069154905282


In [80]:
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_objective,param_n_estimators,param_min_child_weight,param_max_depth,param_learning_rate,...,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.196911,0.011140,0.014549,0.004607,0.6,binary:logistic,100,3,6,0.01,...,0.0,"{'subsample': 0.6, 'objective': 'binary:logist...",0.633442,0.626906,0.606209,0.629085,0.627248,0.624578,0.009475,396
1,0.111196,0.018554,0.014844,0.005814,1.0,binary:logistic,100,5,3,0.20,...,0.2,"{'subsample': 1.0, 'objective': 'binary:logist...",0.656863,0.655773,0.635621,0.644880,0.662670,0.651162,0.009666,86
2,0.111486,0.008059,0.017822,0.004035,0.8,binary:logistic,100,3,6,0.20,...,0.1,"{'subsample': 0.8, 'objective': 'binary:logist...",0.660131,0.653050,0.631264,0.649782,0.657221,0.650289,0.010146,119
3,0.119280,0.015534,0.015074,0.004758,0.6,binary:logistic,100,5,6,0.10,...,0.1,"{'subsample': 0.6, 'objective': 'binary:logist...",0.662309,0.648693,0.635621,0.642702,0.667030,0.651271,0.011792,80
4,0.094043,0.014125,0.017434,0.006286,1.0,binary:logistic,100,3,3,0.10,...,0.0,"{'subsample': 1.0, 'objective': 'binary:logist...",0.655229,0.654684,0.633442,0.631808,0.661580,0.647349,0.012275,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.120761,0.009861,0.013744,0.001994,0.8,binary:logistic,100,5,3,0.10,...,0.0,"{'subsample': 0.8, 'objective': 'binary:logist...",0.656318,0.652505,0.630719,0.626906,0.660490,0.645388,0.013820,309
496,0.129776,0.024262,0.018041,0.006775,1.0,binary:logistic,100,3,3,0.01,...,0.1,"{'subsample': 1.0, 'objective': 'binary:logist...",0.592048,0.592048,0.576797,0.596405,0.593460,0.590152,0.006864,492
497,0.233304,0.071253,0.055380,0.051711,0.8,binary:logistic,100,3,6,0.01,...,0.0,"{'subsample': 0.8, 'objective': 'binary:logist...",0.632898,0.625272,0.594771,0.628540,0.620163,0.620329,0.013439,437
498,0.364114,0.054710,0.075608,0.079793,0.6,binary:logistic,100,1,10,0.10,...,0.2,"{'subsample': 0.6, 'objective': 'binary:logist...",0.651416,0.643791,0.635621,0.635621,0.656676,0.644625,0.008416,315


In [100]:
#output for sem_i_cumulative
# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [100],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, 
                                   param_distributions=param_grid, 
                                   n_iter=500, 
                                   scoring='accuracy', 
                                   verbose=1, 
                                   n_jobs=-1)
random_search.fit(X_train.drop(columns=[f'SEM_{i}' for i in range(2,9)]), y_train)


# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Best parameters found:  {'subsample': 0.6, 'objective': 'binary:logistic', 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.1, 'lambda': 2, 'gamma': 0, 'eval_metric': 'logloss', 'colsample_bytree': 1.0, 'alpha': 0.1}
Best accuracy found:  0.6553193798075764


In [57]:
# Define your parameters without 'n_estimators'
params = {
    'objective': 'binary:logistic',
    'max_depth': 10,
    'learning_rate': 0.1,
    # other parameters
}

# Specify the number of boosting rounds
num_boost_round = 200

# Initialize KFold and DataFrame to store metrics
skf = StratifiedKFold(n_splits=5, 
           shuffle=True, 
           random_state=42)
metrics_df = pd.DataFrame(columns=['cv_round', 'boost_round', 'accuracy', 'precision', 'recall', 'f1_score'])

# Perform manual cross-validation
for cv_round, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]
    
    dtrain_cv = xgb.DMatrix(data=X_train_cv, label=y_train_cv)
    dtest_cv = xgb.DMatrix(data=X_test_cv, label=y_test_cv)
    
    # Initialize model to None
    model = None
    
    # Train the model for each boosting round
    for boost_round in range(1, num_boost_round + 1):
        model = xgb.train(params, dtrain_cv, num_boost_round=1, xgb_model=model)
        
        # Make predictions
        y_pred_prob = model.predict(dtest_cv)
        y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]

        # Calculate scoring metrics
        accuracy = accuracy_score(y_test_cv, y_pred)
        precision = precision_score(y_test_cv, y_pred)
        recall = recall_score(y_test_cv, y_pred)
        f1 = f1_score(y_test_cv, y_pred)
        
        # Store the metrics in the DataFrame
        new_row = pd.DataFrame([{
            'cv_round': cv_round + 1,
            'boost_round': boost_round,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }])
        metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

# Set multi-level index
metrics_df.set_index(['cv_round', 'boost_round'], inplace=True)

# Print the DataFrame
# print(metrics_df)

  metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)


In [58]:
#for each boosting round, calculate the average of each metric
avg_metrics = metrics_df.groupby('boost_round').mean()

# Print the average metrics
print(avg_metrics)

             accuracy  precision    recall  f1_score
boost_round                                         
1            0.553998   0.547863  0.993438  0.706219
2            0.564483   0.554298  0.984857  0.709327
3            0.566661   0.556890  0.964918  0.706134
4            0.590897   0.600320  0.816535  0.677219
5            0.625219   0.649177  0.692614  0.663383
...               ...        ...       ...       ...
196          0.654637   0.684682  0.667082  0.675670
197          0.654637   0.684681  0.667082  0.675671
198          0.654365   0.684246  0.667335  0.675580
199          0.654501   0.684327  0.667587  0.675752
200          0.654365   0.684152  0.667587  0.675666

[200 rows x 4 columns]


In [59]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [200],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform grid search without early stopping
grid_search = GridSearchCV(
    estimator=xgb_model, 
    param_grid=param_grid, 
    scoring='accuracy', 
    cv=5, 
    verbose=1
)
grid_search.fit(
    X_train, 
    y_train
)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

Fitting 5 folds for each of 6561 candidates, totalling 32805 fits


KeyboardInterrupt: 

In [35]:
# Define your parameters without 'n_estimators'
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.1,
    # other parameters
}

# Specify the number of boosting rounds
num_boost_round = 100

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Train the model
bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)

# Create the DMatrix for the test data
dtest = xgb.DMatrix(data=X_test)

# Make predictions
y_pred = bst.predict(dtest)

# Convert probabilities to binary predictions (0 or 1)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy:.4f}')

cm = confusion_matrix(y_test, y_pred_binary)
print('Confusion Matrix:')
print(cm)

report = classification_report(y_test, y_pred_binary)
print('Classification Report:')
print(report)


Accuracy: 0.6449
Confusion Matrix:
[[548 297]
 [355 636]]
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.65      0.63       845
           1       0.68      0.64      0.66       991

    accuracy                           0.64      1836
   macro avg       0.64      0.65      0.64      1836
weighted avg       0.65      0.64      0.65      1836

