In [2]:
#import data handling libraries
import pandas as pd
import numpy as np

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [3]:
#read in the main dataset
df = pd.read_csv('../Data/final_dataset_Nov_9.csv')

df

Unnamed: 0,STUDENT,GRAD_SEM,101,101_SEM,104,104_SEM,105,105_SEM,140,140_SEM,...,414_SEM,415,415_SEM,435,435_SEM,436,436_SEM,497,497_SEM,y
0,0,8.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,8.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5,0.0,0,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
4,7,7.0,0,0,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9174,13059,4.0,0,0,0,0,0,0,1,3,...,0,0,0,0,0,0,0,0,0,1
9175,13060,10.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9176,13061,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9177,13062,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
courses = ['101',
 '104',
 '105',
 '140',
 '143',
 '145',
 '150',
 '151',
 '160',
 '165',
 '166',
 '201',
 '207',
 '240',
 '265',
 '266',
 '267',
 '301',
 '302',
 '304',
 '314',
 '317',
 '341',
 '342',
 '350',
 '365',
 '373',
 '385',
 '397',
 '414',
 '415',
 '435',
 '436',
 '497']

course_SEM = [f'{x}_SEM' for x in courses]

X = df.drop(columns=['y','GRAD_SEM','STUDENT'])
y= df['y']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif
"""
This script performs feature selection and model evaluation using various classifiers with a pipeline approach.

The script includes the following steps:
1. Import necessary libraries and modules.
2. Define the number of features to select using SelectKBest.
3. Create a dictionary of classifiers, each with a pipeline that includes feature selection and the classifier.
4. Perform 5-fold cross-validation for each model.
5. Print the cross-validation results, including mean accuracy and standard deviation for each model.

Classifiers included:
- Linear Discriminant Analysis (LDA)
- Logistic Regression
- Linear Support Vector Classifier (SVC)
- Quadratic Discriminant Analysis (QDA)
- Polynomial LDA
- Polynomial Logistic Regression
- Gaussian Naive Bayes (GNB)
- K-Nearest Neighbors (KNN)
- Radial Basis Function (RBF) SVC
- Random Forest

Parameters:
- k (int): The number of features to select using SelectKBest.

Possible values for the parameter "scoring" in cross_val_score:
- 'accuracy'
- 'adjusted_rand_score'
- 'average_precision'
- 'balanced_accuracy'
- 'brier_score_loss'
- 'f1'
- 'f1_micro'
- 'f1_macro'
- 'f1_weighted'
- 'f1_samples'
- 'neg_log_loss'
- 'precision'
- 'recall'
- 'roc_auc'
- 'roc_auc_ovr'
- 'roc_auc_ovo'
- 'roc_auc_ovr_weighted'
- 'roc_auc_ovo_weighted'
- 'neg_mean_absolute_error'
- 'neg_mean_squared_error'
- 'neg_root_mean_squared_error'
- 'r2'
- 'explained_variance'
- 'max_error'
- 'neg_median_absolute_error'
- 'neg_mean_poisson_deviance'
- 'neg_mean_gamma_deviance'
"""
from sklearn.pipeline import Pipeline

# Define the number of features to select
k = 10


# Update classifiers to include feature selection
classifiers = {
    'lda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg': Pipeline([('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(max_iter=100000, random_state=42))]),
    'svc_linear': Pipeline([('select', SelectKBest(f_classif, k=k)), ('svc_linear', LinearSVC(dual='auto'))]),
    'qda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('qda', QuadraticDiscriminantAnalysis())]),
    'lda_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(penalty=None, max_iter=100000))]),
    'gnb': Pipeline([('select', SelectKBest(f_classif, k=k)), ('gnb', GaussianNB())]),
    'knn': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('knn', KNeighborsClassifier())]),
    'svc_rbf': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('svc', SVC(kernel='rbf'))]),
    'random_forest': Pipeline([('select', SelectKBest(f_classif, k=k)), ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))])
}

# Perform 5-fold cross-validation for each model
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, 
                             X_train[courses], 
                             y_train, 
                             cv=5,
                             scoring='accuracy',
                             verbose=0,
                             n_jobs=-1)
    cv_results[model_name] = scores
    # print(f"{model_name}: Mean -logloss = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Print the cross-validation results
for model_name, scores in cv_results.items():
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


lda: Mean accuracy = 0.5800, Std = 0.0144
log_reg: Mean accuracy = 0.5799, Std = 0.0143
svc_linear: Mean accuracy = 0.5800, Std = 0.0144
qda: Mean accuracy = 0.5140, Std = 0.0275
lda_poly: Mean accuracy = 0.5957, Std = 0.0187
log_reg_poly: Mean accuracy = 0.5951, Std = 0.0192
gnb: Mean accuracy = 0.5247, Std = 0.0081
knn: Mean accuracy = 0.5392, Std = 0.0329
svc_rbf: Mean accuracy = 0.5808, Std = 0.0152
random_forest: Mean accuracy = 0.5792, Std = 0.0146


In [7]:
from sklearn.feature_selection import SelectKBest, f_classif
"""
This script performs feature selection and model evaluation using various classifiers with a pipeline approach.

The script includes the following steps:
1. Import necessary libraries and modules.
2. Define the number of features to select using SelectKBest.
3. Create a dictionary of classifiers, each with a pipeline that includes feature selection and the classifier.
4. Perform 5-fold cross-validation for each model.
5. Print the cross-validation results, including mean accuracy and standard deviation for each model.

Classifiers included:
- Linear Discriminant Analysis (LDA)
- Logistic Regression
- Linear Support Vector Classifier (SVC)
- Quadratic Discriminant Analysis (QDA)
- Polynomial LDA
- Polynomial Logistic Regression
- Gaussian Naive Bayes (GNB)
- K-Nearest Neighbors (KNN)
- Radial Basis Function (RBF) SVC
- Random Forest

Parameters:
- k (int): The number of features to select using SelectKBest.

Possible values for the parameter "scoring" in cross_val_score:
- 'accuracy'
- 'adjusted_rand_score'
- 'average_precision'
- 'balanced_accuracy'
- 'brier_score_loss'
- 'f1'
- 'f1_micro'
- 'f1_macro'
- 'f1_weighted'
- 'f1_samples'
- 'neg_log_loss'
- 'precision'
- 'recall'
- 'roc_auc'
- 'roc_auc_ovr'
- 'roc_auc_ovo'
- 'roc_auc_ovr_weighted'
- 'roc_auc_ovo_weighted'
- 'neg_mean_absolute_error'
- 'neg_mean_squared_error'
- 'neg_root_mean_squared_error'
- 'r2'
- 'explained_variance'
- 'max_error'
- 'neg_median_absolute_error'
- 'neg_mean_poisson_deviance'
- 'neg_mean_gamma_deviance'
"""
from sklearn.pipeline import Pipeline

# Define the number of features to select
k = 10


# Update classifiers to include feature selection
classifiers = {
    'lda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg': Pipeline([('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(max_iter=100000, random_state=42))]),
    'svc_linear': Pipeline([('select', SelectKBest(f_classif, k=k)), ('svc_linear', LinearSVC(dual='auto'))]),
    'qda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('qda', QuadraticDiscriminantAnalysis())]),
    'lda_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(penalty=None, max_iter=100000))]),
    'gnb': Pipeline([('select', SelectKBest(f_classif, k=k)), ('gnb', GaussianNB())]),
    'knn': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('knn', KNeighborsClassifier())]),
    'svc_rbf': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('svc', SVC(kernel='rbf'))]),
    'random_forest': Pipeline([('select', SelectKBest(f_classif, k=k)), ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))])
}

# Perform 5-fold cross-validation for each model
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, 
                             X_train[course_SEM], 
                             y_train, 
                             cv=5,
                             scoring='accuracy',
                             verbose=0,
                             n_jobs=-1)
    cv_results[model_name] = scores
    # print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Print the cross-validation results
for model_name, scores in cv_results.items():
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


lda: Mean accuracy = 0.5921, Std = 0.0060
log_reg: Mean accuracy = 0.5920, Std = 0.0069
svc_linear: Mean accuracy = 0.5921, Std = 0.0059
qda: Mean accuracy = 0.5313, Std = 0.0158
lda_poly: Mean accuracy = 0.5830, Std = 0.0099
log_reg_poly: Mean accuracy = 0.5827, Std = 0.0100
gnb: Mean accuracy = 0.5314, Std = 0.0151
knn: Mean accuracy = 0.5746, Std = 0.0115
svc_rbf: Mean accuracy = 0.5951, Std = 0.0067
random_forest: Mean accuracy = 0.5951, Std = 0.0072


In [8]:
from sklearn.feature_selection import SelectKBest, f_classif
"""
This script performs feature selection and model evaluation using various classifiers with a pipeline approach.

The script includes the following steps:
1. Import necessary libraries and modules.
2. Define the number of features to select using SelectKBest.
3. Create a dictionary of classifiers, each with a pipeline that includes feature selection and the classifier.
4. Perform 5-fold cross-validation for each model.
5. Print the cross-validation results, including mean accuracy and standard deviation for each model.

Classifiers included:
- Linear Discriminant Analysis (LDA)
- Logistic Regression
- Linear Support Vector Classifier (SVC)
- Quadratic Discriminant Analysis (QDA)
- Polynomial LDA
- Polynomial Logistic Regression
- Gaussian Naive Bayes (GNB)
- K-Nearest Neighbors (KNN)
- Radial Basis Function (RBF) SVC
- Random Forest

Parameters:
- k (int): The number of features to select using SelectKBest.

Possible values for the parameter "scoring" in cross_val_score:
- 'accuracy'
- 'adjusted_rand_score'
- 'average_precision'
- 'balanced_accuracy'
- 'brier_score_loss'
- 'f1'
- 'f1_micro'
- 'f1_macro'
- 'f1_weighted'
- 'f1_samples'
- 'neg_log_loss'
- 'precision'
- 'recall'
- 'roc_auc'
- 'roc_auc_ovr'
- 'roc_auc_ovo'
- 'roc_auc_ovr_weighted'
- 'roc_auc_ovo_weighted'
- 'neg_mean_absolute_error'
- 'neg_mean_squared_error'
- 'neg_root_mean_squared_error'
- 'r2'
- 'explained_variance'
- 'max_error'
- 'neg_median_absolute_error'
- 'neg_mean_poisson_deviance'
- 'neg_mean_gamma_deviance'
"""
from sklearn.pipeline import Pipeline

# Define the number of features to select
k = 10


# Update classifiers to include feature selection
classifiers = {
    'lda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg': Pipeline([('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(max_iter=100000, random_state=42))]),
    'svc_linear': Pipeline([('select', SelectKBest(f_classif, k=k)), ('svc_linear', LinearSVC(dual='auto'))]),
    'qda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('qda', QuadraticDiscriminantAnalysis())]),
    'lda_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(penalty=None, max_iter=100000))]),
    'gnb': Pipeline([('select', SelectKBest(f_classif, k=k)), ('gnb', GaussianNB())]),
    'knn': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('knn', KNeighborsClassifier())]),
    'svc_rbf': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('svc', SVC(kernel='rbf'))]),
    'random_forest': Pipeline([('select', SelectKBest(f_classif, k=k)), ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))])
}

# Perform 5-fold cross-validation for each model
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, 
                             X_train, 
                             y_train, 
                             cv=5,
                             scoring='accuracy',
                             verbose=0,
                             n_jobs=-1)
    cv_results[model_name] = scores
    # print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Print the cross-validation results
for model_name, scores in cv_results.items():
    print(f"{model_name}: Mean accuracy = {scores.mean():.4f}, Std = {scores.std():.4f}")

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


lda: Mean accuracy = 0.5902, Std = 0.0147
log_reg: Mean accuracy = 0.5900, Std = 0.0147
svc_linear: Mean accuracy = 0.5901, Std = 0.0147
qda: Mean accuracy = 0.5505, Std = 0.0183
lda_poly: Mean accuracy = 0.5942, Std = 0.0197
log_reg_poly: Mean accuracy = 0.5946, Std = 0.0194
gnb: Mean accuracy = 0.5483, Std = 0.0208
knn: Mean accuracy = 0.5566, Std = 0.0080
svc_rbf: Mean accuracy = 0.5886, Std = 0.0129
random_forest: Mean accuracy = 0.5871, Std = 0.0115


In [None]:
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [100],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, 
                                   param_distributions=param_grid, 
                                   n_iter=500, 
                                   scoring='accuracy', 
                                   verbose=1, 
                                   n_jobs=-1)
random_search.fit(X_train, y_train)


# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'subsample': 0.8, 'objective': 'binary:logistic', 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'lambda': 1.5, 'gamma': 0.2, 'eval_metric': 'logloss', 'colsample_bytree': 1.0, 'alpha': 0}
Best accuracy found:  0.6575898915424481


In [34]:
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [100],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train[courses], label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, 
                                   param_distributions=param_grid, 
                                   n_iter=500, 
                                   scoring='accuracy', 
                                   verbose=1, 
                                   n_jobs=-1)
random_search.fit(X_train[courses], y_train)


# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Best parameters found:  {'subsample': 1.0, 'objective': 'binary:logistic', 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.2, 'lambda': 1.5, 'gamma': 0, 'eval_metric': 'logloss', 'colsample_bytree': 0.6, 'alpha': 0.1}
Best accuracy found:  0.639114172461572


  _data = np.array(data, dtype=dtype, copy=copy,


In [35]:
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
    'n_estimators': [100],  # Changed from 'num_boost_round' to 'n_estimators'
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
}

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(data=X_train[course_SEM], label=y_train)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, 
                                   param_distributions=param_grid, 
                                   n_iter=500, 
                                   scoring='accuracy', 
                                   verbose=1, 
                                   n_jobs=-1)
random_search.fit(X_train[course_SEM], y_train)


# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Best parameters found:  {'subsample': 0.8, 'objective': 'binary:logistic', 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.2, 'lambda': 2, 'gamma': 0, 'eval_metric': 'logloss', 'colsample_bytree': 1.0, 'alpha': 0.2}
Best accuracy found:  0.6444248344069906
