In [3]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [4]:
#read in the main dataset
grades = pd.read_csv('final_dataset_Nov_9.csv')

grades

Unnamed: 0,STUDENT,GRAD_SEM,265,265_SEM,166,166_SEM,165,165_SEM,207,207_SEM,...,181_SEM,415,415_SEM,497,497_SEM,145,145_SEM,342,342_SEM,y
0,0,8.0,-1,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,0.0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,8.0,0,0,1,2,1,1,1,4,...,0,0,0,0,0,0,0,0,0,1
3,5,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,7.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9306,13059,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9307,13060,10.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9308,13061,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9309,13062,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
courses = ['265', '166', '165', '207', '140', '104', '385', '267', '317', '201', '150', '143', '105', '101', '266', '373', '341', '301', '414', '435', '365', '436', '492', '160', '314', '397', '495', '302', '350', '490', '304', '240', '106', '151', '181', '415', '497', '145', '342']

gen_crses = [str(x) for x in [105,
                                140,
                                143,
                                145,
                                150,
                                151,
                                160,
                                165,
                                166,
                                207,
                                265,
                                266,
                                267]]
gen_crses_SEM = [f'{x}_SEM' for x in gen_crses]

print(len(courses),len(gen_crses))

39 13


In [6]:
X = grades[courses]
X_SEM = grades[gen_crses_SEM]
y = grades['y']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_SEM, y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

classifiers = {
    # Putting linear decision boundary classifiers first
    'lda' : LinearDiscriminantAnalysis(),
    'log_reg' : LogisticRegression(max_iter= 100000, random_state=42),
    'svc_linear' : LinearSVC(dual = 'auto'),

    # Quadratic boundaries
    'qda' : QuadraticDiscriminantAnalysis(),
    'lda_poly' : Pipeline([('scale', StandardScaler()),
                           ('poly',PolynomialFeatures(2)),
                           ('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly' : Pipeline([('scale', StandardScaler()),
                               ('poly',PolynomialFeatures(2)),('log_reg', LogisticRegression(penalty=None, max_iter= 100000))]),
    'gnb' : GaussianNB(),

    # Complex boundaries
    'knn' : Pipeline([('scale', StandardScaler()),('knn', KNeighborsClassifier())]),   
    'svc_rbf' : Pipeline([('scale', StandardScaler()),('svc',SVC(kernel= 'rbf'))]),
    'random_forest' : RandomForestClassifier(random_state=42,
                                             class_weight='balanced')
}

In [10]:
from sklearn.feature_selection import SelectKBest, f_classif
"""
This script performs feature selection and model evaluation using various classifiers with a pipeline approach.

The script includes the following steps:
1. Import necessary libraries and modules.
2. Define the number of features to select using SelectKBest.
3. Create a dictionary of classifiers, each with a pipeline that includes feature selection and the classifier.
4. Perform 5-fold cross-validation for each model.
5. Print the cross-validation results, including mean accuracy and standard deviation for each model.

Classifiers included:
- Linear Discriminant Analysis (LDA)
- Logistic Regression
- Linear Support Vector Classifier (SVC)
- Quadratic Discriminant Analysis (QDA)
- Polynomial LDA
- Polynomial Logistic Regression
- Gaussian Naive Bayes (GNB)
- K-Nearest Neighbors (KNN)
- Radial Basis Function (RBF) SVC
- Random Forest

Parameters:
- k (int): The number of features to select using SelectKBest.

Possible values for the parameter "scoring" in cross_val_score:
- 'accuracy'
- 'adjusted_rand_score'
- 'average_precision'
- 'balanced_accuracy'
- 'brier_score_loss'
- 'f1'
- 'f1_micro'
- 'f1_macro'
- 'f1_weighted'
- 'f1_samples'
- 'neg_log_loss'
- 'precision'
- 'recall'
- 'roc_auc'
- 'roc_auc_ovr'
- 'roc_auc_ovo'
- 'roc_auc_ovr_weighted'
- 'roc_auc_ovo_weighted'
- 'neg_mean_absolute_error'
- 'neg_mean_squared_error'
- 'neg_root_mean_squared_error'
- 'r2'
- 'explained_variance'
- 'max_error'
- 'neg_median_absolute_error'
- 'neg_mean_poisson_deviance'
- 'neg_mean_gamma_deviance'
"""
from sklearn.pipeline import Pipeline

# Define the number of features to select
k = 10


# Update classifiers to include feature selection
classifiers = {
    'lda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg': Pipeline([('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(max_iter=100000, random_state=42))]),
    'svc_linear': Pipeline([('select', SelectKBest(f_classif, k=k)), ('svc_linear', LinearSVC(dual='auto'))]),
    'qda': Pipeline([('select', SelectKBest(f_classif, k=k)), ('qda', QuadraticDiscriminantAnalysis())]),
    'lda_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('lda', LinearDiscriminantAnalysis())]),
    'log_reg_poly': Pipeline([('scale', StandardScaler()), ('poly', PolynomialFeatures(2)), ('select', SelectKBest(f_classif, k=k)), ('log_reg', LogisticRegression(penalty=None, max_iter=100000))]),
    'gnb': Pipeline([('select', SelectKBest(f_classif, k=k)), ('gnb', GaussianNB())]),
    'knn': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('knn', KNeighborsClassifier())]),
    'svc_rbf': Pipeline([('scale', StandardScaler()), ('select', SelectKBest(f_classif, k=k)), ('svc', SVC(kernel='rbf'))]),
    'random_forest': Pipeline([('select', SelectKBest(f_classif, k=k)), ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))])
}

# Perform 5-fold cross-validation for each model
cv_results = {}
for model_name, model in classifiers.items():
    scores = cross_val_score(model, 
                             X_train, 
                             y_train, 
                             cv=5,
                             scoring='neg_log_loss')
    cv_results[model_name] = scores
    print(f"{model_name}: Mean -logloss = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Print the cross-validation results
for model_name, scores in cv_results.items():
    print(f"{model_name}: Mean -logloss = {scores.mean():.4f}, Std = {scores.std():.4f}")

lda: Mean -logloss = -0.6733, Std = 0.0054
log_reg: Mean -logloss = -0.6734, Std = 0.0058
svc_linear: Mean -logloss = nan, Std = nan
qda: Mean -logloss = -1.2436, Std = 0.0257
lda_poly: Mean -logloss = -0.6723, Std = 0.0061


Traceback (most recent call last):
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 370, in _score
    response_method = _check_response_method(estimator, self._response_method)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2145, in _check_response_method
    raise AttributeError(
AttributeError: Pipeline has none of the following attributes: predict_proba.

Traceback (most recent call last):
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/p

log_reg_poly: Mean -logloss = -0.6721, Std = 0.0061
gnb: Mean -logloss = -1.2231, Std = 0.0270
knn: Mean -logloss = -2.7094, Std = 0.5358


Traceback (most recent call last):
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 370, in _score
    response_method = _check_response_method(estimator, self._response_method)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2145, in _check_response_method
    raise AttributeError(
AttributeError: Pipeline has none of the following attributes: predict_proba.

Traceback (most recent call last):
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/erdos_fall_2024/lib/p

svc_rbf: Mean -logloss = nan, Std = nan
random_forest: Mean -logloss = -0.7577, Std = 0.0407
lda: Mean -logloss = -0.6733, Std = 0.0054
log_reg: Mean -logloss = -0.6734, Std = 0.0058
svc_linear: Mean -logloss = nan, Std = nan
qda: Mean -logloss = -1.2436, Std = 0.0257
lda_poly: Mean -logloss = -0.6723, Std = 0.0061
log_reg_poly: Mean -logloss = -0.6721, Std = 0.0061
gnb: Mean -logloss = -1.2231, Std = 0.0270
knn: Mean -logloss = -2.7094, Std = 0.5358
svc_rbf: Mean -logloss = nan, Std = nan
random_forest: Mean -logloss = -0.7577, Std = 0.0407


In [11]:
for model_name, model in classifiers.items():
    model.fit(X_train,y_train)

accs = {model_name: accuracy_score(y_test, model.predict(X_test)) for model_name, model in classifiers.items()}

accs

  f = msb / msw
  f = msb / msw


{'lda': 0.6049382716049383,
 'log_reg': 0.6044015029522276,
 'svc_linear': 0.6049382716049383,
 'qda': 0.596886741814278,
 'lda_poly': 0.6092324208266238,
 'log_reg_poly': 0.6113794954374665,
 'gnb': 0.5909822866344605,
 'knn': 0.6044015029522276,
 'svc_rbf': 0.6253354804079442,
 'random_forest': 0.6312399355877617}