In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.preprocessing import StandardScaler

# Data Preprocessing

In [2]:
student_data = pd.read_csv('./data.csv', delimiter=';') #reading the csv through excel showed me that the values were seperated by ';'
#because of that I have to specify the delimiter as ';' to prevent all the data entries from being stored as one feature

### Checking for Null Values

In [3]:
empty_vals = pd.DataFrame(student_data.isnull().sum(), columns=['Is null'])

In [4]:
empty_vals.sum()

Is null    0
dtype: int64

As we can see above, the authors of this dataset have already imputed missing values so there is nothing to fix in that regard

### Encode the target label
Here we are simpling converting the target to a numerical value, rather than 'Dropped', 'Enrolled', or 'Graduate'

In [5]:
X, Y = student_data.iloc[:,:-1], student_data.iloc[:,-1:]

In [6]:
X = X.to_numpy()
Y = Y.to_numpy()
Y = Y.ravel()

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()
Y = le.fit_transform(Y)
print(Y)

[0 2 0 ... 0 2 2]


In [9]:
num_dropped = np.count_nonzero(Y==0)
num_enrolled = np.count_nonzero(Y==1)
num_graduated = np.count_nonzero(Y==2)
print(f"Number that graduated: {num_graduated}")
print(f"Number that were still enrolled: {num_enrolled}")
print(f"Number that dropped: {num_dropped}")

Number that graduated: 2209
Number that were still enrolled: 794
Number that dropped: 1421


### Normalize the data

In [10]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Evaluating the Models

In [11]:
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)
svc_linear = SVC(kernel='linear', max_iter=1000000)
svc_poly = SVC(kernel='poly')
svc_radial = SVC(kernel='rbf')

In [13]:
svc_linear.fit(x_train, y_train)
svc_poly.fit(x_train, y_train)
svc_radial.fit(x_train, y_train)
print(f"Linear kernel score: {svc_linear.score(x_test, y_test)}")
print(f"Polynomial kernel score: {svc_poly.score(x_test, y_test)}")
print(f"Radial (rbf) kernel score: {svc_radial.score(x_test, y_test)}")

Linear kernel score: 0.7902350813743219
Polynomial kernel score: 0.7079566003616636
Radial (rbf) kernel score: 0.7766726943942134


### Cross validation

In [14]:
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
#this combines the stratified KFold with the ShuffleSplit

In [15]:
sss = StratifiedShuffleSplit(test_size=0.4)
models=[SVC(kernel='linear'), SVC(kernel='poly'), SVC(kernel='rbf')]

In [16]:
#credit to chatgpt for the code below
for model in models:
    scores = cross_val_score(model, x_train, y_train, cv=sss, scoring='accuracy')
    print(f"{model.kernel}: {scores.mean():.4f} accuracy")

linear: 0.7566 accuracy
poly: 0.6995 accuracy
rbf: 0.7448 accuracy


We can see that overall, the linear model is more accurate than the polynomial or rbf kernels. However, the difference in accuracy between the linear and rbf kernels is small enough that we will still consider the rbf kernel for further testing (it may turn out that rbf can outperform the linear kernel given the right hyper parameters).


Lets also check the confusion matrices of the rbf and linear models:

In [17]:
from sklearn.metrics import confusion_matrix

svc_radial.fit(x_train, y_train)
svc_linear.fit(x_train, y_train)

radial_predictions = svc_radial.predict(x_test)
linear_predictions = svc_linear.predict(x_test)

radial_confusion = confusion_matrix(radial_predictions, y_test)
linear_confusion = confusion_matrix(linear_predictions, y_test)

In [18]:
print("radial confusion matrix:")
print(radial_confusion)
print()
print("linear confusion matrix:")
print(linear_confusion)

radial confusion matrix:
[[266  36  13]
 [ 41  76  11]
 [ 62  84 517]]

linear confusion matrix:
[[274  41  10]
 [ 39  80  11]
 [ 56  75 520]]


With a confusion matrix, the diagonal tells us the number of accurate labels for each class. It looks like the middle column (which is the students who are still enrolled) is biggest barrier to accuracy

## Optimizing Hyperparameters
Lets first try to see how much we can improve accuracy just by adjusting the hyperparameters

In [19]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
#chatGPT wrote the following  (and the code in the next cell)
from scipy.stats import uniform, randint
param_dist_svc = {
    'C': uniform(0.1, 10),            # Penalty parameter C of the error term
    'kernel': ['linear'],  # Kernel type
    'gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 7)),  # Kernel coefficient for 'rbf'
    # 'degree': randint(2, 5),           # Degree of the polynomial kernel ('poly')
    'coef0': uniform(-1, 1)            # Independent term in kernel function
}


In [21]:
svm = SVC()

random_search = RandomizedSearchCV(svm, param_distributions=param_dist_svc, n_iter=25, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

# Fit the model
random_search.fit(x_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
test_accuracy = best_model.score(x_test, y_test)
print("Test Accuracy:", test_accuracy)

Best Hyperparameters: {'C': 5.804439744053994, 'coef0': -0.47916573997417633, 'gamma': 100.0, 'kernel': 'linear'}
Test Accuracy: 0.786618444846293


In [29]:
model = SVC(kernel='linear', gamma=100, C=5)
model.fit(x_train, y_train)

In [30]:
model.score(x_test, y_test)

0.7875226039783002

# Iteration
One thing we notice is that this dataset has three target classes. 
The extra target class is 'enrolled', which is somewhat irrelevant to predicting student outcomes. Also, there is not a clear intuitive correlation between enrollment status and the data features, since a student who is enrolled can be very successful or on the verge of dropping.

For that reason, we test the models *without* the enrollment class

### Removing extra class


In [31]:
student_data_binary = student_data[student_data['Target']!='Enrolled']
# student_data_binary.iloc[:20,-1]

Here we are just preparing the data in the same way we did above. The only difference now is that we are ignoring the 'enrolled' students, since they interfere with out models.

In [46]:
X_binary, Y_binary = student_data_binary.iloc[:,:-1], student_data_binary.iloc[:,-1:]
X_binary = X_binary.to_numpy()
Y_binary = Y_binary.to_numpy()
Y_binary = Y_binary.ravel()
Y_binary = le.fit_transform(Y_binary)

scaler = StandardScaler()
X_binary = scaler.fit_transform(X_binary)

In [47]:
x_train_binary, x_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, Y_binary)
svc_linear_binary = SVC(kernel='linear', max_iter=1000000)
svc_radial_binary = SVC(kernel='rbf')

In [34]:
svc_linear_binary.fit(x_train_binary, y_train_binary)
svc_radial_binary.fit(x_train_binary, y_train_binary)
print(f"Linear scores on binary data: {svc_linear_binary.score(x_test_binary, y_test_binary)}")
print(f"Radial scores on binary data: {svc_radial_binary.score(x_test_binary, y_test_binary)}")

Linear scores on binary data: 0.9030837004405287
Radial scores on binary data: 0.9008810572687225


As we can see, these scores are significantly higher than when we had 3 classes.
This confirms our earlier prediction that there is not a strong correlation between the data features and whether a student is enrolled.

### Combining

One idea is to use ensemble voting to improve the accuracy. Since the models are much better when they do not have to classify enrolled students, it could make sense to have the binary classifiers take the lead in deciding whether a student graduated or dropped, and then have the three-way classifiers jump in when they predict a student is enrolled.

It is not clear how to accomplish this with a normal VotingClassifier from scikit-learn, so here define one of our own. 

In [37]:
svc_linear = SVC(kernel='linear', probability=True, max_iter=1000000)
svc_radial = SVC(kernel='rbf', probability=True, C=4)
svc_linear_binary = SVC(kernel='linear', max_iter=1000000, probability=True)
svc_radial_binary = SVC(kernel='rbf')
svc_linear.fit(x_train, y_train)
svc_linear_binary.fit(x_train_binary, y_train_binary)

In [39]:
def classifier(linear_binary_classifier, linear_nonbinary_classifier, radial_nonbinary_classifier, input_data):
    linear_binary_pred = linear_binary_classifier.predict(input_data)
    linear_nonbinary_pred = linear_nonbinary_classifier.predict(input_data)

    linear_nonbinary_prob = linear_nonbinary_classifier.predict_proba(input_data)
    radial_nonbinary_prob = radial_nonbinary_classifier.predict_proba(input_data)
    ans = np.copy(linear_binary_pred)
    for i in range(len(ans)):
        if ans[i]==1:
            ans[i]=2
        if (linear_nonbinary_prob[i,1]>0.4 and radial_nonbinary_prob[i,1]>0.4):
            ans[i]=1
    return ans

In [40]:
svc_linear.fit(x_train, y_train)
svc_linear_binary.fit(x_train_binary, y_train_binary)
svc_radial.fit(x_train, y_train)
predictions = classifier(svc_linear_binary, svc_linear, svc_radial, x_test)

print(predictions)
print(confusion_matrix(predictions, y_test))

[2 2 2 ... 0 2 2]
[[288  51  13]
 [ 27  62   3]
 [ 54  83 525]]


In [41]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7911392405063291


### Analysis:
As we can see, using extra, more sophisticated classifiers to single out the 'enrolled' students is not that successful.
The best thing to try would be a one-vs-rest approach (ovr) to first classify whether or not a student is enrolled, and then (if they are not enrolled) clsasify them as graduate vs dropped.
For the time being however, we will use the data that has been binarized and use it for further analysis.


# Optimizing Hyperparameters
Now that we have a better sense of how we want to classify the data, we are going to again optimize our hyperparameteres, only this time we will be looking for different results. Also, we will use be using Grid Search.
(the next several cells were done by chatGPT)

In [56]:
X_binary, Y_binary = student_data_binary.iloc[:,:-1], student_data_binary.iloc[:,-1:]
X_binary = X_binary.to_numpy()
Y_binary = Y_binary.to_numpy()
Y_binary = Y_binary.ravel()
Y_binary = le.fit_transform(Y_binary)

scaler = StandardScaler()
X_binary = scaler.fit_transform(X_binary)

In [57]:
x_train_binary, x_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, Y_binary)
svc_linear_binary = SVC(kernel='linear', max_iter=-1)
svc_radial_binary = SVC(kernel='rbf')

Optimizing the linear models parameters:

In [67]:
from sklearn.model_selection import GridSearchCV

linear_param_grid = {'C': [3, 4, 5, 6, 8, 10, 12, 15]} #the range of values to offer C was decided through a lot of iteration
linear_grid_search = GridSearchCV(estimator=svc_linear_binary, param_grid=linear_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
linear_grid_search.fit(x_train_binary, y_train_binary)

In [68]:
# Get the best parameters from the grid search
best_params = linear_grid_search.best_params_

# Use the best parameters to train the final model
final_linear_model = SVC(kernel='linear', C=best_params['C'])
final_linear_model.fit(x_train_binary, y_train_binary)

# Make predictions on the test set
y_pred = final_linear_model.predict(x_test_binary)

# Evaluate the accuracy
accuracy = accuracy_score(y_test_binary, y_pred)
print(f"Accuracy: {accuracy}")
print("Best Hyperparameters:", best_params)

Accuracy: 0.920704845814978
Best Hyperparameters: {'C': 6}


Optimizing the radial parameters:

In [81]:
# Define the parameter grid to search
radial_param_grid = {'C': [20, 23, 24, 25, 27], 'gamma': [0.001, 0.0001, 0.0008, 0.0005, 0.05, 0.005, 0.01]} #the range of the hyperparameters was determined through iteration
grid_search_rbf = GridSearchCV(estimator=svc_radial_binary, param_grid=radial_param_grid, cv=4, scoring='accuracy', n_jobs=-1)
grid_search_rbf.fit(x_train_binary, y_train_binary)

In [82]:
# Get the best parameters from the grid search
best_params_rbf = grid_search_rbf.best_params_

# Use the best parameters to train the final model
final_model_rbf = SVC(kernel='rbf', C=best_params_rbf['C'], gamma=best_params_rbf['gamma'])
final_model_rbf.fit(x_train_binary, y_train_binary)

# Make predictions on the test set
y_pred_rbf = final_model_rbf.predict(x_test_binary)

# Evaluate the accuracy
accuracy_rbf = accuracy_score(y_test_binary, y_pred_rbf)
print(f"Accuracy with RBF kernel: {accuracy_rbf}")
print("Best Hyperparameters with RBF kernel:", best_params_rbf)

Accuracy with RBF kernel: 0.9162995594713657
Best Hyperparameters with RBF kernel: {'C': 25, 'gamma': 0.001}


# Ensembling

In [83]:
from sklearn.ensemble import VotingClassifier

In [94]:
svc_linear_binary = SVC(kernel='linear', C=6, max_iter=-1, probability=True)
svc_radial_binary = SVC(kernel='rbf', C=25, gamma=0.001, probability=True)
vc = VotingClassifier(estimators=[('linear', svc_linear_binary), ('rbf', svc_radial_binary)], voting='soft')
vc.fit(x_train_binary, y_train_binary)
vc.score(x_test_binary, y_test_binary)

0.9196035242290749

In [95]:
svc_linear_binary.fit(x_train_binary, y_train_binary)
svc_linear_binary.score(x_test_binary, y_test_binary)

0.920704845814978

The ensemble is not as good as a simple linear classifier. But there is still room for improvement!
## Bootstrap Aggregating
(credit to chatGPT)

In [108]:
num_models = 5

# Create and train linear SVM models on different subsets of the data
svm_models = [SVC(kernel='linear', random_state=i) for i in range(num_models)]

for i, model in enumerate(svm_models):
    # Create a subset of the training data (you can use different strategies for creating subsets)
    x_subset, _, y_subset, _ = train_test_split(x_train_binary, y_train_binary, test_size=0.2, random_state=i)
    
    # Train the model on the subset
    model.fit(x_subset, y_subset)

# Create a VotingClassifier with linear SVM models
voting_classifier = VotingClassifier(estimators=[('svm_model_'+str(i), model) for i, model in enumerate(svm_models)], voting='hard')

# Fit the ensemble model on the entire training set
voting_classifier.fit(x_train_binary, y_train_binary)

# Make predictions on the test set
y_pred = voting_classifier.predict(x_test_binary)

# Evaluate the accuracy
accuracy = accuracy_score(y_test_binary, y_pred)
print(f"Ensemble Accuracy: {accuracy}")

Ensemble Accuracy: 0.920704845814978
