# Student Exam Performance

## Package Import and Configuration

In [0]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, train_test_split

## Data Import and Preprocessing


In [0]:
# Load the exam data set. The file is read with header=None, i.e. its first
# line is treated as data, and the column names are supplied to the reader.
exam_passing = pd.read_csv(
    'https://github.com/saschaschworm/dsb/raw/master/Data%20Sets/Demos%20and%20Exercises/exam_passing/exam_passing.csv',
    header=None,
    names=['hours_studied', 'hours_slept', 'passed'],
)

### Data Preview

In [3]:
# Show the first five rows as a quick sanity check of the loaded data.
exam_passing.head(n=5)

Unnamed: 0,hours_studied,hours_slept,passed
0,4.855064,9.639962,1
1,8.62544,0.058927,0
2,3.828192,0.723199,0
3,7.150955,3.89942,1
4,6.4779,8.198181,1


### Data Preprocessing

There is no data preprocessing step in this example. The only thing we do here is transform the features into a matrix $X$ and the target variable into a vector $y$.

In [0]:
# Feature matrix X: one row per record, columns in the order
# [hours_studied, hours_slept]. Later predictions must use the same order.
X = exam_passing[['hours_studied', 'hours_slept']].values

# Target vector y: the values of the `passed` column.
y = exam_passing['passed'].values

## Modelling

### Training with Logistic Regression with Stochastic Gradient Descent

In [5]:
# Seed NumPy's global random number generator; the estimator below is created
# without an explicit random_state, so this seed makes the fit repeatable.
np.random.seed(1909)

# Initialize the logistic regression model (SGD with logistic loss).
# - eta0 is only used by the 'constant', 'invscaling' and 'adaptive' learning
#   rate schedules; the learning_rate argument is left at its default here.
# - NOTE(review): newer scikit-learn releases name this loss 'log_loss' and
#   eventually drop the 'log' alias; confirm the installed version before
#   renaming.
logistic_model = SGDClassifier(
    loss='log',
    eta0=0.001,
    alpha=0.03,
    max_iter=1000,
)

# Learn the model parameters from the complete data set.
logistic_model.fit(X, y)

SGDClassifier(alpha=0.03, average=False, class_weight=None, epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

### Prediction

In [0]:
# Predict for a student who has slept 20 hours and studied 8 hours.
# The model was fitted on features in the column order
# [hours_studied, hours_slept], so the query row must follow the same order:
# 8 hours studied first, 20 hours slept second.
student_features = [[8, 20]]

# Hard class label and class probabilities for that single student.
y_pred = logistic_model.predict(student_features)
y_pred_probabilities = logistic_model.predict_proba(student_features)

### Result

In [7]:
# Header line describing the student the prediction was made for.
print('Exam performance for a student who has been sleeping 20 hours and has been learning 8 hours:')

# Predicted class, shown as a boolean (truthy label means "passed").
print(f'- Passing: {bool(y_pred[0])}')

# Predicted class probabilities in percent. Column 0 is reported as the
# probability of failing and column 1 as the probability of passing
# (assumes the labels are 0 = failed, 1 = passed).
print(f'- Probability on failing: {y_pred_probabilities[0][0] * 100:.2f}%')
print(f'- Probability on passing: {y_pred_probabilities[0][1] * 100:.2f}%')

Exam performance for a student has been sleeping 20 hours and has been learning 8 hours:
- Passing: True
- Probability on failing: 0.00%
- Probabilitiy on passing: 100.00%


## Evaluation

### Holdout Method

#### Training on Training Set with Logistic Regression with Stochastic Gradient Descent

In [8]:
# Seed NumPy's global random number generator. Neither the split below nor the
# estimator is given an explicit random_state, so this seed is what makes the
# cell repeatable; the statement order (seed, split, fit) therefore matters.
np.random.seed(1909)

# Hold out 25% of the rows for testing and keep the remaining 75% for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the previously created logistic_model on the training portion only.
logistic_model.fit(X_train, y_train)

SGDClassifier(alpha=0.03, average=False, class_weight=None, epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

#### Prediction on Test Set

In [0]:
# Score the held-out features: class probabilities first, then hard labels.
y_pred_probabilities = logistic_model.predict_proba(X_test)
y_pred = logistic_model.predict(X_test)

#### Result

In [10]:
# Compare the test-set predictions with the true labels using four measures,
# evaluated in the order accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each measure as a percentage with two decimals.
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1: {f1 * 100:.2f}%')

Accuracy: 92.00%
Precision: 100.00%
Recall: 81.82%
F1: 90.00%


### 10-Fold-Cross-Validation

#### Algorithm

In [0]:
# Seed NumPy's global random number generator. KFold is created without
# shuffling, so the seed only influences the SGD training runs in the loop.
np.random.seed(1909)

# Initialize 10-fold cross-validation (folds are taken in row order).
k_fold = KFold(n_splits=10)

# Lists collecting the performance measures calculated in each iteration.
accuracies = []
precisions = []
recalls = []
f1s = []

for train_idx, test_idx in k_fold.split(X):
  # Split the data set into the current training and test fold.
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]

  # Train a fresh logistic regression model on the current training fold.
  logistic_model = SGDClassifier(loss='log', eta0=0.001, alpha=0.03, max_iter=1000)
  logistic_model.fit(X_train, y_train)

  # Predict the labels of the current test fold. Class probabilities are not
  # needed by any of the measures below, so they are not computed.
  y_pred = logistic_model.predict(X_test)

  # Calculate the performance measures on the current test fold.
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)

  # Append the performance measures to their lists.
  accuracies.append(accuracy)
  precisions.append(precision)
  recalls.append(recall)
  f1s.append(f1)

#### Result

In [12]:
# Report the mean of each per-fold measure as a percentage, one per line.
print('\n'.join([
    f'Average accuracy: {np.mean(accuracies) * 100:.2f}%',
    f'Average precision: {np.mean(precisions) * 100:.2f}%',
    f'Average recall: {np.mean(recalls) * 100:.2f}%',
    f'Average F1: {np.mean(f1s) * 100:.2f}%',
]))

Average accuracy: 88.00%
Average precision: 87.44%
Average recall: 89.67%
Average F1: 87.72%
