# Student Exam Performance

## Package Import and Configuration

In [0]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold, train_test_split

In [0]:
np.set_printoptions(formatter={'float': lambda x: "{0:0.9f}".format(x)})

## Data Import and Preprocessing


In [0]:
exam_passing = pd.read_csv('https://github.com/saschaschworm/dsb/raw/master/Data%20Sets/Demos%20and%20Exercises/exam_passing/exam_passing.csv', header=None, names=['hours_studied', 'hours_slept', 'passed'])

### Data Preview

In [4]:
exam_passing.head()

Unnamed: 0,hours_studied,hours_slept,passed
0,4.855064,9.639962,1
1,8.62544,0.058927,0
2,3.828192,0.723199,0
3,7.150955,3.89942,1
4,6.4779,8.198181,1


### Data Preprocessing

There is not data preprocessing step in this example. The only thing we do here is to transform the features to a matrix $X$ and the target variable to a vector $y$.

In [0]:
X, y = exam_passing[['hours_studied', 'hours_slept']].values, exam_passing['passed'].values

## Model Training

### Logistic Regression with Stochastic Gradient Descent

In [6]:
# Set seed for "deterministic randomness".
np.random.seed(1909)

# Initialize the Logisitc Regression Model.
logistic_model = SGDClassifier(loss='log', eta0=0.001, alpha=0.03, max_iter=1000)

# Perform the learning.
logistic_model.fit(X, y)

SGDClassifier(alpha=0.03, average=False, class_weight=None, epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [7]:
# Predict class probabilities for a student who has slept 20 hours and has learnt 8 hours.
print(f'Class probabilities [0, 1]: {logistic_model.predict_proba([[20, 8]])}')

# Predict class probabilities for a student who has slept 20 hours and has learnt 8 hours.
print(f'Prediction: {logistic_model.predict([[20, 8]])}')

Class probabilities [0, 1]: [[0.000000003 0.999999997]]
Prediction: [1]


## Evaluation

### Initialization

In [0]:
# Initialize 10-Fold-Cross-Validation.
k_fold = KFold(n_splits=10)

# Empty list for persisting the precisions calculated in each iteration. 
precisions = []

### 10-Fold-Cross-Validation

In [0]:
# Set seed for "deterministic randomness".
np.random.seed(1909)

for train_idx, test_idx in k_fold.split(X):
  # Split dataset into a train and test set.
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  
  # Run the logistic regression on the current training set.
  logistic_model = SGDClassifier(loss='log', eta0=0.001, alpha=0.03, max_iter=1000)
  logistic_model.fit(X_train, y_train)
  
  # Calculate the precision on the current test set.
  precision = precision_score(logistic_model.predict(X_test), y_test)
  
  # Append to precision to list.
  precisions.append(precision)

### Result

In [10]:
print(f'Average precision: {np.mean(precisions) * 100:.2f}%')

Average precision: 89.67%
