
## Supervised Learning
## Building a Student Intervention System

## Exploring the Data


In [1]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score

# Read student data
student_data = pd.read_csv("student-data.csv")
print("Student data read successfully!")

Student data read successfully!


## Data Preprocessing

In [2]:
# TODO: Calculate number of students
n_students = len(student_data)

# TODO: Calculate number of features
# Don't count label column
n_features = len(student_data.iloc[0]) - 1

# TODO: Calculate passing students
n_passed = len(student_data[student_data['passed'] == 'yes'])

# TODO: Calculate failing students
n_failed = len(student_data[student_data['passed'] == 'no'])

# TODO: Calculate graduation rate
grad_rate = float(n_passed)/n_students * 100

# Print the results
print("Total number of students （number of datapoints): {}".format(n_students))
print("Number of features: {}".format(n_features))
print("Number of students who passed (graduates): {}".format(n_passed))
print("Number of students who failed (non-graduates): {}".format(n_failed))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students （number of datapoints): 395
Number of features: 30
Number of students who passed (graduates): 265
Number of students who failed (non-graduates): 130
Graduation rate of the class: 67.09%


In [3]:
student_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [4]:
# Experiment to see if `failures` are a good predictor of `passed`

student_data[['failures', 'passed']]
pp, pf, fp, ff = 0, 0, 0, 0
for i in range(len(student_data)):
    if student_data.iloc[i]['failures'] > 0:
        if student_data.iloc[i]['passed'] == 'no':
            ff += 1 # ff can be termed as failures = true and also fail = true
        else:
            fp += 1 # ff can be termed as failures = true and also pass = true
    else:
        if student_data.iloc[i]['passed'] == 'no':
            pf += 1 # ff can be termed as failures = false and also fail = true
        else:
            pp += 1 # ff can be termed as failures = false and also pass = true
print("pp: ", pp, "pf: ", pf, "fp: ", fp, "ff: ", ff)

# here exception can be person having no failures but failed in exam
# here exception can also be person having failures but passed the exam

pp:  234 pf:  78 fp:  31 ff:  52


## Preparing the Data

### Identify feature and target columns
It is often the case that the data you obtain contains non-numeric features. This can be a problem, as most machine learning algorithms expect numeric data to perform computations with.

In [5]:
# Extract feature columns
feature_cols = list(student_data.columns[:-1])

# Extract target column 'passed'
target_col = student_data.columns[-1] 

# Show the list of columns
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = student_data[feature_cols]
y_all = student_data[target_col]

# Show the feature information by printing the first five rows
print("\nFeature values:")
print(X_all.head())

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  higher internet  romantic  famrel  freetime goout Dalc Walc health absences  
0    yes       no        no       4         3     4    1    1     

### Preprocess Feature Columns




In [7]:
def preprocess_features(X):
    ''' Preprocesses the student data and converts non-numeric binary variables into
        binary (0/1) variables. Converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():
        
        # If data type is non-numeric, replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))
X_all.shape

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


(395, 48)

### Implementation: Training and Testing Data Split


In [8]:
# TODO: Import any additional functionality you may need here
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# TODO: Set the number of training points
num_train = 300

# Set the number of testing points
num_test = X_all.shape[0] - num_train

# TODO: Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=num_train, test_size=num_test)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 300 samples.
Testing set has 95 samples.


## Training and Evaluating Models


- Random Forest
- K-Nearest Neighbors (KNeighbors)
- Support Vector Machines (SVM)
- Logistic Regression

In [9]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print("F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test)))
    print("\n")

In [12]:
from sklearn.ensemble import RandomForestClassifier
clf_A = RandomForestClassifier(random_state=0)

In [17]:
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
        train_predict(clf_A, j[0], j[1], X_test, y_test)

# train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a RandomForestClassifier using a training set size of 100. . .
Trained model in 0.0180 seconds
Made predictions in 0.0035 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0035 seconds.
F1 score for test set: 0.7407.


Training a RandomForestClassifier using a training set size of 200. . .
Trained model in 0.0186 seconds
Made predictions in 0.0020 seconds.
F1 score for training set: 0.9964.
Made predictions in 0.0020 seconds.
F1 score for test set: 0.6719.


Training a RandomForestClassifier using a training set size of 300. . .
Trained model in 0.0191 seconds
Made predictions in 0.0035 seconds.
F1 score for training set: 0.9805.
Made predictions in 0.0050 seconds.
F1 score for test set: 0.7536.




In [24]:
from sklearn.linear_model import LogisticRegression

clf_B = LogisticRegression(random_state=0)
X_train_100 = X_train[:100]
y_train_100 = y_train[:100]

X_train_200 = X_train[:200]
y_train_200 = y_train[:200]

X_train_300 = X_train
y_train_300 = y_train

# TODO: Execute the 'train_predict' function for each classifier and each training set size
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_C, j[0], j[1], X_test, y_test)


Training a LogisticRegression using a training set size of 100. . .
Trained model in 0.0125 seconds
Made predictions in 0.0025 seconds.
F1 score for training set: 0.9371.
Made predictions in 0.0030 seconds.
F1 score for test set: 0.6875.


Training a LogisticRegression using a training set size of 200. . .
Trained model in 0.0070 seconds
Made predictions in 0.0030 seconds.
F1 score for training set: 0.8808.
Made predictions in 0.0015 seconds.
F1 score for test set: 0.7445.


Training a LogisticRegression using a training set size of 300. . .
Trained model in 0.0075 seconds
Made predictions in 0.0015 seconds.
F1 score for training set: 0.8462.
Made predictions in 0.0020 seconds.
F1 score for test set: 0.7681.






In [25]:
from sklearn.svm import SVC
clf_C = SVC(random_state=0)
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_C, j[0], j[1], X_test, y_test)


Training a SVC using a training set size of 100. . .
Trained model in 2.8749 seconds
Made predictions in 0.0211 seconds.
F1 score for training set: 0.8961.
Made predictions in 0.0035 seconds.
F1 score for test set: 0.7703.


Training a SVC using a training set size of 200. . .
Trained model in 0.0085 seconds
Made predictions in 0.0040 seconds.
F1 score for training set: 0.8797.
Made predictions in 0.0035 seconds.
F1 score for test set: 0.7919.


Training a SVC using a training set size of 300. . .
Trained model in 0.0287 seconds
Made predictions in 0.0130 seconds.
F1 score for training set: 0.8723.
Made predictions in 0.0055 seconds.
F1 score for test set: 0.7867.






In [26]:
from sklearn.neighbors import KNeighborsClassifier
clf_D = KNeighborsClassifier()
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_C, j[0], j[1], X_test, y_test)

Training a SVC using a training set size of 100. . .
Trained model in 0.0075 seconds
Made predictions in 0.0025 seconds.
F1 score for training set: 0.8961.




Made predictions in 0.3766 seconds.
F1 score for test set: 0.7703.


Training a SVC using a training set size of 200. . .
Trained model in 0.0536 seconds
Made predictions in 0.0045 seconds.
F1 score for training set: 0.8797.
Made predictions in 0.0035 seconds.
F1 score for test set: 0.7919.


Training a SVC using a training set size of 300. . .
Trained model in 0.0630 seconds
Made predictions in 0.0090 seconds.
F1 score for training set: 0.8723.
Made predictions in 0.0091 seconds.
F1 score for test set: 0.7867.




