
## Supervised Learning
## Building a Student Intervention System

## Exploring the Data


In [1]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Read student data
student_data = pd.read_csv("student-data.csv")
print("Student data read successfully!")

Student data read successfully!


## Data Preprocessing

In [2]:
# TODO: Calculate number of students
n_students = len(student_data)

# TODO: Calculate number of features
# Don't count label column
n_features = len(student_data.iloc[0]) - 1

# TODO: Calculate passing students
n_passed = len(student_data[student_data['passed'] == 'yes'])

# TODO: Calculate failing students
n_failed = len(student_data[student_data['passed'] == 'no'])

# TODO: Calculate graduation rate
grad_rate = float(n_passed)/n_students * 100

# Print the results
print("Total number of students （number of datapoints): {}".format(n_students))
print("Number of features: {}".format(n_features))
print("Number of students who passed (graduates): {}".format(n_passed))
print("Number of students who failed (non-graduates): {}".format(n_failed))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students （number of datapoints): 395
Number of features: 30
Number of students who passed (graduates): 265
Number of students who failed (non-graduates): 130
Graduation rate of the class: 67.09%


In [3]:
student_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [4]:
# Experiment to see if `failures` are a good predictor of `passed`

student_data[['failures', 'passed']]
pp, pf, fp, ff = 0, 0, 0, 0
for i in range(len(student_data)):
    if student_data.iloc[i]['failures'] > 0:
        if student_data.iloc[i]['passed'] == 'no':
            ff += 1 # ff can be termed as failures = true and also fail = true
        else:
            fp += 1 # ff can be termed as failures = true and also pass = true
    else:
        if student_data.iloc[i]['passed'] == 'no':
            pf += 1 # ff can be termed as failures = false and also fail = true
        else:
            pp += 1 # ff can be termed as failures = false and also pass = true
print("pp: ", pp, "pf: ", pf, "fp: ", fp, "ff: ", ff)

# here exception can be person having no failures but failed in exam
# here exception can also be person having failures but passed the exam

pp:  234 pf:  78 fp:  31 ff:  52


## Preparing the Data

### Identify feature and target columns
It is often the case that the data you obtain contains non-numeric features. This can be a problem, as most machine learning algorithms expect numeric data to perform computations with.

In [5]:
# Extract feature columns
feature_cols = list(student_data.columns[:-1])

# Extract target column 'passed'
target_col = student_data.columns[-1] 

# Show the list of columns
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = student_data[feature_cols]
y_all = student_data[target_col]

# Show the feature information by printing the first five rows
print("\nFeature values:")
print(X_all.head())

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  higher internet  romantic  famrel  freetime goout Dalc Walc health absences  
0    yes       no        no       4         3     4    1    1     

### Preprocess Feature Columns




In [6]:
def preprocess_features(X):
    ''' Preprocesses the student data and converts non-numeric binary variables into
        binary (0/1) variables. Converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():
        
        # If data type is non-numeric, replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))
X_all.shape

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


(395, 48)

In [7]:
X_all.head()

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,age,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,1,0,1,0,18,0,1,1,0,1,...,1,0,0,4,3,4,1,1,3,6
1,1,0,1,0,17,0,1,1,0,0,...,1,1,0,5,3,3,1,1,3,4
2,1,0,1,0,15,0,1,0,1,0,...,1,1,0,4,3,2,2,3,3,10
3,1,0,1,0,15,0,1,1,0,0,...,1,1,1,3,2,2,1,1,5,2
4,1,0,1,0,16,0,1,1,0,0,...,1,0,0,4,3,2,1,2,5,4


### Implementation: Training and Testing Data Split


In [8]:
# TODO: Import any additional functionality you may need here
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# TODO: Set the number of training points
num_train = 300

# Set the number of testing points
num_test = X_all.shape[0] - num_train

# TODO: Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=num_train, test_size=num_test)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 300 samples.
Testing set has 95 samples.


## Training and Evaluating Models


- Random Forest
- K-Nearest Neighbors (KNeighbors)
- Support Vector Machines (SVM)
- Logistic Regression

In [9]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return (f1_score(target.values, y_pred, pos_label='yes'), accuracy_score(target.values,y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    f1_train, acc_train = predict_labels(clf, X_train, y_train)
    f1_test, acc_test = predict_labels(clf, X_test, y_test)
    # Print the results of prediction for both training and testing
    print("F1 score for training set: {:.4f}.".format(f1_train))
    print("F1 score for test set: {:.4f}.".format(f1_test))
    print("acc score for training set: {:.4f}.".format(acc_train))
    print("acc score for test set: {:.4f}.".format(acc_test))
    print("\n")

In [10]:
from sklearn.ensemble import RandomForestClassifier
clf_A = RandomForestClassifier(max_depth=3, n_estimators=21, random_state=1)

In [11]:
X_train_100 = X_train[:100]
y_train_100 = y_train[:100]

X_train_200 = X_train[:200]
y_train_200 = y_train[:200]

X_train_300 = X_train
y_train_300 = y_train
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
        train_predict(clf_A, j[0], j[1], X_test, y_test)

# train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a RandomForestClassifier using a training set size of 100. . .
Trained model in 0.1334 seconds
Made predictions in 0.0286 seconds.
Made predictions in 0.0135 seconds.
F1 score for training set: 0.8806.
F1 score for test set: 0.7586.
acc score for training set: 0.8400.
acc score for test set: 0.6316.


Training a RandomForestClassifier using a training set size of 200. . .
Trained model in 0.2146 seconds
Made predictions in 0.0085 seconds.
Made predictions in 0.0095 seconds.
F1 score for training set: 0.8258.
F1 score for test set: 0.7973.
acc score for training set: 0.7300.
acc score for test set: 0.6842.


Training a RandomForestClassifier using a training set size of 300. . .
Trained model in 0.1750 seconds
Made predictions in 0.0105 seconds.
Made predictions in 0.0211 seconds.
F1 score for training set: 0.8310.
F1 score for test set: 0.8082.
acc score for training set: 0.7233.
acc score for test set: 0.7053.




In [12]:
from sklearn.linear_model import LogisticRegression

clf_C = LogisticRegression(solver='liblinear', random_state=1)
X_train_100 = X_train[:100]
y_train_100 = y_train[:100]

X_train_200 = X_train[:200]
y_train_200 = y_train[:200]

X_train_300 = X_train
y_train_300 = y_train

# TODO: Execute the 'train_predict' function for each classifier and each training set size
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_C, j[0], j[1], X_test, y_test)


Training a LogisticRegression using a training set size of 100. . .
Trained model in 0.0060 seconds
Made predictions in 0.0020 seconds.
Made predictions in 0.0025 seconds.
F1 score for training set: 0.9194.
F1 score for test set: 0.6783.
acc score for training set: 0.9000.
acc score for test set: 0.6105.


Training a LogisticRegression using a training set size of 200. . .
Trained model in 0.0055 seconds
Made predictions in 0.2186 seconds.
Made predictions in 0.0015 seconds.
F1 score for training set: 0.8603.
F1 score for test set: 0.8000.
acc score for training set: 0.8100.
acc score for test set: 0.7368.


Training a LogisticRegression using a training set size of 300. . .
Trained model in 0.0095 seconds
Made predictions in 0.0065 seconds.
Made predictions in 0.0035 seconds.
F1 score for training set: 0.8468.
F1 score for test set: 0.8154.
acc score for training set: 0.7733.
acc score for test set: 0.7474.




In [13]:
from xgboost import XGBClassifier
clf_B = XGBClassifier(max_depth=5,booster='gblinear', feature_selector='shuffle')
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_B, j[0], j[1], X_test, y_test)

Training a XGBClassifier using a training set size of 100. . .
Trained model in 0.0115 seconds
Made predictions in 0.0015 seconds.
Made predictions in 0.0030 seconds.
F1 score for training set: 0.7632.
F1 score for test set: 0.8054.
acc score for training set: 0.6400.
acc score for test set: 0.6947.


Training a XGBClassifier using a training set size of 200. . .
Trained model in 0.0346 seconds
Made predictions in 0.0030 seconds.
Made predictions in 0.0025 seconds.
F1 score for training set: 0.7815.
F1 score for test set: 0.7871.
acc score for training set: 0.6450.
acc score for test set: 0.6526.


Training a XGBClassifier using a training set size of 300. . .
Trained model in 0.0361 seconds
Made predictions in 0.0030 seconds.
Made predictions in 0.0015 seconds.
F1 score for training set: 0.8080.
F1 score for test set: 0.7821.
acc score for training set: 0.6800.
acc score for test set: 0.6421.




In [14]:
from sklearn.svm import SVC
clf_D = SVC(random_state=1, kernel='poly')
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_D, j[0], j[1], X_test, y_test)


Training a SVC using a training set size of 100. . .
Trained model in 0.0075 seconds
Made predictions in 0.0035 seconds.
Made predictions in 0.0030 seconds.
F1 score for training set: 0.7662.
F1 score for test set: 0.7843.
acc score for training set: 0.6400.
acc score for test set: 0.6526.


Training a SVC using a training set size of 200. . .
Trained model in 0.0070 seconds
Made predictions in 0.0050 seconds.
Made predictions in 0.0025 seconds.
F1 score for training set: 0.7938.
F1 score for test set: 0.7922.
acc score for training set: 0.6650.
acc score for test set: 0.6632.


Training a SVC using a training set size of 300. . .
Trained model in 0.0140 seconds
Made predictions in 0.0080 seconds.
Made predictions in 0.0025 seconds.
F1 score for training set: 0.8127.
F1 score for test set: 0.7922.
acc score for training set: 0.6867.
acc score for test set: 0.6632.




In [15]:
from sklearn.neighbors import KNeighborsClassifier
clf_E = KNeighborsClassifier()
for j in [(X_train_100, y_train_100), (X_train_200, y_train_200), (X_train_300, y_train_300)]:
    train_predict(clf_E, j[0], j[1], X_test, y_test)

Training a KNeighborsClassifier using a training set size of 100. . .
Trained model in 0.0040 seconds
Made predictions in 0.0125 seconds.
Made predictions in 0.0105 seconds.
F1 score for training set: 0.7971.
F1 score for test set: 0.7883.
acc score for training set: 0.7200.
acc score for test set: 0.6947.


Training a KNeighborsClassifier using a training set size of 200. . .
Trained model in 0.0050 seconds
Made predictions in 0.0241 seconds.
Made predictions in 0.0120 seconds.
F1 score for training set: 0.8451.
F1 score for test set: 0.8085.
acc score for training set: 0.7800.
acc score for test set: 0.7158.


Training a KNeighborsClassifier using a training set size of 300. . .
Trained model in 0.0035 seconds
Made predictions in 0.0221 seconds.
Made predictions in 0.0065 seconds.
F1 score for training set: 0.8739.
F1 score for test set: 0.7737.
acc score for training set: 0.8133.
acc score for test set: 0.6737.




PARAMETER TUNING 

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return (f1_score(target.values, y_pred, pos_label='yes'), accuracy_score(target.values,y_pred))

parameters = {'max_depth':[1,2,3,4,5], 
              'min_samples_leaf':[1,2,3,4,5], 
              'min_samples_split':[2,3,4,5],
              'n_estimators':[19, 20, 21,22]}

clf = RandomForestClassifier()
f1_scorer = make_scorer(f1_score, pos_label='yes')

grid_obj = GridSearchCV(clf, parameters, scoring=f1_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

In [25]:
clf_opt = grid_obj.best_estimator_

In [26]:
clf_opt

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
f1_train, acc_train = predict_labels(clf_opt, X_train, y_train)
f1_test, acc_test = predict_labels(clf_opt, X_test, y_test)
# Print the results of prediction for both training and testing
print("F1 score for training set: {:.4f}.".format(f1_train))
print("F1 score for test set: {:.4f}.".format(f1_test))
print("acc score for training set: {:.4f}.".format(acc_train))
print("acc score for test set: {:.4f}.".format(acc_test))
print("\n")

Made predictions in 0.0155 seconds.
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8337.
F1 score for test set: 0.7945.
acc score for training set: 0.7300.
acc score for test set: 0.6842.




In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    score = clf.score(features, target.values)
    end = time()
    print("Score: ", score)
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


# TODO: Create the parameters list you wish to tune
parameters = { "penalty":["l2","l1"], 
              # "tol":[0.00001, 0.0001, 0.001, 0.1, 1], 
               "C":[1,10,100,1000],
              }

# TODO: Initialize the classifier
clf = LogisticRegression()

# TODO: Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score, pos_label='yes')

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=f1_scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)
print(grid_obj)
# Get the estimator
clf = grid_obj.best_estimator_
print(clf)

# Report the final F1 score for training and testing after parameter tuning
print("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l2', 'l1']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(f1_score, pos_label=yes), verbose=0)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=Non

FOR MORE DETAILS VISIT: https://github.com/Dheeraj-1999/student-intervention-system