
## Project: Building a Student Intervention System using Machine Learning

## Exploring the Data
Importing necessary libraries and reading our dataset

In [83]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score

# Read student data.
# Other CSVs that were tried in this cell (kept for reference):
#   /content/Demographic.csv, /content/Personal.csv, /content/Institutional.csv
data_path = "/content/student-data-mat-por.csv"
student_data = pd.read_csv(data_path)
print("Student data read successfully!")

Student data read successfully!


### Implementation: Data Exploration
Investigating the dataset to determine how many students we have information on, and learn about the graduation rate among these students. In the code cell below, we will compute the following:
- The total number of students, `n_students`.
- The total number of features for each student, `n_features`.
- The total number of students in maths course, `n_mat`.
- The total number of students in portuguese course, `n_por`.
- The number of those students who passed, `n_passed`.
- The number of those students who failed, `n_failed`.
- The graduation rate of the class, `grad_rate`, in percent (%).


In [84]:
# number of students
n_students = len(student_data.index)

# number of features, excluding the label column
n_features = len(student_data.columns) - 1

# passing students
n_passed = len(student_data[student_data['passed'] == "yes"])

# failing students
n_failed = len(student_data[student_data['passed'] == "no"])

# Per-course counts. These must be defined because the summary below prints
# them; leaving them commented out raises NameError on a fresh kernel.
# The 'course' column is only checked for so that the cell still runs if a
# dataset without that column is loaded instead.
has_course = 'course' in student_data.columns

# number of students in maths course
n_mat = len(student_data[student_data['course'] == "mat"]) if has_course else None

# number of students in portuguese course
n_por = len(student_data[student_data['course'] == "por"]) if has_course else None

# graduation rate
grad_rate = n_passed / float(n_students) * 100.0

print("Total number of students: {}".format(n_students))
print("Number of features: {}".format(n_features))
if has_course:
    print("Number of students in maths course: {}".format(n_mat))
    print("Number of students in portuguese course: {}".format(n_por))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

# Baseline: F1 obtained by predicting 'yes' for every student
print("\nF1 score for predicting all 'yes': {:.4f}".format(
    f1_score(y_true = ['yes']*n_passed + ['no']*n_failed, y_pred = ['yes']*n_students, pos_label='yes', average='binary')))

Total number of students: 1044
Number of features: 31
Number of students in maths course: 395
Number of students in portuguese course: 649
Number of students who passed: 814
Number of students who failed: 230
Graduation rate of the class: 77.97%

F1 score for predicting all 'yes': 0.8762


## Preparing the Data
Preparation of the data for modeling, training and testing.

### Identify feature and target columns
Separating the student data into feature and target columns to see if any features are non-numeric.

In [85]:
# The label is the last column; everything before it is a feature
all_columns = student_data.columns
feature_cols = all_columns[:-1].tolist()
target_col = all_columns[-1]

# Show which columns ended up on each side
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column:+ {}".format(target_col))

# Feature matrix (X_all) and label vector (y_all)
X_all = student_data.loc[:, feature_cols]
y_all = student_data[target_col]

# Peek at the first few feature rows
print("\nFeature values:")
print(X_all.head())

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'course']

Target column:+ passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  internet romantic  famrel  freetime  goout Dalc Walc health absences course  
0       no       no       4         3      4    1    1 

### Preprocess Feature Columns
Preprocessing the dataset to handle non-numeric columns, including categorical columns that have more than two values.

In [86]:
def preprocess_features(X):
    ''' Preprocess the student data into an all-numeric feature frame.

    String columns whose values are all 'yes'/'no' become a single 1/0 column.
    Every other string column is expanded into one 0/1 dummy column per
    distinct value, named "<column>_<value>". Columns of any other dtype are
    copied through unchanged.

    X: DataFrame of raw feature columns.
    Returns a new DataFrame sharing X's index; X itself is not modified. '''

    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # DataFrame.iteritems() was removed in pandas 2.0; items() is available
    # on both old and new versions.
    for col, col_data in X.items():

        # Only text columns need converting (object dtype, or the dedicated
        # string dtype used by newer pandas versions)
        if col_data.dtype == object or pd.api.types.is_string_dtype(col_data):
            if col_data.isin(['yes', 'no']).all():
                # Map explicitly instead of relying on replace() to downcast
                # the column to int: without that downcast the column would
                # stay non-numeric and be one-hot encoded below.
                col_data = col_data.map({'yes': 1, 'no': 0}).astype(int)
            else:
                # dtype=int keeps the dummies as 0/1 rather than True/False
                col_data = pd.get_dummies(col_data, prefix = col, dtype = int)

        # Collect the revised columns
        output = output.join(col_data)

    return output

# Replace the raw feature frame with its numeric / dummy-encoded version
X_all = preprocess_features(X_all)
processed_cols = list(X_all.columns)
print("Processed feature columns ({} total features):\n{}".format(len(processed_cols), processed_cols))

Processed feature columns (50 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'course_mat', 'course_por']


### Implementation: Training and Testing Data Split
Splitting the dataset into training and test sets.
Using approximately 800 training points and 200 testing points (an 80:20 ratio).


In [87]:
from sklearn.model_selection import train_test_split

# Fixed training-set size; whatever remains becomes the test set
num_train = 830
num_test = len(X_all) - num_train

# Shuffled, stratified split (same pass/fail proportions on both sides)
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    train_size=num_train,
    test_size=num_test,
    random_state=0,
    stratify=y_all,
)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 830 samples.
Testing set has 214 samples.


## Training and Evaluating Models
Selecting and evaluating three supervised learning models

- Gaussian Naive Bayes (GaussianNB)
- Decision Trees
- Support Vector Machines (SVM)

In [88]:
def train_classifier(clf, X_train, y_train):
    ''' Fit `clf` on the given training data and report the fitting time.

    Prints the elapsed time and returns it (seconds, as measured by time()). '''

    # Only the fit() call itself is timed
    began = time()
    clf.fit(X_train, y_train)
    elapsed = time() - began

    print("Trained model in {:.4f} seconds".format(elapsed))
    return elapsed

    
def predict_labels(clf, features, target):
    ''' Predict with an already-fitted classifier and score the predictions.

    Prints the prediction time and returns a tuple
    (F1 score with 'yes' as the positive label, prediction time in seconds). '''

    # Only the predict() call itself is timed
    began = time()
    y_pred = clf.predict(features)
    elapsed = time() - began

    print("Made predictions in {:.4f} seconds.".format(elapsed))
    score = f1_score(target.values, y_pred, pos_label='yes')
    return score, elapsed


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Fit a classifier, then score it (F1) on both the training and test sets.

    Returns (train_time, prediction_time_test, f1_train, f1_test). '''

    # Announce which model is being fitted and on how many samples
    model_name = clf.__class__.__name__
    print("Training a {} using a training set size of {}. . .".format(model_name, len(X_train)))

    # Fit first; both scoring passes below use the fitted model
    train_time = train_classifier(clf, X_train, y_train)

    # Score on the training data (its prediction time is measured but not returned)
    train_outcome = predict_labels(clf, X_train, y_train)
    f1_train = train_outcome[0]
    print("F1 score for training set: {:.4f}.".format(f1_train))

    # Score on the held-out data
    test_outcome = predict_labels(clf, X_test, y_test)
    f1_test, prediction_time_test = test_outcome
    print("F1 score for test set: {:.4f}.".format(f1_test))

    return train_time, prediction_time_test, f1_train, f1_test

In [89]:
# Import the three supervised learning models from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression

# Benchmark: F1 on the test set when every student is predicted to pass
benchmark_f1 = f1_score(y_test, ['yes']*len(y_test), pos_label='yes', average='binary')
print("F1 score for predicting all \"yes\" on test set: {:.4f}".format(benchmark_f1))

# The three models under comparison, each paired with a display name
classifiers = [
    dict(name="GaussianNB", clf=GaussianNB()),
    dict(name="Decision Tree", clf=DecisionTreeClassifier(criterion="entropy", random_state=0)),
    dict(name="Support Vector Machines", clf=SVC(random_state=0)),
]

# Build training sets of increasing size (100, 200, ..., 800) from X_train / y_train.
# NOTE(review): resample() draws with replacement unless replace=False is passed,
# so these sets can contain duplicate rows -- confirm this is intended.
training_sets = []
for train_size in range(100, 830, 100):
    X_res, y_res = resample(X_train, y_train, n_samples=train_size, random_state=0)
    training_sets.append(dict(size=train_size, X_train=X_res, y_train=y_res))

# Fit and score every classifier on every training-set size.
# train_predict() returns its values in exactly this order.
result_keys = ('train_time', 'prediction_time_test', 'f1_train', 'f1_test')
for clfData in classifiers:
    results = clfData['results'] = []
    for tset in training_sets:
        outcome = train_predict(clfData['clf'], tset['X_train'], tset['y_train'], X_test, y_test)

        # One record per (classifier, training-set size)
        record = dict(zip(result_keys, outcome))
        record['size'] = tset['size']
        results.append(record)
        
# Emit the collected results as markdown tables, one per classifier
legend = (' TST: Training Set Size\n'
          ' TT : Training Time \n'
          ' PT: Prediction Time(test) \n'
          ' F1_train : F1 Score (train) \n'
          ' F1_test : F1 Score (test)\n')
row_template = '| {} | {:.4f} | {:.4f} | {:.4f} | {:.4f} |'

print('----- MARKDOWN -----')
for i, clfData in enumerate(classifiers, start=1):
    print('** Classifer {} - {}**\n'.format(i, clfData['name']))
    print(legend)
    print("| TST |   TT   |   PT   | F1_train | F1_test |")
    for result in clfData['results']:
        columns = (result['size'], result['train_time'], result['prediction_time_test'],
                   result['f1_train'], result['f1_test'])
        print(row_template.format(*columns))
    print('\n')

F1 score for predicting all "yes" on test set: 0.8766
Training a GaussianNB using a training set size of 100. . .
Trained model in 0.0037 seconds
Made predictions in 0.0028 seconds.
F1 score for training set: 0.5102.
Made predictions in 0.0033 seconds.
F1 score for test set: 0.4344.
Training a GaussianNB using a training set size of 200. . .
Trained model in 0.0043 seconds
Made predictions in 0.0028 seconds.
F1 score for training set: 0.4670.
Made predictions in 0.0029 seconds.
F1 score for test set: 0.3585.
Training a GaussianNB using a training set size of 300. . .
Trained model in 0.0044 seconds
Made predictions in 0.0029 seconds.
F1 score for training set: 0.4291.
Made predictions in 0.0028 seconds.
F1 score for test set: 0.3398.
Training a GaussianNB using a training set size of 400. . .
Trained model in 0.0043 seconds
Made predictions in 0.0032 seconds.
F1 score for training set: 0.8514.
Made predictions in 0.0029 seconds.
F1 score for test set: 0.7975.
Training a GaussianNB usin

### Tabular Results
Edit the cell below to see how a table can be designed in [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#tables). You can record your results from above in the tables provided.

**Classifier 1 - GaussianNB**

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 100 | 0.0040 | 0.0032 | 0.5102 | 0.4344 |
| 200 | 0.0047 | 0.0029 | 0.4670 | 0.3585 |
| 300 | 0.0088 | 0.0037 | 0.4291 | 0.3398 |
| 400 | 0.0045 | 0.0025 | 0.8514 | 0.7975 |
| 500 | 0.0097 | 0.0025 | 0.8763 | 0.8135 |
| 600 | 0.0049 | 0.0026 | 0.8780 | 0.8348 |
| 700 | 0.0058 | 0.0025 | 0.8804 | 0.8348 |
| 800 | 0.0050 | 0.0027 | 0.8811 | 0.8303 |

**Classifier 2 - Decision Tree**

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 100 | 0.0043 | 0.0024 | 1.0000 | 0.8012 |
| 200 | 0.0044 | 0.0023 | 1.0000 | 0.8131 |
| 300 | 0.0056 | 0.0030 | 1.0000 | 0.8354 |
| 400 | 0.0062 | 0.0028 | 1.0000 | 0.8520 |
| 500 | 0.0062 | 0.0041 | 1.0000 | 0.7890 |
| 600 | 0.0098 | 0.0034 | 1.0000 | 0.8343 |
| 700 | 0.0111 | 0.0019 | 1.0000 | 0.8378 |
| 800 | 0.0095 | 0.0018 | 1.0000 | 0.8393 |

**Classifier 3 - Support Vector Machines**

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 100 | 0.0033 | 0.0036 | 0.8372 | 0.8766 |
| 200 | 0.0058 | 0.0058 | 0.8506 | 0.8766 |
| 300 | 0.0085 | 0.0059 | 0.8550 | 0.8766 |
| 400 | 0.0158 | 0.0070 | 0.8604 | 0.8766 |
| 500 | 0.0180 | 0.0082 | 0.8713 | 0.8766 |
| 600 | 0.0271 | 0.0097 | 0.8679 | 0.8766 |
| 700 | 0.0265 | 0.0115 | 0.8767 | 0.8766 |
| 800 | 0.0318 | 0.0129 | 0.8787 | 0.8766 |


Support Vector Machines (SVM) basically work by trying to identify the reasons why a student switches from a 'fail' to a 'pass'.  In other words, it isn't very concerned with people who are comfortably passing or failing, but rather what the differences are between students who have only *just* passed, or only *just* failed.   It will try to establish this *boundary* between marginal students as clearly as possible, thus enabling it to figure out what the most important factors are in getting students to pass.



### Implementation: Model Tuning
Using grid search (`GridSearchCV`) with at least one important parameter tuned with at least 3 different values. You will need to use the entire training set for this. 

In [90]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from IPython.display import display
def f1_wrap(y_true, y_predict):
    ''' F1 score with 'yes' fixed as the positive label, for use with make_scorer. '''
    score = f1_score(y_true, y_predict, pos_label='yes')
    return score

# Search space for the SVC.  Warning, takes ~15 seconds to compute!
# NOTE(review): the recorded rbf rows score identically for every 'degree',
# so crossing 'degree' with the non-poly kernels appears to repeat the same
# fits -- confirm, and consider a per-kernel grid.
parameters = {
    'C': range(1, 6),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': range(1, 6),
}

# Base estimator handed to the search
clf = SVC(random_state=0)

# Scorer built from f1_wrap, which fixes 'yes' as the positive label
f1_scorer = make_scorer(f1_wrap)

# Run the grid search on the full training set (fit() returns the search object)
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer).fit(X_train, y_train)

# From here on, clf is the best estimator found by the search
clf = grid_obj.best_estimator_

# Show the five best-scoring parameter combinations (best one last)
df = pd.DataFrame(grid_obj.cv_results_).sort_values('mean_test_score').tail()
display(df)
print("Parameters for the optimal model: {}".format(clf.get_params()))

# Report the final F1 score for training and testing after parameter tuning
tuned_f1_train = predict_labels(clf, X_train, y_train)[0]
print("Tuned model has a training F1 score of {:.4f}.".format(tuned_f1_train))
tuned_f1_test = predict_labels(clf, X_test, y_test)[0]
print("Tuned model has a testing F1 score of {:.4f}.".format(tuned_f1_test))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
46,0.024544,0.001137,0.009588,0.000502,3,2,rbf,"{'C': 3, 'degree': 2, 'kernel': 'rbf'}",0.895833,0.893471,0.875,0.876325,0.878893,0.883904,0.008896,2
54,0.023203,0.000995,0.00902,0.000331,3,4,rbf,"{'C': 3, 'degree': 4, 'kernel': 'rbf'}",0.895833,0.893471,0.875,0.876325,0.878893,0.883904,0.008896,2
42,0.02291,0.000349,0.009128,0.000267,3,1,rbf,"{'C': 3, 'degree': 1, 'kernel': 'rbf'}",0.895833,0.893471,0.875,0.876325,0.878893,0.883904,0.008896,2
50,0.022827,0.00043,0.009335,0.001038,3,3,rbf,"{'C': 3, 'degree': 3, 'kernel': 'rbf'}",0.895833,0.893471,0.875,0.876325,0.878893,0.883904,0.008896,2
81,0.018652,0.000825,0.005896,0.000316,5,1,poly,"{'C': 5, 'degree': 1, 'kernel': 'poly'}",0.893471,0.890411,0.880282,0.876325,0.885813,0.88526,0.006307,1


Parameters for the optimal model: {'C': 5, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 1, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': 0, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Made predictions in 0.0193 seconds.
Tuned model has a training F1 score of 0.8859.
Made predictions in 0.0061 seconds.
Tuned model has a testing F1 score of 0.8798.



- Final F<sub>1</sub> score for training: 0.8859
- Final F<sub>1</sub> score for testing: 0.8798
- Previous F<sub>1</sub> score for training: 0.8787
- Previous F<sub>1</sub> score for testing: 0.8766