# Machine Learning Engineer Nanodegree
## Supervised Learning
## Capstone Project

## Exploring the Data

In [36]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from datetime import datetime
from sklearn.metrics import f1_score

# Read the data
data = pd.read_csv("features.csv")
print("Data read successfully!")

Data read successfully!


### Implementation: Data Exploration

In [37]:
# Calculate number of users
n_users = len(data.index)

# Calculate number of features
n_features = len(data.columns[:-1])

# Calculate paying students
n_paying = len(data[data.is_paying_student == 1])

# Calculate regular students
n_regular = len(data[data.is_paying_student == 0])

# Calculate conversion rate
conversion_rate = float(n_paying) / n_users * 100

# Print the results
print("Total number of users: {}".format(n_users))
print("Number of features: {}".format(n_features))
print("Number of paying students: {}".format(n_paying))
print("Number of regular students: {}".format(n_regular))
print("Conversion rate: {:.2f}%".format(conversion_rate))

Total number of users: 71640
Number of features: 50
Number of paying students: 5056
Number of regular students: 66584
Conversion rate: 7.06%


## Preparing the Data
### Identify feature and target columns

In [38]:
data = data.fillna(0)

# Extract feature columns
feature_cols = list(data.columns[6:-1])

# Extract target column 'passed'
target_col = data.columns[-1] 

# Show the list of columns
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = data[feature_cols]
y_all = data[target_col]

# Show the feature information by printing the first five rows
print("\nFeature values:")
print(X_all.head(1).transpose())

Feature columns:
['count_visits', 'webinar_enrollments', 'free_course_enrollments', 'is_home', 'is_50back', 'is_signin', 'is_business', 'is_success', 'is_referrer_instagram', 'is_referrer_android', 'is_referrer_github', 'is_drive', 'is_jobs', 'is_referrer_computerworld', 'is_us', 'is_referrer_catracalivre', 'is_weekday', 'is_nanodegree_home', 'is_fcop_ud', 'is_hire_talent', 'is_catalog_nanodegrees', 'is_mobile', 'is_ai', 'is_legal', 'is_checkout', 'is_contact', 'is_referrer_live', 'is_referrer_linkedin', 'is_referrer_google', 'is_referrer_anhanguera', 'is_referrer_infomoney', 'is_referrer_cbsi', 'is_catalog_all', 'is_robotics', 'is_event', 'is_referrer_bing', 'is_payment', 'is_tech_requirements', 'is_android', 'is_ndop', 'is_referrer_facebook', 'is_fcop_st', 'is_referrer_tecmundo', 'is_fcop_cs']

Target column: is_paying_student

Feature values:
                              0
count_visits               40.0
webinar_enrollments         0.0
free_course_enrollments     4.0
is_home       

### Implementation: Training and Testing Data Split

In [39]:
# Import any additional functionality you may need here
from sklearn.model_selection import train_test_split

# Set the number of training points
num_train = int(0.75 * X_all.shape[0])

# Set the number of testing points
num_test = X_all.shape[0] - num_train

# Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=42)


# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 53730 samples.
Testing set has 17910 samples.


In [40]:
train_conversion_rate = y_train.sum() / y_train.shape[0]
test_conversion_rate = y_test.sum() / y_test.shape[0]
print("Training set conversion rate: {:.3f}".format(train_conversion_rate))
print("Testing set conversion rate: {:.3f}".format(test_conversion_rate))

Training set conversion rate: 0.071
Testing set conversion rate: 0.070


## Training and Evaluating Models
### Setup

In [41]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label=1)


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("\nTraining a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print("F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test)))

### Implementation: Model Performance Metrics

In [50]:
# Import the three supervised learning models from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

# Initialize the three models
clf_A = GaussianNB()
clf_B = SGDClassifier() #svm.SVC(random_state=42)
clf_C = KNeighborsClassifier()

# Execute the 'train_predict' function for each classifier and each training set size
for clf in [clf_A, clf_B, clf_C]:
    print("\n{}: ".format(clf.__class__.__name__))
    for n in [int(X_train.shape[0] / 3), int( 2 / 3 * X_train.shape[0]), X_train.shape[0]]:
        train_predict(clf, X_train[:n], y_train[:n], X_test, y_test)


GaussianNB: 

Training a GaussianNB using a training set size of 17910. . .
Trained model in 0.0554 seconds
Made predictions in 0.0475 seconds.
F1 score for training set: 0.3774.
Made predictions in 0.0355 seconds.
F1 score for test set: 0.3770.

Training a GaussianNB using a training set size of 35820. . .
Trained model in 0.0526 seconds
Made predictions in 0.0415 seconds.
F1 score for training set: 0.3694.
Made predictions in 0.0146 seconds.
F1 score for test set: 0.3654.

Training a GaussianNB using a training set size of 53730. . .
Trained model in 0.0571 seconds
Made predictions in 0.0513 seconds.
F1 score for training set: 0.3547.
Made predictions in 0.0164 seconds.
F1 score for test set: 0.3475.

SGDClassifier: 

Training a SGDClassifier using a training set size of 17910. . .
Trained model in 0.0439 seconds
Made predictions in 0.0068 seconds.
F1 score for training set: 0.3971.
Made predictions in 0.0029 seconds.
F1 score for test set: 0.4030.

Training a SGDClassifier using a 

## Choosing the Best Model

In [46]:
# Initialize the classifier
clf = clf_C

### Implementation: Model Tuning

In [48]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing

# Create the parameters list you wish to tune
parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 2e-3, 1e-4], 'C': [1, 5, 10, 50, 100, 150, 200, 250]}]

# Create Cross-Validation Sets
cv_sets = ShuffleSplit(X_train.shape[0], test_size=0.4, random_state=42)

# Make an f1 scoring function using 'make_scorer'
f1_scorer = make_scorer(f1_score, pos_label=1)

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, parameters, f1_scorer, cv=cv_sets)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print("\nTuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print(grid_obj.best_params_)

ValueError: Invalid parameter C for estimator KNeighborsClassifier. Check the list of available parameters with `estimator.get_params().keys()`.

In [49]:
clf.get_params().keys()

dict_keys(['metric_params', 'p', 'algorithm', 'n_neighbors', 'metric', 'weights', 'n_jobs', 'leaf_size'])