# Machine Learning Engineer Nanodegree
## Supervised Learning
## Capstone Project

## Exploring the Data

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from datetime import datetime
from sklearn.metrics import f1_score

# Read the data
data = pd.read_csv("features.csv")
print("Data read successfully!")

Data read successfully!


In [2]:
data.columns

Index(['Unnamed: 0', 'email', 'first_name', 'last_name', 'date_joined',
       'date_first_visit', 'count_visits', 'webinar_enrollments',
       'free_course_enrollments', 'is_home', 'is_50back', 'is_signin',
       'is_business', 'is_success', 'is_referrer_instagram',
       'is_referrer_android', 'is_referrer_github', 'is_drive', 'is_jobs',
       'is_referrer_computerworld', 'is_us', 'is_referrer_catracalivre',
       'is_weekday', 'is_nanodegree_home', 'is_fcop_ud', 'is_hire_talent',
       'is_catalog_nanodegrees', 'is_mobile', 'is_ai', 'is_legal',
       'is_checkout', 'is_contact', 'is_referrer_live', 'is_referrer_linkedin',
       'is_referrer_google', 'is_referrer_anhanguera', 'is_referrer_infomoney',
       'is_referrer_cbsi', 'is_catalog_all', 'is_robotics', 'is_event',
       'is_referrer_bing', 'is_payment', 'is_tech_requirements', 'is_android',
       'is_ndop', 'is_referrer_facebook', 'is_fcop_st', 'is_referrer_tecmundo',
       'is_fcop_cs', 'is_paying_student'],
      

In [3]:
data = data[~data['count_visits'].isnull()]

In [4]:
from datetime import datetime

data['date_first_visit'] = data.apply(lambda row: datetime.strptime(row['date_first_visit'].split(".")[0], '%Y-%m-%d %H:%M:%S'), axis=1)
data['date_joined'] = data.apply(lambda row: datetime.strptime(row['date_joined'].split(".")[0], '%Y-%m-%d %H:%M:%S'), axis=1)

## Cutting old data

In [5]:
data = data[data['date_first_visit'] > datetime.strptime('2017-04-03', '%Y-%m-%d')]

## Adding Extra features

In [6]:
data['window_first_visit_to_signup'] = data.apply(lambda row: (row['date_joined'] - row['date_first_visit']).days, axis=1)


In [7]:
import re

def get_domain(row):
    domain = re.search("@[\w.]+", row['email'])
    return domain.group()

data['email_domain'] = data.apply(get_domain, axis=1)

temp = data[['email', 'email_domain']].groupby('email_domain').count()
temp['importance'] = temp['email'] / sum(temp['email'])
temp.sort_values('email', ascending=False).head(10)
top_domains = temp.sort_values('email', ascending=False).head(20).index.tolist()

data['email_domain'] = data.apply(lambda row: row['email_domain'] if row['email_domain'] in top_domains else 'others', axis=1)

In [8]:
df_hot = pd.get_dummies(data['email_domain'])
data = pd.concat([data, df_hot], axis=1)
data.drop('email_domain', axis=1, inplace=True)

In [9]:
data.columns

Index(['Unnamed: 0', 'email', 'first_name', 'last_name', 'date_joined',
       'date_first_visit', 'count_visits', 'webinar_enrollments',
       'free_course_enrollments', 'is_home', 'is_50back', 'is_signin',
       'is_business', 'is_success', 'is_referrer_instagram',
       'is_referrer_android', 'is_referrer_github', 'is_drive', 'is_jobs',
       'is_referrer_computerworld', 'is_us', 'is_referrer_catracalivre',
       'is_weekday', 'is_nanodegree_home', 'is_fcop_ud', 'is_hire_talent',
       'is_catalog_nanodegrees', 'is_mobile', 'is_ai', 'is_legal',
       'is_checkout', 'is_contact', 'is_referrer_live', 'is_referrer_linkedin',
       'is_referrer_google', 'is_referrer_anhanguera', 'is_referrer_infomoney',
       'is_referrer_cbsi', 'is_catalog_all', 'is_robotics', 'is_event',
       'is_referrer_bing', 'is_payment', 'is_tech_requirements', 'is_android',
       'is_ndop', 'is_referrer_facebook', 'is_fcop_st', 'is_referrer_tecmundo',
       'is_fcop_cs', 'is_paying_student', 'window

### Reorder columns, remove temporary columns, and leave target as last

In [10]:
temp = data.columns.tolist()
for index, item in enumerate(temp):
    print("%d. %s" % (index, item))

0. Unnamed: 0
1. email
2. first_name
3. last_name
4. date_joined
5. date_first_visit
6. count_visits
7. webinar_enrollments
8. free_course_enrollments
9. is_home
10. is_50back
11. is_signin
12. is_business
13. is_success
14. is_referrer_instagram
15. is_referrer_android
16. is_referrer_github
17. is_drive
18. is_jobs
19. is_referrer_computerworld
20. is_us
21. is_referrer_catracalivre
22. is_weekday
23. is_nanodegree_home
24. is_fcop_ud
25. is_hire_talent
26. is_catalog_nanodegrees
27. is_mobile
28. is_ai
29. is_legal
30. is_checkout
31. is_contact
32. is_referrer_live
33. is_referrer_linkedin
34. is_referrer_google
35. is_referrer_anhanguera
36. is_referrer_infomoney
37. is_referrer_cbsi
38. is_catalog_all
39. is_robotics
40. is_event
41. is_referrer_bing
42. is_payment
43. is_tech_requirements
44. is_android
45. is_ndop
46. is_referrer_facebook
47. is_fcop_st
48. is_referrer_tecmundo
49. is_fcop_cs
50. is_paying_student
51. window_first_visit_to_signup
52. @bol.com.br
53. @cin.ufpe.b

In [11]:
cols = list(data)
cols[72], cols[50] = cols[50], cols[72]
data = data.ix[:,cols]
data.columns

Index(['Unnamed: 0', 'email', 'first_name', 'last_name', 'date_joined',
       'date_first_visit', 'count_visits', 'webinar_enrollments',
       'free_course_enrollments', 'is_home', 'is_50back', 'is_signin',
       'is_business', 'is_success', 'is_referrer_instagram',
       'is_referrer_android', 'is_referrer_github', 'is_drive', 'is_jobs',
       'is_referrer_computerworld', 'is_us', 'is_referrer_catracalivre',
       'is_weekday', 'is_nanodegree_home', 'is_fcop_ud', 'is_hire_talent',
       'is_catalog_nanodegrees', 'is_mobile', 'is_ai', 'is_legal',
       'is_checkout', 'is_contact', 'is_referrer_live', 'is_referrer_linkedin',
       'is_referrer_google', 'is_referrer_anhanguera', 'is_referrer_infomoney',
       'is_referrer_cbsi', 'is_catalog_all', 'is_robotics', 'is_event',
       'is_referrer_bing', 'is_payment', 'is_tech_requirements', 'is_android',
       'is_ndop', 'is_referrer_facebook', 'is_fcop_st', 'is_referrer_tecmundo',
       'is_fcop_cs', 'others', 'window_first_visi

### Implementation: Data Exploration

In [12]:
# Calculate number of users
n_users = len(data.index)

# Calculate number of features
n_features = len(data.columns[:-1])

# Calculate paying students
n_paying = len(data[data.is_paying_student == 1])

# Calculate regular students
n_regular = len(data[data.is_paying_student == 0])

# Calculate conversion rate
conversion_rate = float(n_paying) / n_users * 100

# Print the results
print("Total number of users: {}".format(n_users))
print("Number of features: {}".format(n_features))
print("Number of paying students: {}".format(n_paying))
print("Number of regular students: {}".format(n_regular))
print("Conversion rate: {:.2f}%".format(conversion_rate))

Total number of users: 5133
Number of features: 72
Number of paying students: 605
Number of regular students: 4528
Conversion rate: 11.79%


In [13]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,5133.0,59437.898890,15435.775082,26.0,53037.0,67908.0,69824.0,71590.0
count_visits,5133.0,70.337814,584.207522,1.0,7.0,14.0,35.0,27856.0
webinar_enrollments,5133.0,0.136762,0.522672,0.0,0.0,0.0,0.0,10.0
free_course_enrollments,5133.0,0.721995,3.134684,0.0,0.0,0.0,0.0,64.0
is_home,5133.0,13.054744,127.486779,0.0,0.0,1.0,5.0,6576.0
is_50back,5133.0,0.218780,4.143173,0.0,0.0,0.0,0.0,186.0
is_signin,5133.0,6.103059,98.717545,0.0,0.0,1.0,3.0,6784.0
is_business,5133.0,0.021235,0.429042,0.0,0.0,0.0,0.0,25.0
is_success,5133.0,0.028833,0.576855,0.0,0.0,0.0,0.0,36.0
is_referrer_instagram,5133.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


In [14]:
data[data['is_paying_student'] == 0].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,4528.0,58998.033790,16111.848560,26.0,52895.5,68081.0,69968.25,71590.0
count_visits,4528.0,73.541519,618.165209,1.0,8.0,15.0,36.00,27856.0
webinar_enrollments,4528.0,0.131846,0.535526,0.0,0.0,0.0,0.00,10.0
free_course_enrollments,4528.0,0.812721,3.323218,0.0,0.0,0.0,0.00,64.0
is_home,4528.0,14.217094,135.593820,0.0,0.0,2.0,5.00,6576.0
is_50back,4528.0,0.215327,4.265205,0.0,0.0,0.0,0.00,186.0
is_signin,4528.0,6.657686,105.038566,0.0,0.0,1.0,4.00,6784.0
is_business,4528.0,0.017226,0.418971,0.0,0.0,0.0,0.00,25.0
is_success,4528.0,0.023189,0.573813,0.0,0.0,0.0,0.00,36.0
is_referrer_instagram,4528.0,0.000000,0.000000,0.0,0.0,0.0,0.00,0.0


In [15]:
data[data['is_paying_student'] == 1].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,605.0,62729.980165,8154.355591,4146.0,53573.0,67628.0,68342.0,71570.0
count_visits,605.0,46.360331,187.589985,1.0,5.0,12.0,27.0,3277.0
webinar_enrollments,605.0,0.173554,0.412505,0.0,0.0,0.0,0.0,2.0
free_course_enrollments,605.0,0.042975,0.439903,0.0,0.0,0.0,0.0,8.0
is_home,605.0,4.355372,14.436352,0.0,0.0,1.0,3.0,242.0
is_50back,605.0,0.244628,3.082689,0.0,0.0,0.0,0.0,64.0
is_signin,605.0,1.952066,9.408465,0.0,0.0,0.0,1.0,185.0
is_business,605.0,0.051240,0.497363,0.0,0.0,0.0,0.0,9.0
is_success,605.0,0.071074,0.597932,0.0,0.0,0.0,0.0,9.0
is_referrer_instagram,605.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


## Preparing the Data
### Identify feature and target columns

In [16]:
data = data.fillna(0)

# Extract feature columns
feature_cols = list(data.columns[6:-1])

# Extract target column 'passed'
target_col = data.columns[-1] 

# Show the list of columns
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = data[feature_cols]
y_all = data[target_col]

Feature columns:
['count_visits', 'webinar_enrollments', 'free_course_enrollments', 'is_home', 'is_50back', 'is_signin', 'is_business', 'is_success', 'is_referrer_instagram', 'is_referrer_android', 'is_referrer_github', 'is_drive', 'is_jobs', 'is_referrer_computerworld', 'is_us', 'is_referrer_catracalivre', 'is_weekday', 'is_nanodegree_home', 'is_fcop_ud', 'is_hire_talent', 'is_catalog_nanodegrees', 'is_mobile', 'is_ai', 'is_legal', 'is_checkout', 'is_contact', 'is_referrer_live', 'is_referrer_linkedin', 'is_referrer_google', 'is_referrer_anhanguera', 'is_referrer_infomoney', 'is_referrer_cbsi', 'is_catalog_all', 'is_robotics', 'is_event', 'is_referrer_bing', 'is_payment', 'is_tech_requirements', 'is_android', 'is_ndop', 'is_referrer_facebook', 'is_fcop_st', 'is_referrer_tecmundo', 'is_fcop_cs', 'others', 'window_first_visit_to_signup', '@bol.com.br', '@cin.ufpe.br', '@globo.com', '@gmail.com', '@gmail.com.br', '@hotmail.com', '@hotmail.com.br', '@icloud.com', '@id.uff.br', '@live.com'

### Implementation: Training and Testing Data Split

In [17]:
# Import any additional functionality you may need here
from sklearn.model_selection import train_test_split

# Set the number of training points
num_train = int(0.75 * X_all.shape[0])

# Set the number of testing points
num_test = X_all.shape[0] - num_train

# Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=42)


# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 3849 samples.
Testing set has 1284 samples.


In [18]:
train_conversion_rate = y_train.sum() / y_train.shape[0]
test_conversion_rate = y_test.sum() / y_test.shape[0]
print("Training set conversion rate: {:.3f}".format(train_conversion_rate))
print("Testing set conversion rate: {:.3f}".format(test_conversion_rate))

Training set conversion rate: 0.120
Testing set conversion rate: 0.111


## Training and Evaluating Models
### Setup

In [19]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label=1)


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("\nTraining a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print("F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test)))

### Implementation: Model Performance Metrics

In [20]:
# Import the three supervised learning models from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# Initialize the three models
clf_A = GaussianNB()
clf_B = SGDClassifier() #svm.SVC(random_state=42)
clf_C = KNeighborsClassifier()
clf_D = xgb.XGBClassifier()

# Execute the 'train_predict' function for each classifier and each training set size
for clf in [clf_A, clf_B, clf_C, clf_D]:
    print("\n{}: ".format(clf.__class__.__name__))
    for n in [int(X_train.shape[0] / 3), int( 2 / 3 * X_train.shape[0]), X_train.shape[0]]:
        train_predict(clf, X_train[:n], y_train[:n], X_test, y_test)


GaussianNB: 

Training a GaussianNB using a training set size of 1283. . .
Trained model in 0.0020 seconds
Made predictions in 0.0016 seconds.
F1 score for training set: 0.2523.
Made predictions in 0.0013 seconds.
F1 score for test set: 0.2276.

Training a GaussianNB using a training set size of 2566. . .
Trained model in 0.0029 seconds
Made predictions in 0.0027 seconds.
F1 score for training set: 0.2521.
Made predictions in 0.0016 seconds.
F1 score for test set: 0.2383.

Training a GaussianNB using a training set size of 3849. . .
Trained model in 0.0041 seconds
Made predictions in 0.0038 seconds.
F1 score for training set: 0.2506.
Made predictions in 0.0013 seconds.
F1 score for test set: 0.2366.

SGDClassifier: 

Training a SGDClassifier using a training set size of 1283. . .
Trained model in 0.0054 seconds
Made predictions in 0.0014 seconds.
F1 score for training set: 0.3858.
Made predictions in 0.0006 seconds.
F1 score for test set: 0.3880.

Training a SGDClassifier using a trai

## Choosing the Best Model

In [21]:
# Initialize the classifier
clf = clf_D

### Implementation: Model Tuning

In [22]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing

# Create the parameters list you wish to tune
#parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 2e-3, 1e-4], 'C': [1, 5, 10, 50, 100, 150, 200, 250]}]
parameters = [{'learning_rate': [0.1, 0.3, 0.5, 0.7]}] #, 
              #'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]}]
              #'min_child_weight': [1, 2, 3, 4, 5],
              #'max_delta_step': [0, 1, 2, 3, 4, 5],
              #'subsample': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              #'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              #'colsample_bylevel': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}]

# Create Cross-Validation Sets
cv_sets = ShuffleSplit(X_train.shape[0], test_size=0.4, random_state=42)

# Make an f1 scoring function using 'make_scorer'
f1_scorer = make_scorer(f1_score, pos_label=1)

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, parameters, f1_scorer, cv=cv_sets)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print("\nTuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print(grid_obj.best_params_)

KeyboardInterrupt: 

In [None]:
clf.get_params().keys()