# Predictive Modeling

In [1]:
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime
import pprint

In [2]:
path = "/Users/jasonzhou/Documents/GitHub/Portfolio/SpringBoard/Exercises/ultimate_challenge"
os.chdir(path)

df = pd.read_json("ultimate_data_challenge.json")

In [3]:
df

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.10,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.00,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.00,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,King's Landing,0,2014-01-25,5.0,1.00,2014-06-05,iPhone,0.0,False,100.0,5.63,4.2
49996,Astapor,1,2014-01-24,,1.00,2014-01-25,iPhone,0.0,False,0.0,0.00,4.0
49997,Winterfell,0,2014-01-31,5.0,1.00,2014-05-22,Android,0.0,True,100.0,3.86,5.0
49998,Astapor,2,2014-01-14,3.0,1.00,2014-01-15,iPhone,0.0,False,100.0,4.58,3.5


# Data Cleaning

In [4]:
# Checking for missing values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    50000 non-null  object 
 1   trips_in_first_30_days  50000 non-null  int64  
 2   signup_date             50000 non-null  object 
 3   avg_rating_of_driver    41878 non-null  float64
 4   avg_surge               50000 non-null  float64
 5   last_trip_date          50000 non-null  object 
 6   phone                   49604 non-null  object 
 7   surge_pct               50000 non-null  float64
 8   ultimate_black_user     50000 non-null  bool   
 9   weekday_pct             50000 non-null  float64
 10  avg_dist                50000 non-null  float64
 11  avg_rating_by_driver    49799 non-null  float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 4.2+ MB


In [5]:
# We're just going to go ahead and fill null rating values with their respective means

meanofrating = df['avg_rating_of_driver'].mean()
meanbyrating = df['avg_rating_by_driver'].mean()

df['avg_rating_of_driver'] = df['avg_rating_of_driver'].fillna(meanofrating)
df['avg_rating_by_driver'] = df['avg_rating_by_driver'].fillna(meanbyrating)

In [6]:
# See what the current distribution of phone is between iPhones and Androids

df['phone'].value_counts()

iPhone     34582
Android    15022
Name: phone, dtype: int64

In [7]:
# Helper function that randomly returns "Android" or "iPhone" based on weighted probabilities

import random

def randomphone():
    roll = random.random()
    if roll > 0.7:
        return "Android"
    else:
        return "iPhone"

In [8]:
# Seems like the ratio is essentially 35/15, or 70/30 iPhones to Androids. We'll randomly fill in the missing phone
# values according to these proportions

df['phone'] = df['phone'].fillna(randomphone())

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    50000 non-null  object 
 1   trips_in_first_30_days  50000 non-null  int64  
 2   signup_date             50000 non-null  object 
 3   avg_rating_of_driver    50000 non-null  float64
 4   avg_surge               50000 non-null  float64
 5   last_trip_date          50000 non-null  object 
 6   phone                   50000 non-null  object 
 7   surge_pct               50000 non-null  float64
 8   ultimate_black_user     50000 non-null  bool   
 9   weekday_pct             50000 non-null  float64
 10  avg_dist                50000 non-null  float64
 11  avg_rating_by_driver    50000 non-null  float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 4.2+ MB


# Data Visualization

In [10]:
# Our focus of this project is to examine user retention, and two columns help with that: signup_date and 
# last_trip_date. We should create a new column that directly tells us the time difference between these two dates

df['signup_date'][0][8:]

'25'

In [11]:
# Seeing the range of dates we have in the data

df['signup_date'].value_counts()

2014-01-18    2948
2014-01-25    2885
2014-01-11    2402
2014-01-24    2284
2014-01-17    2149
2014-01-31    2100
2014-01-19    2028
2014-01-10    2021
2014-01-06    1763
2014-01-01    1737
2014-01-26    1708
2014-01-23    1606
2014-01-07    1486
2014-01-04    1485
2014-01-30    1471
2014-01-09    1433
2014-01-16    1431
2014-01-22    1369
2014-01-05    1343
2014-01-12    1334
2014-01-20    1295
2014-01-28    1284
2014-01-08    1275
2014-01-27    1236
2014-01-21    1234
2014-01-03    1213
2014-01-29    1197
2014-01-14    1120
2014-01-15    1110
2014-01-13    1049
2014-01-02    1004
Name: signup_date, dtype: int64

In [12]:
df['last_trip_date'].sort_values()

31425    2014-01-01
10729    2014-01-01
40336    2014-01-01
34828    2014-01-01
37295    2014-01-01
            ...    
45126    2014-07-01
38651    2014-07-01
14473    2014-07-01
22735    2014-07-01
45357    2014-07-01
Name: last_trip_date, Length: 50000, dtype: object

In [13]:
# Helper function that determines difference in time between signup date and date of last trip. We're only really 
# interested in the month difference as a general measure

def monthsApart(string1, string2):
    year1 = int(string1[0:4])
    month1 = int(string1[5:7])
    day1 = int(string1[8:])
    
    year2 = int(string2[0:4])
    month2 = int(string2[5:7])
    day2 = int(string2[8:])
    
    date1 = datetime.datetime(year1, month1, day1)
    date2 = datetime.datetime(year2, month2, day2)
    
    difference = date2 - date1
    
    return difference.months

In [14]:
def daysApart(string1, string2):
    year1 = int(string1[0:4])
    month1 = int(string1[5:7])
    day1 = int(string1[8:])
    
    year2 = int(string2[0:4])
    month2 = int(string2[5:7])
    day2 = int(string2[8:])
    
    date1 = datetime.datetime(year1, month1, day1)
    date2 = datetime.datetime(year2, month2, day2)
    
    difference = date2 - date1
    
    return difference.days

Because the data set was collected up until July 1st, any users who booked a ride any time in June or exactly on July 1st would be considered an active user according to the provided definition of "active user". We'll define a function that determines this:

In [15]:
def isActive(string):
    month = int(string[5:7])
    if month >= 6:
        return 1
    else:
        return 0

In [16]:
monthsretained = []
daysretained = []
isactive = []

for i in range(len(df)):
    monthsretained.append(monthsApart(df['signup_date'][i], df['last_trip_date'][i]))
    daysretained.append(daysApart(df['signup_date'][i], df['last_trip_date'][i]))
    isactive.append(isActive(df['last_trip_date'][i]))

AttributeError: 'datetime.timedelta' object has no attribute 'months'

In [None]:
df['monthsretained'] = monthsretained
df['daysretained'] = daysretained
df['isActive'] = isactive

In [None]:
ubu = []

for i in range(len(df)):
    if df['ultimate_black_user'][i]:
        ubu.append(1)
    else:
        ubu.append(0)

In [None]:
df['ultimate_black_user'] = ubu

In [None]:
# Plot Histograms

_ = df.hist(column=['trips_in_first_30_days', 'avg_rating_of_driver', 'avg_surge',
                    'surge_pct', 'weekday_pct', 'avg_dist', 'avg_rating_by_driver',
                    'daysretained', 'isActive', 'ultimate_black_user'],
            figsize=(12, 12))

1) Most users have 12 or less trips in their first 30 days

2) Most users score their drivers a 4 or 5

3) Most users don't call for a trip if there's a surge going on

4) Essentially the same column as 3)

5) Many users only call for rides almost exclusively during the week or on weekends. 

6) Most users take shorter trips, with a few outliers skewing the shape of the graph

7) Similarly to 2), most drivers score their passengers/users a 4 or 5. 

8) A large amount of users do not use the service for beyond 20 or so days. Interestingly enough, from there there are more users that use the service for a greater amount of time. Until the fall off at around 160 days.

9) More users are "inactive" than active

10) Most users did not take an Ultimate Black in their first 30 days

In [None]:
# Getting a separate histogram for monthsretained so we can assign an accurate binsize

_ = df.hist(column='monthsretained', bins=7, figsize=(6, 6))

About 20% of users do not continue to use the service for more than a month, with around 35% of users using the service for 5 months. Very very few users use the service for exactly 6 months. 

We are interesting in determining our retention rate, which is as follows:

In [None]:
(len(df[df['isActive'] == 1]) / len(df)) * 100

# Exploratory Data Analysis

We are to build a model that determines if a user will be active on their 6th month since signing up. Conveniently for us, we've already introduced an extra column that tracks this, 'monthsretained'. Based on this, we can assign a binary column that simply tells us if the user's 'monthsretained' is greater than equal to 6. From there we can model this problem as a classification problem. 

In [None]:
# Having used the service on your 6th month means that you've used the service for 5 months

active6 = []

for i in range(len(df)):
    if df['monthsretained'][i] >= 5:
        active6.append(1)
    else:
        active6.append(0)

In [None]:
df['active6'] = active6

Our non-numeric features are 'city', 'signup_date', 'last_trip_date', and 'phone'. 'city' and 'phone' are nominal data features, while the date features are ordinal. 

Now we're going to factorize our non-numeric features so that we can look for any potential correlations there as well.

In [None]:
dates = np.unique(df['last_trip_date'])

In [None]:
signupdatesF = []
lasttripdatesF = []
value1 = 0
value2 = 0

for i in range(len(df)):
    signupdate = df['signup_date'][i]
    lasttripdate = df['last_trip_date'][i]
    
    for j in range(len(dates)):
        if signupdate == dates[j]:
            value1 = j + 1
        if lasttripdate == dates[j]:
            value2 = j + 1
    
    signupdatesF.append(value1)
    lasttripdatesF.append(value2)

In [None]:
cityF = pd.get_dummies(df['city'])
phoneF = pd.get_dummies(df['phone'])

In [None]:
df['signupdatesF'] = signupdatesF
df['lasttripdatesF'] = lasttripdatesF



In [None]:
dfEDA = pd.concat([df, pd.get_dummies(df['city'])], axis=1)
dfEDA = pd.concat([dfEDA, pd.get_dummies(df['phone'])], axis=1)

In [None]:
dfEDA.corr()

Looking at features that correlate the most strongly with 'active6', we have 'trips_in_first_30_days', 'ultimate_black_user', 'weekday_pct', 'avg_dist', 'city', 'monthsretained', 'daysretained', and 'last_trip_date'. However we can't use any of 'monthsretained', 'daysretained', and 'last_trip_date' because 'active6' is derived from these columns. 

We'll the remaining features as our features of interest. 

In [None]:
from sklearn.model_selection import train_test_split

X = dfEDA[['trips_in_first_30_days', 'ultimate_black_user', 'weekday_pct', 'avg_dist', 'Astapor', 'King\'s Landing']]

y = df['active6']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify = y)

# Modeling

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score

accuracy_score(y_test, y_pred)

In [None]:
# Get and reshape confusion matrix data
confusion_matrix(y_test, y_pred)

In [None]:
from matplotlib.ticker import IndexLocator
import itertools

def plot_cm(y_test,y_pred_class,classes=['NON-default','DEFAULT']):
    # plot confusion matrix
    fig, ax = plt.subplots()
    cm = confusion_matrix(y_test, y_pred_class)
    
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    plt.title("Confusion Matrix")
    ax.set(yticks=[-0.5, 1.5], 
           xticks=[0, 1], 
           yticklabels=classes, 
           xticklabels=classes)
    ax.yaxis.set_major_locator(IndexLocator(base=1, offset=0.5))
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plot_cm(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import KFold

def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [None]:
clf = LogisticRegression()
score = cv_score(clf, X_train, y_train)
print(score)

In [None]:
#the grid of parameters to search over
Cs = [0.001, 0.1, 1, 10, 100]

# your turn

scores = [0] * 5
index = 0

for cvalue in Cs:

    clf = LogisticRegression(C=cvalue)
    scores[index] = cv_score(clf, X_train, y_train)
    index = index + 1


print(scores)

# It seems like 0.1, 1, and 100 are all tied with each other.

In [None]:
def scores(y_test, y_pred_class):
    # Prints formatted classification metrics. 
    print('Classification Accuracy: ', format(accuracy_score(y_test, y_pred_class), '.3f'))
    print('Precision score: ', format(precision_score(y_test, y_pred_class), '.3f'))
    print('Recall score: ', format(recall_score(y_test, y_pred_class), '.3f'))
    print('F1 score: ', format(f1_score(y_test, y_pred_class), '.3f'))

In [None]:
import xgboost as xgb
from sklearn.svm import SVC

def logiRegr(X_train, y_train, X_test, y_test,**kwargs):
    # Instantiate model. Use kwargs to pass parameters.
    # Pass GridSearch best_params with ** to unpack.
    logreg = LogisticRegression(random_state=1,**kwargs)
    # Fit to training data.
    logreg.fit(X_train, y_train)
    # Examine coefficients
    pprint.pprint(list(zip(X_train.columns,logreg.coef_[0])))
    # Class predictions (not predicted probabilities)
    y_pred_class = logreg.predict(X_test)
    # Scoring metrics
    scores(y_test, y_pred_class)
    # Plot confusion matrix
    plot_cm(y_test,y_pred_class)
    
def randomForest(X_train, y_train, X_test, y_test,**kwargs):
    # Instantiate model. Use kwargs to pass parameters.
    # Pass GridSearch best_params with ** to unpack.
    rf = RandomForestClassifier(random_state=1, **kwargs) 
    # Fit to training data.
    rf.fit(X_train,y_train)
    # Class predictions
    y_pred_class = rf.predict(X_test)
    # Scoring metrics
    scores(y_test, y_pred_class)
    # Confusion matrix
    plot_cm(y_test,y_pred_class)
    
def xgbClass(X_train, y_train, X_test, y_test,**kwargs):
    # Instantiate model. Use kwargs to pass parameters.
    # Pass GridSearch best_params with ** to unpack.
    xg = xgb.XGBClassifier(seed=1,**kwargs)
    # Fit to training data.
    xg.fit(X_train,y_train)
    # Class predictions
    y_pred_class = xg.predict(X_test)
    # Scoring metrics
    scores(y_test, y_pred_class)
    # Confusion matrix
    plot_cm(y_test,y_pred_class)
    
def svmClass(X_train, y_train, X_test, y_test, **kwargs):
    # Instantiate model. Use kwargs to pass parameters.
    # Pass GridSearch best_params with ** to unpack.
    svm = SVC(random_state=1,**kwargs)
    # Fit to training data.
    svm.fit(X_train, y_train)
    # Class predictions
    y_pred_class = svm.predict(X_test)
    # Scoring metrics
    scores(y_test, y_pred_class)
    # Plot confusion matrix
    plot_cm(y_test,y_pred_class)

In [None]:
logiRegr(X_train, y_train, X_test, y_test)

In [None]:
randomForest(X_train, y_train, X_test, y_test)

In [None]:
xgbClass(X_train, y_train, X_test, y_test)