# Relax Inc. Take Home Challenge

Report prepard by Ben Chamblee: https://github.com/Bench-amblee 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from datetime import datetime

#modeling
from sklearn.preprocessing import  LabelEncoder
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef as MCC

users = pd.read_csv('takehome_users.csv', encoding = 'ISO-8859-1')
engagement = pd.read_csv('takehome_user_engagement.csv')

In [2]:
#inspect the data
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


Since we have no way of guessing or predicting what NaN values are, we'll just fill them in with 0

In [4]:
users = users.fillna(0)

In [5]:
# convert timestamp to datetime
users.last_session_creation_time = users.last_session_creation_time.map(lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
users.last_session_creation_time = pd.to_datetime(users.last_session_creation_time)
users.creation_time = pd.to_datetime(users.creation_time)

In [6]:
#we can determine last log time by subtracting creation and last session

last_log_days = users.last_session_creation_time - users.creation_time

days = []
for i in last_log_days:
    val = i.days
    if val < 0:
        days.append(0)
    else:
        days.append(val)

users['last_log_days'] = days

In [7]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [8]:
# convert time_stamp to datetime
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

In [9]:
#this function gets a count of each group

def get_count(group, freq):
    return group.rolling(freq, on='time_stamp')['user_id'].count()

engagement['visits_7_days'] = engagement.groupby('user_id', as_index=False, group_keys=False).apply(get_count, '7D')

In [10]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited,visits_7_days
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0


In [11]:
# we need to create a list that shows when the user_id is present in
# engagement, called 'adopted_user'

engagement_counts = []

for index, row in engagement.iterrows():
    if row.visits_7_days >= 3.0:
        if row.user_id not in engagement_counts:
            engagement_counts.append(row.user_id)

adopted_user = []

for i in users.object_id:
    if i in engagement_counts:
        adopted_user.append(1)
    else:
        adopted_user.append(0)
        
users['adopted_user'] = adopted_user

users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,last_log_days,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-21 22:53:30,1,0,11,10803.0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-30 22:45:04,0,0,1,316.0,135,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 18:14:52,0,0,94,1525.0,0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 03:09:28,0,0,1,5151.0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 04:14:20,0,0,193,5240.0,4,0


In [12]:
print('Number of adopted_users: ' + str((users.adopted_user == 1).sum()))


Number of adopted_users: 1602


In [13]:
#create an encoder for source column
LE = LabelEncoder()
users['creation_source_code'] = LE.fit_transform(users.creation_source)

In [14]:
#define our variables
y = users.adopted_user
final_data = users.drop(['object_id', 'name', 'email', 'creation_source', 'creation_time', 'last_session_creation_time', 'adopted_user'], axis=1)
final_cols = final_data.columns

X = minmax_scale(final_data)

In [15]:
#create train test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=10)

In [16]:
# first model attempt

svm = SVC(kernel='linear', random_state=10, class_weight='balanced')

svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)

In [17]:
#first prediction
y_trainpredSVM = svm.predict(X_train)

print(classification_report(y_train, y_trainpredSVM))
print(MCC(y_train, y_trainpredSVM))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      7279
           1       0.83      0.94      0.88      1121

    accuracy                           0.97      8400
   macro avg       0.91      0.96      0.93      8400
weighted avg       0.97      0.97      0.97      8400

0.8678672490115139


In [18]:
#same but for test set
ypredSVM = svm.predict(X_test)

print(classification_report(y_test, ypredSVM))
print(MCC(y_test, ypredSVM))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      3119
           1       0.81      0.96      0.88       481

    accuracy                           0.96      3600
   macro avg       0.90      0.96      0.93      3600
weighted avg       0.97      0.96      0.97      3600

0.8606137026688333


This is a good start, but we can improve this by taking feature importance into account

In [19]:
# feature importance
pd.Series(svm.coef_[0], index=final_cols)

opted_in_to_mailing_list       0.007971
enabled_for_marketing_drip    -0.011366
org_id                         0.057643
invited_by_user_id            -0.001985
last_log_days                 18.933494
creation_source_code          -0.010192
dtype: float64

Looks like last_log_days is the most important feature by far. Removing that would only make our models worse.

With this in mind I can safely determine that last_log_days is one of, if not the most important factor when predicting user adoption.