In [786]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [787]:
df_sessions = pd.read_csv('takehome_user_engagement.csv')

In [788]:
df_users = pd.read_csv('takehome_users.csv')

In [789]:
df_sessions.head(10).tail(5).style.hide_index()

time_stamp,user_id,visited
2013-12-31 03:45:04,2,1
2014-01-08 03:45:04,2,1
2014-02-03 03:45:04,2,1
2014-02-08 03:45:04,2,1
2014-02-09 03:45:04,2,1


In [790]:
df_sessions.describe()

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


In [791]:
df_sessions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


Making a modification to the dataframe for sessions:

In [792]:
df_sessions['time_stamp'] = pd.to_datetime(df_sessions['time_stamp'])
df_sessions['time_stamp'].head(0)

Series([], Name: time_stamp, dtype: datetime64[ns])

In [793]:
df_users.tail().style.hide_index()

object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1378448055.0,0,0,89,8263.0
11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1358274517.0,0,0,200,
11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1398602716.0,1,1,83,8074.0
11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1338638159.0,0,0,6,
12000,2014-01-26 08:57:12,Lima Tha�s,ThaisMeloLima@hotmail.com,SIGNUP,1390726632.0,0,1,0,


In [794]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Renaming a particular column for the users provides more uniformity of terminology:

In [795]:
df_users.rename(columns = {'object_id':'user_id'}, inplace = True)

Now to replace the column for 'creation_time' with a corresponding series that is more compatible with machine learning models:

In [796]:
df_users['creation_time'] = pd.to_datetime(df_users['creation_time'])
df_users['creation_time'].head(0)

Series([], Name: creation_time, dtype: datetime64[ns])

In [797]:
time_max = max(np.array([df_sessions['time_stamp'].max(), df_users['creation_time'].max()]))

The cell above calculates the most recent datetime between the two dataframes.  How long before this point in time would be a good way to express the user's account creation time.

In [798]:
df_users['creation_time'] = time_max - df_users['creation_time']
df_users['creation_time'] = [df_users['creation_time'][i].total_seconds() for i in df_users.index]
df_users['creation_time'].head()

0     3927920.0
1    17579626.0
2    38331838.0
3    32942962.0
4    43649070.0
Name: creation_time, dtype: float64

Looking further into the users' dataframe:

In [799]:
df_users['creation_time'][0], df_users['creation_time'][1]

(3927920.0, 17579626.0)

In [800]:
df_users['creation_source'].unique()

array(['GUEST_INVITE', 'ORG_INVITE', 'SIGNUP', 'PERSONAL_PROJECTS',
       'SIGNUP_GOOGLE_AUTH'], dtype=object)

One hot encode the column for creation source:

In [801]:
dummies = pd.get_dummies(df_users['creation_source'])
df_users[dummies.columns] = dummies
df_users.drop('creation_source', axis=1, inplace=True)
df_users.tail().style.hide_index()

user_id,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
11996,23618675.0,Meier Sophia,SophiaMeier@gustr.com,1378448055.0,0,0,89,8263.0,0,1,0,0,0
11997,44224213.0,Fisher Amelie,AmelieFisher@gmail.com,1358274517.0,0,0,200,,0,0,0,0,1
11998,3464014.0,Haynes Jake,JakeHaynes@cuvox.de,1398602716.0,1,1,83,8074.0,1,0,0,0,0
11999,63601371.0,Faber Annett,mhaerzxp@iuxiw.com,1338638159.0,0,0,6,,0,0,1,0,0
12000,11340098.0,Lima Tha�s,ThaisMeloLima@hotmail.com,1390726632.0,0,1,0,,0,0,0,1,0


'df_users['last_session_creation_time']' is a redundant column.

In [802]:
df_users.drop('last_session_creation_time', axis=1, inplace=True)

The dataframe for users has a column called 'org_id'.  Is it needed?

In [803]:
len(df_users['org_id'].unique())

417

For this particular assignment, it is not practical to differentiate between this many organizational ID's of account holders.

In [804]:
df_users.drop('org_id', axis=1, inplace=True)

Conserning the column labeled 'invited_by_user_id', we don't need to know who did the inviting, just whether or not the account holder was invited by someone or some organization.  This has already been handled by the one hot encoding of 'creation_source'.

In [805]:
df_users.drop('invited_by_user_id', axis=1, inplace=True)
df_users.columns

Index(['user_id', 'creation_time', 'name', 'email', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'GUEST_INVITE', 'ORG_INVITE',
       'PERSONAL_PROJECTS', 'SIGNUP', 'SIGNUP_GOOGLE_AUTH'],
      dtype='object')

Now, an adjustment to 'df_sessions' to create a column for the number of days prior to the latest timestamp:

In [806]:
df_sessions['time difference'] = df_sessions['time_stamp'].max() - df_sessions['time_stamp']
df_sessions['time difference'] = [df_sessions['time difference'][i].days for i in df_sessions.index]

In [807]:
df_sessions['time difference'].head()

0     45
1    203
2    189
3    179
4    163
Name: time difference, dtype: int64

Here is a list comprehension of session ages by user, (making it a list of lists).  We are only interested in sessions that are no more than 7 days apart and that belong to users with at least three sessions.  For this reason, users with too few sessions will be given three sessions that are 8 days apart to set them up to be discarded.

In [808]:
session_ages_by_user = [df_sessions[df_sessions['user_id']==i]['time difference'] if \
                      len (df_sessions[df_sessions['user_id']==i].index) >=3 else [8, 0, 0]\
                      for i in df_users.index]

The instructions for this assignment describe frequent users as 'adoptive user'(s) which is awkward terminology.  The phrase used here in its place will be 'heavy user'.

In [809]:
df_users['heavy user'] = [[pd.Series(int(list(ages)[i-2] - list(ages)[i] <= 7) for i in range(2, len(ages))).max()][0] \
                          for ages in (session_ages_by_user)]

Now generating X and y for machine learning:

In [810]:
X = df_users.drop(['user_id', 'name', 'email', 'heavy user'], axis=1)
y = df_users['heavy user']

The instructions say to 'identify  which  factors  predict  future  user adoption', the criteria of which are addressed just above the first cell with 'df_users['heavy user']'.  They also recommend spending 1-2 hours with a suggestion of spending even less time.  With this in mind, a scalor object needs to be applied to X so that a simple logistic regression can solve the problem with a review of its coefficients.  Anything beyond that would be overkill for this task.

In [811]:
weights = LogisticRegression().fit(StandardScaler().fit_transform(X),y).coef_
weights

array([[ 0.0032669 , -0.03798208,  0.05582987,  0.03100581, -0.01084153,
        -0.04339821,  0.00568179,  0.02390399]])

Here are the contributing factors:

In [812]:
X_features = df_users.drop(['user_id', 'name', 'email', 'heavy user'], axis=1).columns
list(X_features)

['creation_time',
 'opted_in_to_mailing_list',
 'enabled_for_marketing_drip',
 'GUEST_INVITE',
 'ORG_INVITE',
 'PERSONAL_PROJECTS',
 'SIGNUP',
 'SIGNUP_GOOGLE_AUTH']

Now, to store the 'weights' in a simpler data structure:

In [813]:
list_of_weights = list((list(weights))[0])

Here are the features that are most predictive of heavy use occurring at least once and of heavy use never happening, respectively:

In [814]:
X_features[list_of_weights.index(np.max(weights))], X_features[list_of_weights.index(np.min(weights))]

('enabled_for_marketing_drip', 'PERSONAL_PROJECTS')

Here is a list going from worst to first of the contributing factors based on their impact on heavy use:

In [815]:
[X_features[list_of_weights.index(weight)] for weight in np.sort(np.array(list_of_weights))]

['PERSONAL_PROJECTS',
 'opted_in_to_mailing_list',
 'ORG_INVITE',
 'creation_time',
 'SIGNUP',
 'SIGNUP_GOOGLE_AUTH',
 'GUEST_INVITE',
 'enabled_for_marketing_drip']

Here is a list of factors that predict that there will not be heavy use in ascending order of 'damage' (going from 'mild to worse'):

In [816]:
[X_features[list_of_weights.index(weight)] for weight in -np.sort(-pd.DataFrame(list_of_weights)\
                                                                 [pd.DataFrame(list_of_weights)[0]<0][0].values)]

['ORG_INVITE', 'opted_in_to_mailing_list', 'PERSONAL_PROJECTS']

Here is a list of factors that are predictive of heavy use if not neutral.  These are in ascending order of favorability.

In [817]:
[X_features[list_of_weights.index(weight)] for weight in np.sort(pd.DataFrame(list_of_weights)\
                                                                 [pd.DataFrame(list_of_weights)[0]>=0][0].values)]

['creation_time',
 'SIGNUP',
 'SIGNUP_GOOGLE_AUTH',
 'GUEST_INVITE',
 'enabled_for_marketing_drip']