In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score


In [2]:
users =pd.read_csv('takehome_users.csv', encoding = 'latin')
user_engagement = pd.read_csv('takehome_user_engagement.csv')


In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [6]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


I will use the engagement data to identify users who log on at least 3 times in a seven day period. Then using those user_id's, I will use the main dataframe to identify features that predict if the user will be active. Initial thoughts are that creation_source, org_id, mailing_list, market_drip, and perhaps if they were invited (not by whom, just if they were) could be good predictors.

In [7]:
# Parse the unix time into readable datetime
#users.last_session_creation_time = pd.to_datetime(users.last_session_creation_time, unit='s')
#users.last_session_creation_time[1]

In [8]:
user_engagement["time_stamp"] = pd.to_datetime(user_engagement["time_stamp"])
user_engagement["time_stamp"][1]

Timestamp('2013-11-15 03:45:04')

In [9]:
user_engagement = user_engagement.set_index("time_stamp")
#setting timestamp as index

In [10]:
#Creating dataframe to count number of times each user accessed website within 1 week period
user_engagement_weekly_count = user_engagement.groupby(['user_id', pd.Grouper(freq='7D')]).sum()

#Filtering df to get users with 3 or more logins in a week -- these are the "adopted users"
user_engagement_weekly_count_fltr = user_engagement_weekly_count.loc[user_engagement_weekly_count["visited"]>=3, :]

#Extracting unique list of "adopted users"
user_engagement_weekly_count_fltr = user_engagement_weekly_count_fltr.reset_index()

adopted_users = user_engagement_weekly_count_fltr["user_id"].unique()

#Making a dataframe containing all adopted users
adopted_users_df = pd.DataFrame({"user_id":adopted_users, "user_adoption": 'yes'})
print("This is a df of adopted users:")
display(adopted_users_df.head())

This is a df of adopted users:


Unnamed: 0,user_adoption,user_id
0,yes,2
1,yes,10
2,yes,42
3,yes,43
4,yes,53


In [11]:
#3) Joining "adopted users" with "user" dataframe
#a) Performing 'outer join'
combined_df = pd.merge(users,adopted_users_df, left_on ='object_id', right_on = "user_id", how='outer')

#b) Filling nan's in 'user_adoption' column with 'no'
combined_df["user_adoption"].fillna("no", inplace=True)


#c) dropping 'user_id' column
combined_df = combined_df.drop("user_id", axis=1)
display(combined_df.head())

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_adoption
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,no
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,yes
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,no
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,no
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,no


In [12]:
#4) Determining most important features to predict if a user will be an 'adopted user' or not

#a) Dropping unnecessary columns
combined_df_model = combined_df.drop(["creation_time", 'name', 'email'], axis=1) #email & names could be 

#b) dropping all rows with nans
combined_df_model_d= combined_df_model.dropna()
# display(combined_df_model_d)

#c) dummifying categorical columns
combined_df_model_w_dum = pd.get_dummies(combined_df_model_d, drop_first=True)
combined_df_model_w_dum.head()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,creation_source_ORG_INVITE,user_adoption_yes
0,1,1398139000.0,1,0,11,10803.0,0,0
1,2,1396238000.0,0,0,1,316.0,1,1
2,3,1363735000.0,0,0,94,1525.0,1,0
3,4,1369210000.0,0,0,1,5151.0,0,0
4,5,1358850000.0,0,0,193,5240.0,0,0


In [13]:
#5) Feature Selection with Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


#a) Selectinf features and target variable
X= combined_df_model_w_dum.drop("user_adoption_yes", axis=1)
Y = combined_df_model_w_dum["user_adoption_yes"]

#b) Instantiating model & use RFE
model_logreg = LogisticRegression()
#RFE
rfe = RFE(model_logreg, 3)#calling RFE with model and number of best features we would like to keep

rfe = rfe.fit(X, Y) #fitting rfe with features and target variable

features_selected = rfe.support_ #indicates whether feature is selected or not (True or False)
features_selected_rank = rfe.ranking_ #gives ranking of feature (lower is better)


#Finding column names of features selected
orig_columns = X.columns.values #orignal column names in list form
#print(orig_columns)

selected_column_names = np.array(orig_columns) * features_selected

#cleaning up list of selected column names, removing empty strings
final_selected_column_names = [x for x in selected_column_names if len(x)>1]
print("These are the Features that have been selected via RFE:\n\n", final_selected_column_names)

These are the Features that have been selected via RFE:

 ['object_id', 'last_session_creation_time', 'invited_by_user_id']


The model has determined that object_id, last_session_creation_time, and invited_by_user_id, are indeed the top determining factors for if a user will remain a user.