In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


In [2]:
users_en = pd.read_csv("C:\\Users\\hanna\\Downloads\\1481069814_relax_challenge\\relax_challenge\\takehome_user_engagement.csv")
users = pd.read_csv("C:\\Users\\hanna\\Downloads\\1481069814_relax_challenge\\relax_challenge\\takehome_users.csv", encoding="latin")

In [3]:
users_en.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [4]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


We see missing values in "last_session_creation_time" and "invited_by_user_id".

In [6]:
users_en.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


The common features of the two DataFrames are "Object_id" & "User_id". We need to groupby the user id to see how many times a person logged in and keep the ones where the user logged in at least 3 times in a 7 day cycle. However, first we would need to change the time columns from object to 'datetime64'. 

In [7]:
# Change Object_id to User_id in uses
users.rename(columns = {'object_id':'user_id'}, inplace = True)


#changing the date time format
users_en.time_stamp = users_en.time_stamp.astype('datetime64')
users.creation_time = users.creation_time.astype('datetime64')
users.last_session_creation_time = pd.to_datetime(users.last_session_creation_time, unit='s')

In [8]:
users.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0


In [9]:
users_en.head(10)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


Defining  an  "adopted  user"   as  a  user  who   has  logged  into  the  product  on  three  separate
days  in  at  least  one  seven­day  period ,  identify  which  factors  predict  future  user
adoption. We can groupby the user id and apply the weekly rolling occurences and keep ones where the adopted user logged in at least 3 times a week. 

In [10]:
def rolling(data, time):
    return data.rolling(time, on='time_stamp')['user_id'].count()

users_en['adopted_users'] = users_en.groupby('user_id', as_index = False, group_keys = False).apply(rolling, '7D')
users_en

Unnamed: 0,time_stamp,user_id,visited,adopted_users
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0
...,...,...,...,...
207912,2013-09-06 06:14:15,11996,1,1.0
207913,2013-01-15 18:28:37,11997,1,1.0
207914,2014-04-27 12:45:16,11998,1,1.0
207915,2012-06-02 11:55:59,11999,1,1.0


Now we create the adopted user column with respect to the question of the notebook.

In [11]:
adopted_count = users_en.groupby('user_id')[['adopted_users']].max()
adopted_count['log_in_count'] = adopted_count['adopted_users'].astype(int)
adopted_count.drop('adopted_users', axis = 1, inplace = True)


adopted_count.loc[adopted_count['log_in_count'] >= 3,'adopted_user'] = 1
adopted_count.loc[adopted_count['log_in_count'] < 3,'adopted_user'] = 0

adopted_count['adopted_user'] = adopted_count['adopted_user'].astype(int)

adopted_count.head(20)


Unnamed: 0_level_0,log_in_count,adopted_user
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,3,1
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
10,7,1
11,1,0
13,1,0


Joining the adopted count with the users Dataframe.

In [12]:
users_merged = users.join(adopted_count, on = 'user_id', how='left')


Before we move on to creating our training and test set, we can just have a column where we see if a person has been invited by user id or not, to do that, we will use 'invited_by_user_id' column to create a 0 where the row is empty and 1 if it not (invited by someone else).

In [13]:
situation = lambda row: 0 if np.isnan(row) else 1
users_merged["invited_by_user"] = users_merged["invited_by_user_id"].apply(situation)

users_merged.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,log_in_count,adopted_user,invited_by_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,1.0,0.0,1
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,3.0,1.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,1.0,0.0,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,1.0,0.0,1
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,1.0,0.0,1


Now let us pick out the columns for our training and test set, we will then get dummies and create a model to test its performance and do feature extraction. 

In [14]:
df = users_merged[["creation_source", "opted_in_to_mailing_list", "enabled_for_marketing_drip", \
                             "adopted_user", "invited_by_user"]]

df['adopted_user'] = df['adopted_user'].fillna(0)

df['adopted_user'] = df.adopted_user.astype(int)

df = pd.get_dummies(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['adopted_user'] = df['adopted_user'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['adopted_user'] = df.adopted_user.astype(int)


In [15]:
df.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted_user,invited_by_user,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,0,0,1,1,0,0,0,0
1,0,0,1,1,0,1,0,0,0
2,0,0,0,1,0,1,0,0,0
3,0,0,0,1,1,0,0,0,0
4,0,0,0,1,1,0,0,0,0


In [16]:
X = df.drop(['adopted_user'], axis = 1)
y = df['adopted_user']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.7, random_state = 2022)

In [17]:
print(y_train.value_counts(normalize = True))
print(y_test.value_counts(normalize = True))

0    0.866389
1    0.133611
Name: adopted_user, dtype: float64
0    0.866548
1    0.133452
Name: adopted_user, dtype: float64


Next up we will create a pipeline to do one hote encoding and run the RandomForestClassifier, and set a params grid. Then we will use Grid Search to run 3 folds, fit it on train data, and check for best parameters. 

In [21]:
import joblib as jb
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 10)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("Done")



X_smote.shape, y_smote.shape

Done


((6238, 8), (6238,))

In [40]:
rf = RandomForestClassifier()      
rf = rf.fit(X_smote, y_smote)

values = rf.feature_importances_
ind = np.argsort(values)[::-1]

feature = []
importance = list()
name = []

# Print the feature ranking
print("Feature ranking:")
for i in range(X_train.shape[1]):
#     print(f"{i+1}. feature #{ind[i]} ({values[ind[i]]})")
    feature.append(ind[i])
    importance.append(values[ind[i]])
    name.append(df.columns[ind[i]])
    
# Creating a DF with most important features    
feature_importance = pd.DataFrame({'feature': feature, 'importance': importance, 'name': name})
feature_importance.reset_index(drop=True, inplace=True)

feature_importance = pd.DataFrame(feature_importance.sort_values('importance', ascending = False).head(11))


print(feature_importance)

Feature ranking:
   feature  importance                               name
0        5    0.397056         creation_source_ORG_INVITE
1        0    0.161481           opted_in_to_mailing_list
2        7    0.138760             creation_source_SIGNUP
3        1    0.111946         enabled_for_marketing_drip
4        6    0.062040  creation_source_PERSONAL_PROJECTS
5        2    0.048348                       adopted_user
6        3    0.043829                    invited_by_user
7        4    0.036541       creation_source_GUEST_INVITE


After cleaning the data and extracting the features we want our model to be built upon, we see that original invites and being the mailing list rank highest, it is better to carry on with these traidional methods or marketing and mailing. 
