In [124]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_auc_score

In [12]:
user_data = pd.read_csv("takehome_users.csv", encoding='ISO-8859-1')
user_data.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [17]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [16]:
visits_data = pd.read_csv("takehome_user_engagement.csv")
visits_data.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [18]:
visits_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [19]:
visits_data["time_stamp"] = pd.to_datetime(visits_data["time_stamp"])
visits_data = visits_data.sort_values(by = ["user_id", "time_stamp"])

In [41]:
def check_adoption(df):
    df = df.sort_values()
    for i in range(len(df) - 2):
        if (df.iloc[i + 2] - df.iloc[i]).days <= 7:
            return 1
    return 0

In [42]:
adoption = visits_data.groupby("user_id")["time_stamp"].apply(check_adoption).reset_index()
adoption.columns = ["user_id", "adopted_label"]

In [44]:
adoption.head()

Unnamed: 0,user_id,adopted_label
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0


In [84]:
user_data_labeled = pd.merge(user_data, adoption, how = "left", left_on = "object_id", right_on = "user_id")
user_data_labeled.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,adopted_label
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1.0,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2.0,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,3.0,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,4.0,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,5.0,0.0


In [85]:
user_data_labeled.drop(columns = ["last_session_creation_time", "user_id"], inplace = True)

In [86]:
user_data_labeled.adopted_label.value_counts()

adopted_label
0.0    7167
1.0    1656
Name: count, dtype: int64

In [87]:
condition = (user_data_labeled.adopted_label == 0) | (user_data_labeled.adopted_label == 1)
user_data_labeled = user_data_labeled[condition]

In [88]:
user_data_labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   8823 non-null   int64  
 1   creation_time               8823 non-null   object 
 2   name                        8823 non-null   object 
 3   email                       8823 non-null   object 
 4   creation_source             8823 non-null   object 
 5   opted_in_to_mailing_list    8823 non-null   int64  
 6   enabled_for_marketing_drip  8823 non-null   int64  
 7   org_id                      8823 non-null   int64  
 8   invited_by_user_id          4776 non-null   float64
 9   adopted_label               8823 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 758.2+ KB


In [89]:
user_data_labeled["invited_by_user_id"] = user_data_labeled["invited_by_user_id"].apply(lambda x: 1 if x > 0 else 0)

In [90]:
user_data_labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   8823 non-null   int64  
 1   creation_time               8823 non-null   object 
 2   name                        8823 non-null   object 
 3   email                       8823 non-null   object 
 4   creation_source             8823 non-null   object 
 5   opted_in_to_mailing_list    8823 non-null   int64  
 6   enabled_for_marketing_drip  8823 non-null   int64  
 7   org_id                      8823 non-null   int64  
 8   invited_by_user_id          8823 non-null   int64  
 9   adopted_label               8823 non-null   float64
dtypes: float64(1), int64(5), object(4)
memory usage: 758.2+ KB


In [91]:
user_data_labeled.invited_by_user_id.value_counts()

invited_by_user_id
1    4776
0    4047
Name: count, dtype: int64

In [99]:
user_data_labeled["creation_time"] = pd.to_datetime(user_data_labeled["creation_time"])
user_data_labeled["creation_year"] = user_data_labeled["creation_time"].dt.year
user_data_labeled["creation_month"] = user_data_labeled["creation_time"].dt.month
user_data_labeled["creation_day"] = user_data_labeled["creation_time"].dt.day
user_data_labeled["creation_weekday"] = user_data_labeled["creation_time"].dt.weekday
user_data_labeled["creation_hour"] = user_data_labeled["creation_time"].dt.hour

In [103]:
user_data_labeled.drop(columns = ["object_id", "creation_time", "name", "email"], inplace = True)

In [104]:
model_data = pd.get_dummies(user_data_labeled)

In [106]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   opted_in_to_mailing_list            8823 non-null   int64  
 1   enabled_for_marketing_drip          8823 non-null   int64  
 2   org_id                              8823 non-null   int64  
 3   invited_by_user_id                  8823 non-null   int64  
 4   adopted_label                       8823 non-null   float64
 5   creation_year                       8823 non-null   int32  
 6   creation_month                      8823 non-null   int32  
 7   creation_day                        8823 non-null   int32  
 8   creation_weekday                    8823 non-null   int32  
 9   creation_hour                       8823 non-null   int32  
 10  creation_source_GUEST_INVITE        8823 non-null   bool   
 11  creation_source_ORG_INVITE          8823 non-nu

# Modelling

In [107]:
model_data.columns

Index(['opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id',
       'invited_by_user_id', 'adopted_label', 'creation_year',
       'creation_month', 'creation_day', 'creation_weekday', 'creation_hour',
       'creation_source_GUEST_INVITE', 'creation_source_ORG_INVITE',
       'creation_source_PERSONAL_PROJECTS', 'creation_source_SIGNUP',
       'creation_source_SIGNUP_GOOGLE_AUTH'],
      dtype='object')

In [111]:
X = model_data.drop(columns = ["adopted_label"])
y = model_data["adopted_label"]

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 19)

In [113]:
clf = RandomForestClassifier(n_estimators = 100, max_depth = 6)
clf.fit(X_train, y_train)

In [117]:
predictions = clf.predict(X_test)

In [122]:
print(f"Accuracy Score: {accuracy_score(y_test, predictions):.2%}")
print(f"AUC ROC: {roc_auc_score(y_test, predictions):.2f}")

Accuracy Score: 80.83%
AUC ROC: 0.50


In [127]:
importances = clf.feature_importances_
columns = X_train.columns

In [134]:
df = pd.DataFrame({
    'Feature': columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

In [137]:
df

Unnamed: 0,Feature,Importance
0,org_id,0.256136
1,creation_month,0.177545
2,creation_day,0.119254
3,creation_year,0.117909
4,creation_hour,0.116263
5,creation_weekday,0.056699
6,creation_source_GUEST_INVITE,0.040636
7,creation_source_SIGNUP,0.022489
8,opted_in_to_mailing_list,0.020963
9,enabled_for_marketing_drip,0.016238


## Final Answer

org_id (0.256136): This feature has the highest importance score, contributing over 25% to the model's predictive power. This suggests that the organization ID plays a significant role in determining the adoption. It could indicate strong differences between how users behave across different organizations.

creation_month (0.177545): The second most important feature. The month in which something was created seems to have a notable impact, possibly indicating seasonal trends or business cycles.

creation_day (0.119254) and creation_year (0.117909): These two features also have a high importance, showing that both the specific day and year of creation significantly affect the outcome. This could be related to changes over time, possibly reflecting when a service or product was more or less popular or operational changes within certain periods.

creation_hour (0.116263): The time of day when something was created also has an impact, albeit slightly lower than the date-related features. This might reflect user behavior patterns that vary depending on the time of day (e.g., users being more active at certain hours).