In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler

pd.options.mode.chained_assignment = None #so that when I slice the data, it doesn't give a warning

**Using user engagement dataset to find adopted user**

In [2]:
# Load the user engagement log; the adopted-user label is derived from it below.
engagement_csv = '...takehome_user_engagement.csv'
df = pd.read_csv(engagement_csv)

In [3]:
# Preview the first rows of the raw engagement data.
df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [4]:
# Parse the time_stamp strings into datetimes so the column can be sorted and resampled by time.
parsed_timestamps = pd.to_datetime(df['time_stamp'])
df['time_stamp'] = parsed_timestamps

In [5]:
# Chronological copy of the engagement log (oldest visit first).
df_sorted = df.sort_values('time_stamp', ascending=True)

In [6]:
# Check the earliest records after sorting by timestamp.
df_sorted.head()

Unnamed: 0,time_stamp,user_id,visited
178140,2012-05-31 08:20:06,10012,1
59486,2012-05-31 15:47:36,3428,1
175638,2012-05-31 17:19:37,9899,1
26821,2012-05-31 21:58:33,1693,1
109716,2012-06-01 00:17:30,6102,1


In [436]:
# Weekly visit totals per user. 'W-WED' anchors each weekly bin to end on a
# Wednesday, so bins run Thursday-Wednesday and line up with the first recorded
# visit (Thursday 2012-05-31).
# NOTE(review): these are fixed calendar bins, not a rolling 7-day window, so
# visits that straddle a bin boundary are counted in separate weeks.
visits_indexed = df_sorted.set_index('time_stamp')
per_user_visits = visits_indexed.groupby('user_id')['visited']
df_1 = per_user_visits.resample('W-WED').sum()

In [448]:
# Turn the (user_id, week) index back into ordinary columns, then order the
# weekly rows chronologically.
df_2 = df_1.reset_index(drop=False)
df_3 = df_2.sort_values('time_stamp', ascending=True)
df_3.head()

Unnamed: 0,user_id,time_stamp,visited
23477,3428,2012-06-06,1
77738,11297,2012-06-06,1
61628,8991,2012-06-06,1
46122,6645,2012-06-06,1
13836,2066,2012-06-06,1


In [450]:
# Flag every user-week: 1.0 when the week holds at least 3 recorded visits,
# 0.0 otherwise (stored as float).
meets_weekly_threshold = df_3['visited'] >= 3
df_3['adopted_user'] = meets_weekly_threshold.astype(float)

In [451]:
# Order rows by user and, within each user, put any adopted week (1.0) ahead of
# the non-adopted ones (0.0).
df_4 = df_3.sort_values(by=['user_id', 'adopted_user'], ascending=[True, False])
df_4.head()

Unnamed: 0,user_id,time_stamp,visited,adopted_user
0,1,2014-04-23,1,0.0
1,2,2013-11-20,1,0.0
2,2,2013-11-27,0,0.0
3,2,2013-12-04,1,0.0
4,2,2013-12-11,1,0.0


In [443]:
# Keep a single row per user. Adopted weeks were sorted first, so the retained
# row carries 1.0 whenever the user had at least one adopted week.
df_5 = df_4.drop_duplicates(subset='user_id', keep='first')
df_5.head()


In [452]:
# Count how many users are labelled adopted (1.0) versus not adopted (0.0).
df_5['adopted_user'].value_counts()

0.0    7378
1.0    1445
Name: adopted_user, dtype: int64

**Merge the adopted user data with the user details data**

In [453]:
# Load the user details table.
# NOTE(review): engine='python' is kept from the original call; presumably it
# works around a parsing/encoding problem with the default parser -- confirm.
users_csv = '...takehome_users.csv'
user = pd.read_csv(users_csv, engine='python')

In [454]:
# Preview the user details table.
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [233]:
# Give the user table the same key name (user_id) as the adopted-user data so
# the two can be joined on it.
key_rename = {'object_id': 'user_id'}
user = user.rename(columns=key_rename)

In [234]:
# Confirm that the key column is now named user_id.
user.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [235]:
# Attach the user details to each adopted-user row. This is a left join on
# user_id with df_5 as the left table, so every row of df_5 is kept.
df_all = pd.merge(df_5, user, how='left', on='user_id')

In [236]:
# Preview the merged frame (adoption label plus user details).
df_all.head()

Unnamed: 0,user_id,time_stamp,visited,adopted_user,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-23,1,0.0,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-20,1,0.0,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-20,1,0.0,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-22,1,0.0,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-23,1,0.0,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [237]:
# Inspect column dtypes and non-null counts of the merged frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 8822
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   user_id                     8823 non-null   int64         
 1   time_stamp                  8823 non-null   datetime64[ns]
 2   visited                     8823 non-null   int64         
 3   adopted_user                8823 non-null   float64       
 4   creation_time               8823 non-null   object        
 5   name                        8823 non-null   object        
 6   email                       8823 non-null   object        
 7   creation_source             8823 non-null   object        
 8   last_session_creation_time  8823 non-null   float64       
 9   opted_in_to_mailing_list    8823 non-null   int64         
 10  enabled_for_marketing_drip  8823 non-null   int64         
 11  org_id                      8823 non-null   int64       

**Preparing data to have appropriate features**

In [238]:
# Convert the last-session timestamp from Unix epoch seconds to a datetime.
# (The unused mid-notebook `from datetime import datetime as dt` was removed;
# pd.to_datetime is all this cell needs.)
df_all['last_visit'] = pd.to_datetime(df_all['last_session_creation_time'], unit='s')

In [239]:
# Parse the account creation time strings into datetimes.
parsed_creation_time = pd.to_datetime(df_all['creation_time'])
df_all['creation_time'] = parsed_creation_time

In [240]:
# Verify that creation_time now has a datetime dtype.
df_all['creation_time'].head()

0   2014-04-22 03:53:30
1   2013-11-15 03:45:04
2   2013-03-19 23:14:52
3   2013-05-21 08:09:28
4   2013-01-17 10:14:20
Name: creation_time, dtype: datetime64[ns]

In [241]:
# Check the converted last-visit timestamps.
df_all['last_visit'].head()

0   2014-04-22 03:53:30
1   2014-03-31 03:45:04
2   2013-03-19 23:14:52
3   2013-05-22 08:09:28
4   2013-01-22 10:14:20
Name: last_visit, dtype: datetime64[ns]

In [242]:
# Tenure of each user: elapsed time between account creation and the last
# recorded session.
df_all['time_with_company'] = df_all['last_visit'].sub(df_all['creation_time'])

In [243]:
# Inspect the computed tenure values (timedeltas).
df_all['time_with_company'].head()

0     0 days
1   136 days
2     0 days
3     1 days
4     5 days
Name: time_with_company, dtype: timedelta64[ns]

In [244]:
# Number of users per signup source (creation_source).
df_all['creation_source'].value_counts()

ORG_INVITE            3188
SIGNUP                1898
GUEST_INVITE          1588
SIGNUP_GOOGLE_AUTH    1385
PERSONAL_PROJECTS      764
Name: creation_source, dtype: int64

In [245]:
# Distribution of the mailing-list opt-in flag.
df_all['opted_in_to_mailing_list'].value_counts()

0    6597
1    2226
Name: opted_in_to_mailing_list, dtype: int64

In [246]:
# Number of users per organisation id.
df_all['org_id'].value_counts()

0      228
1      172
2      150
3      125
4      122
      ... 
322      6
400      6
397      5
386      4
416      2
Name: org_id, Length: 417, dtype: int64

In [247]:
# Distribution of the marketing-drip flag.
df_all['enabled_for_marketing_drip'].value_counts()

0    7482
1    1341
Name: enabled_for_marketing_drip, dtype: int64

In [285]:
# Extract the domain (the text after the first '@') of every email address.
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it
# positionally; the vectorised .str accessor replaces the manual append loop.
# An address without '@' now yields a missing value instead of an IndexError.
df_all['email1'] = df_all['email'].str.split('@', n=1).str[1]

# Number of users per email domain.
df_all['email1'].value_counts()

gmail.com         2930
yahoo.com         1726
jourrapide.com     883
gustr.com          842
cuvox.de           828
                  ... 
eqrnp.com            1
nxbku.com            1
pybhg.com            1
xdxgl.com            1
ripze.com            1
Name: email1, Length: 812, dtype: int64

In [305]:
# Coarse email categories: the five named domains below keep their own label
# and every other domain is pooled into 'Other'.
label_by_domain = {
    'gmail.com': 'gmail',
    'yahoo.com': 'yahoo',
    'jourrapide.com': 'jourrapide',
    'gustr.com': 'gustr',
    'cuvox.de': 'cuvox',
}
email_cat = [label_by_domain.get(domain, 'Other') for domain in df_all['email1']]

In [307]:
# Store the email category labels built in the previous cell as a new column, df_all['email_cat'].
df_all['email_cat'] = email_cat

In [482]:
# Per-domain adoption summary: number of adopted users ('sum'), number of
# users ('count') and the adoption rate in percent.
df_e = df_all.groupby('email1')['adopted_user'].agg(['sum', 'count'])

adoption_rate = df_e['sum'].div(df_e['count']).mul(100)
df_e['adoption_rate'] = adoption_rate

# Domains with the highest adoption rate first.
df_e.sort_values('adoption_rate', ascending=False)

Unnamed: 0_level_0,sum,count,adoption_rate
email1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ungoe.com,1.0,1,100.0
evumx.com,1.0,1,100.0
qwkmc.com,1.0,1,100.0
lvwyf.com,1.0,1,100.0
qwmul.com,1.0,1,100.0
...,...,...,...
jnohq.com,0.0,1,0.0
jqevt.com,0.0,1,0.0
jrggl.com,0.0,1,0.0
jvcvb.com,0.0,1,0.0


In [293]:
# Coarse organisation categories: org ids 0 through 7 keep their own (string)
# label and every other org id is pooled into 'Other'.
own_label_by_org = {org_id: str(org_id) for org_id in range(8)}
org_cat = [own_label_by_org.get(org_id, 'Other') for org_id in df_all['org_id']]
    

In [483]:
# Per-organisation adoption summary: number of adopted users ('sum'), number
# of users ('count') and the adoption rate in percent.
df_group = df_all.groupby('org_id')['adopted_user'].agg(['sum', 'count'])

adoption_rate = df_group['sum'].div(df_group['count']).mul(100)
df_group['adoption_rate'] = adoption_rate

# Largest organisations first.
df_group.sort_values('count', ascending=False)


Unnamed: 0_level_0,sum,count,adoption_rate
org_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,10.0,228,4.385965
1,13.0,172,7.558140
2,13.0,150,8.666667
3,13.0,125,10.400000
4,14.0,122,11.475410
...,...,...,...
396,0.0,6,0.000000
322,2.0,6,33.333333
397,1.0,5,20.000000
386,0.0,4,0.000000


In [294]:
# Store the organisation category labels (the org_cat list built earlier) as a new column, df_all['org_cat'].
df_all['org_cat'] = org_cat

In [295]:
# Number of users in each organisation category.
df_all['org_cat'].value_counts()

Other    7749
0         228
1         172
2         150
3         125
4         122
6         102
7          88
5          87
Name: org_cat, dtype: int64

In [296]:
# invited_by_user_id holds the id of the inviting user; count how many users each inviter brought in.
df_all['invited_by_user_id'].value_counts()

4612.0     10
11770.0     9
2308.0      9
1525.0      8
6808.0      8
           ..
3689.0      1
8847.0      1
11961.0     1
743.0       1
1600.0      1
Name: invited_by_user_id, Length: 2229, dtype: int64

In [297]:
# Referral indicator: 1.0 when an inviting user id is recorded, 0.0 when it is
# missing (stored as float here).
was_invited = df_all['invited_by_user_id'].notnull()
df_all['invited_by_someone'] = was_invited.astype(float)

In [298]:
# Count invited (1) versus not invited (0) users.
df_all['invited_by_someone'].value_counts()

1    4776
0    4047
Name: invited_by_someone, dtype: int64

In [299]:
# Store the referral indicator with an integer dtype.
df_all['invited_by_someone'] = df_all['invited_by_someone'].astype(int)

In [300]:
# Re-check the counts after the integer cast.
df_all['invited_by_someone'].value_counts()

1    4776
0    4047
Name: invited_by_someone, dtype: int64

Creating dummy variables for categorical variables

In [582]:
# One-hot encode the categorical predictors. The raw org_id and email domain
# are encoded here rather than the coarser org_cat / email_cat groupings.
categorical_columns = ['creation_source', 'org_id', 'email1']
df_all1 = pd.get_dummies(df_all, columns=categorical_columns, dtype='int')

In [583]:
# Preview the frame after one-hot encoding.
df_all1.head()

Unnamed: 0,user_id,time_stamp,visited,adopted_user,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,...,email1_zjwjb.com,email1_zkbxm.com,email1_zkbzv.com,email1_zkcdj.com,email1_zkcep.com,email1_zkdih.com,email1_zpbkw.com,email1_zpcop.com,email1_zsrgb.com,email1_zssin.com
0,1,2014-04-23,1,0.0,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,1398139000.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2013-11-20,1,0.0,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,1396238000.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,2013-03-20,1,0.0,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,1363735000.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,2013-05-22,1,0.0,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,1369210000.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,2013-01-23,1,0.0,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,1358850000.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [584]:
# Check the size and dtypes of the encoded frame.
df_all1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 8822
Columns: 1251 entries, user_id to email1_zssin.com
dtypes: datetime64[ns](3), float64(4), int32(1235), int64(4), object(4), timedelta64[ns](1)
memory usage: 43.0+ MB


In [585]:
# Express the tenure as a number of whole days. The .dt accessor reads the day
# component straight from the timedelta. This replaces a round trip through
# strings whose str.split(expand=True) result has several columns and cannot
# be assigned to a single column on current pandas.
df_all1['time_with_company'] = df_all1['time_with_company'].dt.days.astype(float)

# Compare the average and minimum tenure (in days) of non-adopted (0.0) and adopted (1.0) users.
print(df_all1.groupby('adopted_user')['time_with_company'].mean())
print(df_all1.groupby('adopted_user')['time_with_company'].min())

adopted_user
0.0      7.197343
1.0    324.854671
Name: time_with_company, dtype: float64
adopted_user
0.0    0.0
1.0    4.0
Name: time_with_company, dtype: float64


It turns out that adopted users stay for about 325 days on average, whereas non-adopted users stay for only about 7 days on average. Even though this feature clearly differentiates adopters from non-adopters, we can't use it to predict 'future' users because it is time bound: it is only known after a user has already been active for some time.
Hence, for prediction, it's better not to use it and instead to check whether other features can explain which users become adopted users.

In [587]:
# Drop identifiers, raw timestamps, the time-bound tenure feature and the
# helper columns that are not used as predictors.
# 'e_name' is not created in any visible cell of this notebook, so
# errors='ignore' keeps a fresh-kernel run (and a re-run of this cell) from
# failing with a KeyError on columns that are absent.
columns_to_drop = ['time_stamp', 'visited', 'creation_time', 'name', 'email',
                   'last_session_creation_time', 'invited_by_user_id', 'last_visit',
                   'user_id', 'time_with_company', 'org_cat', 'e_name', 'email_cat']
df_all1 = df_all1.drop(columns=columns_to_drop, errors='ignore')


In [588]:
# Preview the modelling frame after dropping the unused columns.
df_all1.head()

Unnamed: 0,adopted_user,opted_in_to_mailing_list,enabled_for_marketing_drip,invited_by_someone,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,org_id_0,...,email1_zjwjb.com,email1_zkbxm.com,email1_zkbzv.com,email1_zkcdj.com,email1_zkcep.com,email1_zkdih.com,email1_zpbkw.com,email1_zpcop.com,email1_zsrgb.com,email1_zssin.com
0,0.0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [589]:
# Store the target with an integer dtype.
df_all1['adopted_user'] =df_all1['adopted_user'].astype('int')

In [590]:
# Confirm the dtypes and column count of the modelling frame.
df_all1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 8822
Columns: 1238 entries, adopted_user to email1_zssin.com
dtypes: int32(1236), int64(2)
memory usage: 42.1 MB


**Predicting adopted user with the features selected**

In [591]:
# Separate the target label (y) from the predictor columns (X).
target_column = 'adopted_user'
y = df_all1[target_column]
X = df_all1.drop(columns=[target_column])

In [592]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible.
# NOTE(review): the split is not stratified on the imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [593]:
# Class counts of the target in the training split, then in the test split.
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

0    5906
1    1152
Name: adopted_user, dtype: int64
0    1472
1     293
Name: adopted_user, dtype: int64


In [594]:
# Preview the training predictors.
X_train.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,invited_by_someone,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,org_id_0,org_id_1,...,email1_zjwjb.com,email1_zkbxm.com,email1_zkbzv.com,email1_zkcdj.com,email1_zkcep.com,email1_zkdih.com,email1_zpbkw.com,email1_zpcop.com,email1_zsrgb.com,email1_zssin.com
7531,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8348,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3153,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2213,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6045,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [595]:
# Baseline model: logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Hard class predictions, computed once per split and reused for every metric below.
y_pred_train = lr.predict(X_train)
y_pred = lr.predict(X_test)

# Positive-class probabilities and the ROC curve points for the train and test splits.
y_pred_prob = lr.predict_proba(X_train)[:, 1]
y_pred_prob1 = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, y_pred_prob)
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_prob1)

# Metrics are called as (y_true, y_pred), matching scikit-learn's signature.
print('Logistic regression_train_accuracy is {0:.3f}'.format(accuracy_score(y_train, y_pred_train)))
print('Logistic regression_test_accuracy is {0:.3f}'.format(accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))

Logistic regression_train_accuracy is 0.838
Logistic regression_test_accuracy is 0.833
[[1470    2]
 [ 293    0]]


**Using Random Forest Classifier**

In [596]:
# Random forest with default hyperparameters (RandomForestClassifier is
# imported in the notebook's first cell). class_weight='balanced_subsample'
# re-weights the classes inside each tree's bootstrap sample to offset the
# imbalance between 1s and 0s.
rf = RandomForestClassifier(random_state=19, class_weight='balanced_subsample')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print('Accuracy on training set = {}'.format(rf.score(X_train, y_train)))
print('Accuracy on test set = {}'.format(rf.score(X_test, y_test)))

Accuracy on training set = 0.9455936525928025
Accuracy on test set = 0.7943342776203967


In [605]:
# Test-set confusion matrix and per-class precision/recall for the untuned forest.
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_confusion)
print(rf_report)

[[1376   96]
 [ 267   26]]
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      1472
           1       0.21      0.09      0.13       293

    accuracy                           0.79      1765
   macro avg       0.53      0.51      0.50      1765
weighted avg       0.73      0.79      0.76      1765



With this RF model, train accuracy = 0.95 and test accuracy = 0.79, which indicates overfitting. Also, the model identified only 26 of the 293 adopters in the test set correctly, i.e. the recall for adopters was only 0.09. To improve prediction, especially of the adopters, RF with hyperparameter tuning was used next.

In [598]:
# Hyperparameter grid for tuning the random forest.
# GridSearchCV is already imported in the notebook's first cell, so the
# duplicate import that used to sit here was removed.
params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [3, 5],
    'min_samples_leaf': [1, 2, 4],
}



In [599]:
# Exhaustive 5-fold cross-validated search over `params`.
# RandomForestClassifier is already imported by an earlier cell, so the
# duplicate import that used to sit here was removed.
# NOTE(review): the search tunes an unweighted forest for accuracy, whereas the
# final model is fit with class_weight='balanced_subsample' and judged on the
# recall of adopters -- consider searching the weighted estimator with a
# recall- or F1-based scorer.
rf = RandomForestClassifier(random_state=19)
gs = GridSearchCV(rf, params, cv=5, scoring='accuracy')
# Fit the gridsearch model
gs_fit = gs.fit(X_train, y_train)

In [578]:
# Hyperparameter combination selected by the grid search.
gs.best_params_

{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 100}

In [600]:
# Refit a class-weighted forest with the hyperparameters selected by the grid
# search. Reading them from gs.best_params_ keeps this model in sync with the
# search result instead of relying on values copied by hand from an earlier run.
rf_t = RandomForestClassifier(random_state=19, class_weight='balanced_subsample', **gs.best_params_)

rf_t_fit = rf_t.fit(X_train, y_train)

In [601]:
# Evaluate the tuned forest: test-set predictions plus mean accuracy on each split.
y_pred_rft = rf_t.predict(X_test)
train_accuracy = rf_t.score(X_train, y_train)
test_accuracy = rf_t.score(X_test, y_test)
print('Accuracy on training set = {}'.format(train_accuracy))
print('Accuracy on test set = {}'.format(test_accuracy))


Accuracy on training set = 0.7132332105412298
Accuracy on test set = 0.6781869688385269


In [604]:
# Test-set confusion matrix and per-class precision/recall for the tuned forest.
tuned_confusion = confusion_matrix(y_test, y_pred_rft)
tuned_report = classification_report(y_test, y_pred_rft)
print(tuned_confusion)
print(tuned_report)

[[1095  377]
 [ 191  102]]
              precision    recall  f1-score   support

           0       0.85      0.74      0.79      1472
           1       0.21      0.35      0.26       293

    accuracy                           0.68      1765
   macro avg       0.53      0.55      0.53      1765
weighted avg       0.75      0.68      0.71      1765



Now the recall of adopters has improved from 0.09 to 0.35, with 102 of the 293 adopters in the test set predicted correctly. With tuning, however, more non-adopters have been predicted as adopters, which reduced the recall of non-adopters from 0.93 to 0.74.

**Getting Feature Importances**

In [606]:
# Pair every feature name with its importance in the tuned forest.
F_imps = pd.DataFrame({'features': X_test.columns, 'Imps': rf_t.feature_importances_})
# Show the 20 features with the largest importance (ranked by absolute value, descending).
importance_order = F_imps['Imps'].abs().sort_values(ascending=False).index
F_imps.loc[importance_order].head(20)

Unnamed: 0,features,Imps
3,creation_source_GUEST_INVITE,0.060314
656,email1_hotmail.com,0.041463
2,invited_by_someone,0.031945
1187,email1_yahoo.com,0.029893
8,org_id_0,0.026771
4,creation_source_ORG_INVITE,0.021521
29,org_id_21,0.019286
6,creation_source_SIGNUP,0.019042
9,org_id_1,0.017507
10,org_id_2,0.017303


**Summary and Conclusion:**


Even though time_with_company (the user's history) is the strongest differentiator between adopters and non-adopters, we couldn't use it because in practice this feature is not available for a new customer when predicting 'future' adoption. Hence, we had to examine other factors such as org_id, email domain, whether the user signed up for marketing emails, signup source, etc. I tried to reduce the dimensionality by grouping organizations and email domains into org_cat and email_cat, but these groupings were not good enough to predict adopters; in the end I used the raw org_id and email domain as dummy variables, which helped identify more of the adopters. The tuned RF model improved the recall of adopters to some extent. The feature importances above show that variables such as creation_source_GUEST_INVITE, email1_hotmail.com, invited_by_someone, email1_yahoo.com, several org_id dummies and creation_source_ORG_INVITE were among the most important features for separating adopters from non-adopters in the RF model.

However, the metrics could probably still be improved with more relevant features, such as the customers' purpose for signing up, the time spent in each session, etc.
Another point to note is that the y variable is defined by a somewhat arbitrary rule, i.e. a user is an adopter if they have at least 3 visits in some calendar week. This threshold (and the choice of weekly window) could be adjusted to obtain a more accurate separation of adopters and non-adopters.