In [34]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
users = pd.read_csv('C:/Users/benja/Dropbox/Ben Code/Springboard not GIT linked/1481069814_relax_challenge/relax_challenge/takehome_users.csv', encoding = "ISO-8859-1")
logins = pd.read_csv('C:/Users/benja/Dropbox/Ben Code/Springboard not GIT linked/1481069814_relax_challenge/relax_challenge/takehome_user_engagement.csv', encoding = "ISO-8859-1")

In [3]:
#Let's inspect the data a bit:
logins.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [4]:
#check data types
logins.dtypes

time_stamp    object
user_id        int64
visited        int64
dtype: object

In [5]:
#convert time_stamp to datetime object
logins['time_stamp'] = pd.to_datetime(logins['time_stamp'])

In [6]:
logins.dtypes

time_stamp    datetime64[ns]
user_id                int64
visited                int64
dtype: object

In [7]:
#Check for null values (if any)
logins.isnull().any()

time_stamp    False
user_id       False
visited       False
dtype: bool

In [8]:
#Make the time_stamp column the index
logins.set_index('time_stamp', inplace = True)

In [9]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [10]:
#Investigate data types
users.dtypes

object_id                       int64
creation_time                  object
name                           object
email                          object
creation_source                object
last_session_creation_time    float64
opted_in_to_mailing_list        int64
enabled_for_marketing_drip      int64
org_id                          int64
invited_by_user_id            float64
dtype: object

In [19]:
users['creation_time'] = pd.to_datetime(users['creation_time'])

In [20]:
#Check for null values by column
users.isnull().any()

object_id                     False
creation_time                 False
name                          False
email                         False
creation_source               False
last_session_creation_time     True
opted_in_to_mailing_list      False
enabled_for_marketing_drip    False
org_id                        False
invited_by_user_id             True
dtype: bool

In [12]:
#Count total null values in 'last_session_creation_time'
users['last_session_creation_time'].isnull().values.sum()

3177

In [13]:
#Count total null values in invited_by_user_id
users['invited_by_user_id'].isnull().values.sum()

5583

In [22]:
#Let's convert last_session_creation_time to date time format and 
#fill the null values in with the timestamp from creation_time
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], unit = 's', errors = 'coerce')


In [26]:
users['last_session_creation_time'].fillna(users.creation_time, inplace = True)

In [27]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0


In [28]:
#Let's fill in the null values for invited by user id, since these users weren't invited by another it's reasonable
#to keep the value as null, but this will impede further data analysis, so we'll change it to 0 instead
users['invited_by_user_id'].fillna('NA', inplace = True)

In [29]:
#Check for null values by column
users.isnull().any()

object_id                     False
creation_time                 False
name                          False
email                         False
creation_source               False
last_session_creation_time    False
opted_in_to_mailing_list      False
enabled_for_marketing_drip    False
org_id                        False
invited_by_user_id            False
dtype: bool

In [52]:
#Create y-column for RF model based on number of logins within a 7-day period
def adopted(df, days = 7, logins = 3):
    df['date'] = df.index.date
    df.drop_duplicates(subset = 'date').sort_values('date')
    passed_days = df['date'].diff(periods = logins - 1)
    return any(passed_days <= timedelta(days = days))

adopted_user = logins.groupby('user_id').apply(adopted)
adopted_user.name = 'adopted_user'
adopted_user.head()

user_id
1    False
2     True
3    False
4    False
5    False
Name: adopted_user, dtype: bool

In [53]:
adopted_user_df = pd.DataFrame(columns = ['user_id', 'adopted'])

In [55]:
adopted_user_df.user_id = np.arange(len(adopted_user))
adopted_user_df.adopted = adopted_user

In [56]:
adopted_user_df.head()

Unnamed: 0,user_id,adopted
0,0,
1,1,False
2,2,True
3,3,False
4,4,False


In [58]:
adopted_user_df.dropna(how = 'any')

Unnamed: 0,user_id,adopted
1,1,False
2,2,True
3,3,False
4,4,False
5,5,False
...,...,...
8811,8811,True
8816,8816,False
8818,8818,False
8821,8821,False


In [85]:
df = pd.merge(users, adopted_user_df, left_on = 'object_id', right_on = 'user_id', how = 'left')

In [86]:
df.user_id.fillna(df['object_id'], inplace = True)
df.adopted.fillna('False', inplace = True)

In [87]:
df.isnull().any()

object_id                     False
creation_time                 False
name                          False
email                         False
creation_source               False
last_session_creation_time    False
opted_in_to_mailing_list      False
enabled_for_marketing_drip    False
org_id                        False
invited_by_user_id            False
user_id                       False
adopted                       False
dtype: bool

In [88]:
df['adopted'] = df['adopted'].astype('str')

In [89]:
df['adopted'].replace({"True": "Yes", "False": "No"}, inplace = True)

In [90]:
df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,1.0,No
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,2.0,Yes
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,3.0,No
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,4.0,No
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,5.0,No


In [92]:
y = df['adopted']

x = df.drop(['user_id', 'object_id', 'adopted'], axis = 1)
x = pd.get_dummies(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)

In [93]:
clf = RandomForestClassifier(n_estimators = 100)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

TypeError: invalid type promotion

In [None]:
feature_imp = pd.Series(clf.feature_importances_, index = x.columns.tolist()).sort_values(ascending = False)
feature_imp

In [None]:
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()