In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the two raw CSV files: the user table (decoded as latin-1) and the
# user-engagement table.
users = pd.read_csv(
    filepath_or_buffer='../input/relax-datachallenge/takehome_users.csv',
    encoding='latin-1',
)
user_engage = pd.read_csv(
    filepath_or_buffer='../input/relax-datachallenge/takehome_user_engagement.csv',
)

In [None]:
# Preview the first five rows of the user table.
users.head(n=5)

In [None]:
# Preview the first five rows of the engagement table.
user_engage.head(n=5)

In [None]:
# Parse the raw timestamp column into datetimes so it can be used as a
# time-based grouping key in the weekly aggregation.
user_engage['time_stamp'] = user_engage['time_stamp'].pipe(pd.to_datetime)
user_engage.head()

In [None]:
# Total visits per user per week. 'W' is pandas' weekly frequency, so the bins
# are fixed calendar weeks rather than a sliding window.
# NOTE(review): if "adopted" is meant as 3 visits within ANY rolling 7-day
# period, visits that straddle a week boundary are not caught here - confirm.
weekly_key = pd.Grouper(key='time_stamp', freq='W')
visits_by_user_week = user_engage.groupby(['user_id', weekly_key])['visited'].sum()
user_engage_weekly = visits_by_user_week.reset_index()
user_engage_weekly.head()

In [None]:
# 33859 rows have at least 3 visits in a week. These are (user, week) rows,
# not distinct users: the same user can qualify in several weeks.
has_three_visits = user_engage_weekly['visited'].ge(3)
user_engage_weekly.loc[has_three_visits]

In [None]:
# Keep exactly one row per user: the week in which they logged the most visits.
#
# A stable descending sort followed by drop_duplicates(keep='first') selects
# each user's highest-'visited' row; ties resolve deterministically to the row
# that came first in the weekly table. This replaces
# groupby('user_id').apply(sort_values).drop('user_id', axis=1), which relied
# on the grouping column being passed into apply() (deprecated in recent
# pandas, where the subsequent drop fails) and ran a slow per-group Python call.
#
# The result has the same columns (user_id, time_stamp, visited), is ordered by
# user_id, and re-running this cell on its own output is a no-op.
user_engage_weekly = (
    user_engage_weekly
    .sort_values('visited', ascending=False, kind='stable')
    .drop_duplicates(subset='user_id', keep='first')
    .sort_values('user_id')
    .reset_index(drop=True)
)

In [None]:
# Right-join the per-user visit counts onto the user table so that every row
# of `users` is kept; users with no matching engagement row get NaN in the
# columns coming from `user_engage_weekly`.
df = pd.merge(
    user_engage_weekly,
    users,
    how='right',
    left_on='user_id',
    right_on='object_id',
)
df.head()

In [None]:
# Frequency of each 'visited' value after the merge (value_counts skips NaN
# by default, so users without engagement rows are not shown here).
visited_counts_raw = df['visited'].value_counts()
visited_counts_raw

In [None]:
# Eyeball a random sample of merged rows. The fixed seed makes the displayed
# rows identical on every Restart & Run All instead of changing per execution.
df.sample(100, random_state=42)

In [None]:
# Only 8823 rows have a 'visited' value; the other 3177 are NaN because those
# users never visited the website, so they are filled with 0.
df['visited'] = df['visited'].fillna(0)
# Missing last-session timestamps are likewise replaced with 0.
df['last_session_creation_time'] = df['last_session_creation_time'].fillna(0)

In [None]:
# Frequency of each 'visited' value again, now that NaN has been filled with 0.
visited_counts_filled = df['visited'].value_counts()
visited_counts_filled

In [None]:
# Target variable: 1 when the user's 'visited' count is at least 3, else 0.
meets_threshold = df['visited'] >= 3
df['adopted_user'] = meets_threshold.astype('int64')
df['adopted_user'].value_counts()

In [None]:
# One-hot encode 'creation_source'. With an empty prefix and the default '_'
# separator, each indicator column is named '_<CATEGORY>' (e.g. '_SIGNUP').
df = pd.get_dummies(
    data=df,
    columns=['creation_source'],
    prefix='',
    prefix_sep='_',
)
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize 'org_id' in place. The scaler expects a 2-D array, hence the
# reshape to a single column.
# 'last_session_creation_time' is deliberately left unscaled (its scaling step
# was disabled in the original notebook).
org_id_scaler = StandardScaler()
org_id_2d = df['org_id'].to_numpy().reshape(-1, 1)
df['org_id'] = org_id_scaler.fit_transform(org_id_2d)

In [None]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

In [None]:
# Predictor columns: last-session time, the two marketing flags, the scaled
# org id and the one-hot creation-source indicators.
feature_columns = [
    'last_session_creation_time',
    'opted_in_to_mailing_list',
    'enabled_for_marketing_drip',
    'org_id',
    '_GUEST_INVITE',
    '_ORG_INVITE',
    '_PERSONAL_PROJECTS',
    '_SIGNUP',
    '_SIGNUP_GOOGLE_AUTH',
]
X = df[feature_columns]

# Binary target built from the weekly visit threshold.
y = df['adopted_user']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on all selected features to rank them by importance.
# The fixed random_state makes the fitted forest, and therefore the importance
# scores, reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# One importance score per column of X, in column order.
importance = rf.feature_importances_

# Report each score next to its feature name rather than a positional index,
# so the output can be read without cross-referencing the column list.
for name, score in zip(X.columns, importance):
    print('Feature: %s, Score: %.5f' % (name, score))

# Bar chart of the same scores, labelled by feature name.
fig, ax = plt.subplots()
ax.bar(X.columns, importance)
ax.set_xlabel('feature')
ax.set_ylabel('importance')
ax.set_title('Random forest feature importance (all features)')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Remove 'last_session_creation_time' and 'org_id' to view importance among
# the remaining features.
X = df[['opted_in_to_mailing_list', 'enabled_for_marketing_drip', '_GUEST_INVITE', '_ORG_INVITE', '_PERSONAL_PROJECTS', '_SIGNUP', '_SIGNUP_GOOGLE_AUTH']]

y = df['adopted_user']

# RandomForestClassifier and plt are already imported by earlier cells.
# The fixed random_state makes the importance scores reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# One importance score per column of X, in column order.
importance = rf.feature_importances_

# Report each score next to its feature name rather than a positional index.
for name, score in zip(X.columns, importance):
    print('Feature: %s, Score: %.5f' % (name, score))

# Bar chart of the same scores, labelled by feature name.
fig, ax = plt.subplots()
ax.bar(X.columns, importance)
ax.set_xlabel('feature')
ax.set_ylabel('importance')
ax.set_title('Random forest feature importance (reduced feature set)')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()