In [187]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime

In [181]:
def zeroDiagonal(dataTable, dimension=None):
    ''' Return a copy of a DataFrame matrix with its leading diagonal set to zero.

    Parameters
    ----------
    dataTable : pandas.DataFrame
        Matrix-like frame, e.g. the result of ``DataFrame.corr()``.
    dimension : int, optional
        Number of leading diagonal cells to overwrite. When omitted, the whole
        diagonal is used, i.e. ``min(n_rows, n_cols)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the requested diagonal cells set to 0. The frame
        passed in as ``dataTable`` is left unchanged.

    Raises
    ------
    IndexError
        Propagated from ``.iloc`` when ``dimension`` is larger than the number
        of rows or columns.
    '''
    # Work on a copy so re-running a notebook cell never alters the caller's data.
    zeroed = dataTable.copy()

    if dimension is None:
        dimension = min(zeroed.shape)

    for i in range(dimension):
        zeroed.iloc[i, i] = 0

    return zeroed

## <center>Read User Engagement Dataset</center>

In [2]:
# Load the raw engagement log from the CSV in the working directory
# and preview its first rows.
user_engagement = pd.read_csv('takehome_user_engagement.csv')
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


### View `user_engagement` DataFrame summary information - `describe()` and `info()`

In [35]:
# Order the engagement records by time_stamp, latest first, then
# summarise the numeric columns of the sorted frame.
user_engagement_sorted = user_engagement.sort_values('time_stamp', ascending=False)
user_engagement_sorted.describe()

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


In [36]:
# Print column dtypes, non-null counts and memory usage of the sorted engagement log.
user_engagement_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207917 entries, 70763 to 178140
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 6.3+ MB


In [195]:
# Total logins per user. Only the numeric 'visited' flag is summed, so the
# string 'time_stamp' column is never aggregated (newer pandas no longer drops
# non-numeric columns from groupby().sum() automatically).
user_log = user_engagement.groupby('user_id')[['visited']].sum()
# Keep users with 3 or more logins. A boolean row filter keeps the counts as
# integers instead of masking to NaN and dropping rows afterwards.
# NOTE(review): this is a total-login threshold only; if "adopted" is meant to
# depend on when the logins happened, time_stamp must be used as well - confirm.
active = user_log[user_log['visited'] >= 3]
active.head()

Unnamed: 0_level_0,visited
user_id,Unnamed: 1_level_1
2,14.0
10,284.0
20,7.0
33,18.0
42,342.0


## <center>Read Users Dataset</center>

In [5]:
# Load the users table from the CSV in the working directory.
# The file is decoded with the 'latin' codec so that European (accented)
# characters are read correctly.
users = pd.read_csv('takehome_users.csv', encoding = 'latin')

In [6]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


### View `users` DataFrame summary information - `describe()` and `info()`

In [39]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of users.
users.describe()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
count,12000.0,8823.0,12000.0,12000.0,12000.0,6417.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,5962.957145
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3383.761968
min,1.0,1338452000.0,0.0,0.0,0.0,3.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,3058.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,5954.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,8817.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0


In [40]:
# Print column dtypes, non-null counts and memory usage of the users table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [157]:
# For every creation_source group, count the non-null entries in each remaining column
source = users.groupby(by='creation_source').count()
source

Unnamed: 0_level_0,object_id,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
creation_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GUEST_INVITE,2163,2163,2163,2163,1588,2163,2163,2163,2163
ORG_INVITE,4254,4254,4254,4254,3188,4254,4254,4254,4254
PERSONAL_PROJECTS,2111,2111,2111,2111,764,2111,2111,2111,0
SIGNUP,2087,2087,2087,2087,1898,2087,2087,2087,0
SIGNUP_GOOGLE_AUTH,1385,1385,1385,1385,1385,1385,1385,1385,0


### Grouping the data frame by `creation_source` and counting the non-null values shows that most accounts originated from an organization invite (4254 users), followed by guest invites to an organization (2163 users) and personal-project accounts (2111 users). Personal projects have the lowest login rate (only 764 of 2111 users have a recorded `last_session_creation_time`), which suggests that accounts created for personal use are less likely to become "adopted users".

***

In [182]:
# Correlate the per-source count columns, then zero the diagonal (each column's
# correlation with itself) so the off-diagonal correlations stand out.
# Note: `source` holds one row of non-null counts per creation_source, so these
# correlations are computed across those group-level counts, not across users.
sourceCorr = source.corr()
# Take the matrix size from the data instead of hard-coding the column count.
sourceCorr = zeroDiagonal(sourceCorr, len(sourceCorr.columns))
sourceCorr

Unnamed: 0,object_id,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,0.0,1.0,1.0,1.0,0.851242,1.0,1.0,1.0,0.891571
creation_time,1.0,0.0,1.0,1.0,0.851242,1.0,1.0,1.0,0.891571
name,1.0,1.0,0.0,1.0,0.851242,1.0,1.0,1.0,0.891571
email,1.0,1.0,1.0,0.0,0.851242,1.0,1.0,1.0,0.891571
last_session_creation_time,0.851242,0.851242,0.851242,0.851242,0.0,0.851242,0.851242,0.851242,0.829084
opted_in_to_mailing_list,1.0,1.0,1.0,1.0,0.851242,0.0,1.0,1.0,0.891571
enabled_for_marketing_drip,1.0,1.0,1.0,1.0,0.851242,1.0,0.0,1.0,0.891571
org_id,1.0,1.0,1.0,1.0,0.851242,1.0,1.0,0.0,0.891571
invited_by_user_id,0.891571,0.891571,0.891571,0.891571,0.829084,0.891571,0.891571,0.891571,0.0


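#### As illustrated in the correlation table above, the features are highly correlated. Note that these correlations are computed over the per-source non-null counts (one row per `creation_source`), not over individual user records.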

***

### Overall, without in-depth analysis, it appears that users associated with an organization, as a guest or regular user, are more likely to be "adopted users".