In [1]:
import pandas as pd
from datetime import timedelta
from collections import Counter

In [2]:
engagements = pd.read_csv('relax_challenge/takehome_user_engagement.csv', parse_dates=['time_stamp'])
engagements.info()
engagements

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
...,...,...,...
207912,2013-09-06 06:14:15,11996,1
207913,2013-01-15 18:28:37,11997,1
207914,2014-04-27 12:45:16,11998,1
207915,2012-06-02 11:55:59,11999,1


In [3]:
users = pd.read_csv('takehome_users.csv', encoding='latin')
users.info()
users

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1.398139e+09,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1.363735e+09,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1.369210e+09,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1.358850e+09,0,0,193,5240.0
...,...,...,...,...,...,...,...,...,...,...
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1.378448e+09,0,0,89,8263.0
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1.358275e+09,0,0,200,
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1.398603e+09,1,1,83,8074.0
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1.338638e+09,0,0,6,


In [4]:
engagements.index = engagements.time_stamp

In [5]:
def is_engaged(user):
    """
    map function to each user in users table
    takes a user_id
    computes engagement using the engagements table
    returns a boolean
    True: user active on 3 unique days in a 7 day period
    """
    user_eng = engagements[engagements.user_id == user]
    #user_eng = user_eng.resample('D', how='count')
    user_eng = user_eng.groupby(by=user_eng.time_stamp.dt.date).count()

    engaged = False
    
    if user_eng.shape[0] >= 3:
        # sort user engagements by datetime index
        user_eng = user_eng.sort_index()

        for i in range(len(user_eng.time_stamp)-3):
            engaged = (user_eng.index[i+2] - user_eng.index[i])<timedelta(days=7)
            if engaged:
                break
    return engaged

In [6]:
users['adopted_user'] = users.object_id.map(is_engaged)


In [7]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [8]:
predictors = ['creation_source', 'creation_time', 
              'last_session_creation_time',
              'opted_in_to_mailing_list', 
              'enabled_for_marketing_drip', 
              'org_id', 'invited_by_user_id']

target = ['adopted_user']

categorical = ['creation_source', 'org_id']

In [9]:
lasso_df = users[predictors + target]
lasso_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
creation_source               12000 non-null object
creation_time                 12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted_user                  12000 non-null bool
dtypes: bool(1), float64(2), int64(3), object(2)
memory usage: 668.1+ KB


In [10]:
lasso_df.invited_by_user_id = lasso_df.invited_by_user_id.fillna(0)
lasso_df.invited_by_user_id = (lasso_df.invited_by_user_id > 0) *1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [11]:
set(lasso_df.creation_source)

{'GUEST_INVITE',
 'ORG_INVITE',
 'PERSONAL_PROJECTS',
 'SIGNUP',
 'SIGNUP_GOOGLE_AUTH'}

In [15]:
org_counts = Counter()
for org in lasso_df.org_id:
    org_counts[org] += 1
    
lasso_df['users_in_org'] = lasso_df.org_id.map(org_counts)

adopted_users = lasso_df[lasso_df['adopted_user']==1]
org_counts_adopted_users = Counter()
for org in adopted_users.org_id:
    org_counts_adopted_users[org] += 1

lasso_df['adopted_users_in_org'] = lasso_df.org_id.map(org_counts_adopted_users)


AttributeError: 'DataFrame' object has no attribute 'org_id'

In [16]:
def replace_with_dummies(df, columns):
    """
    Takes a df and a list of categorial columns with discrete values
    Returns a df with categorical columns replaced with binary columns
    """
    for col in columns:
        temp_df= df[[col]]
        temp_df = pd.get_dummies(temp_df)
        df[temp_df.columns] = temp_df
        df = df.drop(col, axis=1)
    return df

In [17]:
lasso_df = replace_with_dummies(lasso_df, categorical)
lasso_df.info()

KeyError: "None of [Index(['creation_source'], dtype='object')] are in the [columns]"