# Import Libraries

In [1]:
import pandas as pd  
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from datetime import datetime, timedelta

%matplotlib inline

import xgboost as xgb 

from sklearn.preprocessing import StandardScaler   
from sklearn.pipeline import Pipeline   
from sklearn.model_selection import RandomizedSearchCV

In [2]:
! pwd
! ls


/Users/marshallbanana/GitHub/Springboard2021/relax_challenge
Relax_challenge_Ed_Gatdula.ipynb takehome_user_engagement.csv
relax_data_science_challenge.pdf takehome_users.csv


# Exercise Summary:

The objective of this exercise is to analyze two datasets, takehome_user_engagement and takehome_users.  takehome_user_engagement consists of datetime information, user_id, and login values.  takehome_users contains data for 12,000 users who signed up for the product in the last two years.  The 10 features in takehome_users include: name, object_id, email, creation_source, creation_time, last_session_creation_time, opted_in_to_mailing_list, enabled_for_marketing_driop, org_id, and invited_by_user_id.



# takehome_user_engagment summary:

1.  207917 entries covering a time period from 2014-04-22 03:53:30 to 2014-01-26 08:57:12 
2.  No missing/NaN values
3.  8823 unique user_id values.  The min user_id value is 1.  The max user_id value is 12000.
4.  user_id 8823 has largest login count at 606.  
5.  2234 user_id values have three or more logins.  1602 of these 2234 user_id qualify as adopted.  Adopted
defined as three or more logins in a period of seven consecutive days.
  

## takehome_user_engagement

In [3]:
df_engagement = pd.read_csv('./takehome_user_engagement.csv', parse_dates= True,index_col='time_stamp')

df_engagement.info()
df_engagement.head()

# 207917 entries
# features:  user_id, visited

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2014-04-22 03:53:30 to 2014-01-26 08:57:12
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  207917 non-null  int64
 1   visited  207917 non-null  int64
dtypes: int64(2)
memory usage: 4.8 MB


Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


In [4]:
# check for missing, nan values.  none found
# find number of user_id unique values.  8823 unique user_id values

print("NaN/missing values count: {}".format(df_engagement.isna().sum()))
print(df_engagement.user_id.nunique())

NaN/missing values count: user_id    0
visited    0
dtype: int64
8823


In [5]:
# there are 8823 unique user_id values.  the user_id max value is 12000.  the user_id min value is 1.
# this implies that each number in the range of values from 1 to 12000 is not assigned to a user_id.

print("number of unique user_id values: {}".format(df_engagement.user_id.nunique()))

print(df_engagement.user_id.value_counts())
print("user_id maximum value: {}".format(df_engagement.user_id.max()))
print("user_id minimum value: {}".format(df_engagement.user_id.min()))

number of unique user_id values: 8823
3623     606
906      600
1811     593
7590     590
8068     585
        ... 
6763       1
3773       1
5822       1
10040      1
2047       1
Name: user_id, Length: 8823, dtype: int64
user_id maximum value: 12000
user_id minimum value: 1


In [6]:
# there are 8823 unique user_id values.  the user_id max value is 12000.  the user_id min value is 1.
# this implies that each numbers in the range of values from 1 to 12000 is not assigned to a user_id.

# it is possible to determine which numbers in the 1 to 12000 number range are not used in the
#. user_id features.

numbers = list(set(np.arange(1,12000)) - set(df_engagement['user_id']))
len(numbers)


3177

## figuring out which user_id is an adopted user

'adopted user' defined as user who has logged into the product on three separate days in at least one seven-day period

psuedo code-

1.  filter out user_id with less than three visits
2.  for those with more than three visits.  calculate rolling seven day login sum?  if max rolling seven day is greater than or equal to 3, user_id is classified as adopted.

In [7]:
# group df_engagement by 'user_id' and sum up login

df_ = df_engagement.groupby('user_id').sum('visited').sort_values('visited', ascending = False)

# filter out user_id with visited values > 3. 
# create list of user_id values associated with 3 or more visited

id_3plus = df_[df_['visited'] >= 3].index.to_list()
len(id_3plus)

2248

In [8]:
id_3plus

[3623,
 906,
 1811,
 7590,
 8068,
 9402,
 4022,
 3269,
 9312,
 7210,
 8029,
 2474,
 2658,
 8280,
 69,
 5297,
 4019,
 9083,
 2519,
 4263,
 3226,
 10599,
 6312,
 10953,
 2316,
 8426,
 603,
 2078,
 445,
 7536,
 8768,
 11428,
 6171,
 5609,
 7120,
 9281,
 10734,
 8381,
 5939,
 7572,
 5386,
 2622,
 11300,
 1822,
 2771,
 6064,
 4282,
 8912,
 6405,
 5151,
 5682,
 4402,
 5101,
 9728,
 2300,
 1769,
 9450,
 9958,
 4965,
 5240,
 2568,
 8721,
 9434,
 2620,
 4924,
 754,
 5861,
 9691,
 4143,
 6757,
 7511,
 728,
 6204,
 10524,
 8553,
 6282,
 6909,
 7375,
 4186,
 10320,
 9724,
 9325,
 10258,
 11267,
 8297,
 9582,
 10352,
 4489,
 3066,
 605,
 7431,
 4803,
 9286,
 11083,
 9042,
 5965,
 2739,
 11688,
 9601,
 10791,
 1155,
 5415,
 4234,
 9540,
 8310,
 4713,
 10656,
 4181,
 5057,
 2539,
 6978,
 4934,
 4742,
 3411,
 7153,
 6284,
 1202,
 1865,
 4745,
 3957,
 3160,
 3824,
 9771,
 7926,
 1027,
 3175,
 4835,
 63,
 9400,
 8399,
 5507,
 9645,
 783,
 2447,
 10630,
 4837,
 10602,
 4785,
 9694,
 3556,
 4248,
 7107,
 

In [39]:
# using id_3plus to figure out adopted definition.  3 or more logins in 7 day period.

# create function to calculate max value of 7 day rolling window

def max(id):
    #filter df_engagement for id
    return df_engagement[df_engagement['user_id'] == id].visited.rolling('7D').sum().max()


In [45]:
# calculate maximum login count for 7 day period for each of id's in id_3plus

max_logins = [max(x) for x in id_3plus]

In [56]:
# create dataframe using id_3plus and 7 day max values
df_login = pd.DataFrame({'user_id':id_3plus, 'visited':max_logins})

In [57]:
# filter df_login for visited values greater than or equal to 3.  these are the adopted user_id values

df_login[df_logins['visited']>=3]

Unnamed: 0,user_id,visited
0,3623,7.0
1,906,7.0
2,1811,7.0
3,7590,7.0
4,8068,7.0
...,...,...
2130,3222,3.0
2138,5970,3.0
2150,10277,3.0
2167,4093,3.0


In [58]:
# select entries in df_login where the max login count for seven day period is greater than 3
# there are 1602 adopted users.

print("there are {} adopted users.".format(df_login[df_login['visited'] >= 3].nunique()))

# create list of 'adopted' user_id values
adopted =(df_login[df_login['visited'] >= 3].user_id).values.tolist()
adopted

there are user_id    1602
visited       5
dtype: int64 adopted users.


[3623,
 906,
 1811,
 7590,
 8068,
 9402,
 4022,
 3269,
 9312,
 7210,
 8029,
 2474,
 2658,
 8280,
 69,
 5297,
 4019,
 9083,
 2519,
 4263,
 3226,
 10599,
 6312,
 10953,
 2316,
 8426,
 603,
 2078,
 445,
 7536,
 8768,
 11428,
 6171,
 5609,
 7120,
 9281,
 10734,
 8381,
 5939,
 7572,
 5386,
 2622,
 11300,
 1822,
 2771,
 6064,
 4282,
 8912,
 6405,
 5151,
 5682,
 4402,
 5101,
 9728,
 2300,
 1769,
 9450,
 9958,
 4965,
 5240,
 2568,
 8721,
 9434,
 2620,
 4924,
 754,
 5861,
 9691,
 4143,
 6757,
 7511,
 728,
 6204,
 10524,
 8553,
 6282,
 6909,
 7375,
 4186,
 10320,
 9724,
 9325,
 10258,
 11267,
 8297,
 9582,
 10352,
 4489,
 3066,
 605,
 7431,
 4803,
 9286,
 11083,
 9042,
 5965,
 2739,
 11688,
 9601,
 10791,
 1155,
 5415,
 4234,
 9540,
 8310,
 4713,
 10656,
 4181,
 5057,
 2539,
 6978,
 4934,
 4742,
 3411,
 7153,
 6284,
 1202,
 1865,
 4745,
 3957,
 3160,
 3824,
 9771,
 7926,
 1027,
 3175,
 4835,
 63,
 9400,
 8399,
 5507,
 9645,
 783,
 2447,
 10630,
 4837,
 10602,
 4785,
 9694,
 3556,
 4248,
 7107,
 

In [59]:
# create new feature column in df_engagement indicating whether user_id is 'adopted'
# check is user_id is in the adopted list

df_engagement['adopted'] = df_engagement['user_id'].isin(adopted).astype(int)
df_engagement

Unnamed: 0_level_0,user_id,visited,adopted
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-04-22 03:53:30,1,1,0
2013-11-15 03:45:04,2,1,1
2013-11-29 03:45:04,2,1,1
2013-12-09 03:45:04,2,1,1
2013-12-25 03:45:04,2,1,1
...,...,...,...
2013-09-06 06:14:15,11996,1,0
2013-01-15 18:28:37,11997,1,0
2014-04-27 12:45:16,11998,1,0
2012-06-02 11:55:59,11999,1,0


In [60]:
# reset index.  final cleaned form of takehome_user_engagement

df_engagement= df_engagement.rename(columns = {'user_id':'object_id'})
df_engagement = df_engagement.reset_index()
df_engagement

Unnamed: 0,time_stamp,object_id,visited,adopted
0,2014-04-22 03:53:30,1,1,0
1,2013-11-15 03:45:04,2,1,1
2,2013-11-29 03:45:04,2,1,1
3,2013-12-09 03:45:04,2,1,1
4,2013-12-25 03:45:04,2,1,1
...,...,...,...,...
207912,2013-09-06 06:14:15,11996,1,0
207913,2013-01-15 18:28:37,11997,1,0
207914,2014-04-27 12:45:16,11998,1,0
207915,2012-06-02 11:55:59,11999,1,0


# takehome_users dataset summary:

the takehome_users dataset consists of 12000 entries and 10 feature columns.  Two features have significant amount of missing/nana values, last_session_creation_time and invited_by_user_id.  There are several features that contained repeated values.  This is described in the appropriate section below.


10 Features:

1.  object_id:  
    A.  there are 0 NaN values in the 'object_id' column.  
    B.  there are 11355 unique 'object_id' values.  this implies that there are duplicate values which will have to  be identified.  
    C.  there are 71 names that appear more than once.  
    D.  there are 226 object_id values associated with the duplicated names.  perhaps various single user wanted multiple accounts by design? <br>


2.  creation_time:
    A.  0 NaN values
    B.  four creation_time values appear twice. maybe account was created twice by system error?
```
    2014-02-11 17:57:53    2
    2012-09-01 07:22:09    2
    2013-05-25 04:35:52    2
    2012-09-14 21:35:03    2
```

3.  email:  
    A.  0 NaN values  
    B.  Twenty email values appear more than once.  maybe account was created twice by mistake?  40 entries will have to be identified and compared.  


4.  creation_source:

    A.  there are 0 missing/NaN values.  5 unique values.   
    B. value counts:
        ORG_INVITE            4254
        GUEST_INVITE          2163
        PERSONAL_PROJECTS     2111
        SIGNUP                2087
        SIGNUP_GOOGLE_AUTH    1385

5.  last_session_creation_time:  
    A.  There 8823 unique values.  
    B.  3177 missing/NaN values.  Imputation method tbd.
     

6.  opted_in_to_mailing_list:  
    A. 0 missing or NaN values  
    B. 9006 '0' values  
    C. 2994 '1' values  
    

7.  enabled_for_marketing_drop:  
    A.  value counts:
        0    10208
        1     1792
    B.  describe:
        count     12000
        unique        2
        top           0
        freq      10208

8.  org_id:  
    A.  most frequent 'org_id' value is 417.  417 occurs 319 times.  
    B.  see plot for distribution of values.
    
  
9.  invited_b_user_id:  
    A.  2564 unique values  
    B.  5583 entries have missing or NaN values  
    C.  1467 values appear more than once  




In [61]:
df_users = pd.read_csv('./takehome_users.csv', encoding='Latin-1')

df_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [62]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [164]:
df_users.isna().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
date                          3177
dtype: int64

In [63]:
# merge df_engagement with df_users on object_id feature

df_merged = df_engagement.merge(df_users, on='object_id', how='inner')
df_merged.columns

Index(['time_stamp', 'object_id', 'visited', 'adopted', 'creation_time',
       'name', 'email', 'creation_source', 'last_session_creation_time',
       'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id',
       'invited_by_user_id'],
      dtype='object')

In [64]:
df_merged.head()

Unnamed: 0,time_stamp,object_id,visited,adopted,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,2014-04-22 03:53:30,1,1,0,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2013-11-15 03:45:04,2,1,1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,2013-11-29 03:45:04,2,1,1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-12-09 03:45:04,2,1,1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
4,2013-12-25 03:45:04,2,1,1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0


In [141]:
features = df_users.columns.tolist()
features

['object_id',
 'creation_time',
 'name',
 'email',
 'creation_source',
 'last_session_creation_time',
 'opted_in_to_mailing_list',
 'enabled_for_marketing_drip',
 'org_id',
 'invited_by_user_id',
 'date']

In [67]:
df_users.nunique()

object_id                     12000
creation_time                 11996
name                          11355
email                         11980
creation_source                   5
last_session_creation_time     8821
opted_in_to_mailing_list          2
enabled_for_marketing_drip        2
org_id                          417
invited_by_user_id             2564
dtype: int64

## object_id

In [84]:
df_users.object_id.isna().sum(), df_users.creation_time.isna().sum()

(0, 0)

In [87]:
# there are four creation_time values that appear more than once

df_users.creation_time.value_counts()[df_users.creation_time.value_counts() > 1]


2014-02-11 17:57:53    2
2012-09-01 07:22:09    2
2013-05-25 04:35:52    2
2012-09-14 21:35:03    2
Name: creation_time, dtype: int64

## name

In [81]:
# check name features for missing values, number of unique values.  There are 0 NaN/missing values in 'name' column
# there are 11355 unique name values.

print("there are {} NaN values in the 'name' column".format(df_users['name'].isna().sum()))
print('there are {} unique name values'.format(df_users['name'].nunique()))

# 11355 unique names values for 12000 entries.  find duplicated names and associated entries.
# values with more than 2 or more occurences in 'name' columns

names_2plus = df_users['name'].value_counts()[df_users['name'].value_counts() > 2].keys().to_list()
print("there are {} name values that appear at least more than once.".format(len(names_2plus)))

# filter df_users for name values that occur more than once.
# are the entries with duplicated names identical?  object_id, creation_time, email values are not duplicated
df_ = df_users[df_users['name'].isin(names_2plus)].sort_values('name')
df_.info()
df_.nunique()

there are 0 NaN values in the 'name' column
there are 11355 unique name values
there are 71 name values that appear at least more than once.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 226 entries, 2944 to 10631
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   226 non-null    int64         
 1   creation_time               226 non-null    object        
 2   name                        226 non-null    object        
 3   email                       226 non-null    object        
 4   creation_source             226 non-null    object        
 5   last_session_creation_time  166 non-null    float64       
 6   opted_in_to_mailing_list    226 non-null    int64         
 7   enabled_for_marketing_drip  226 non-null    int64         
 8   org_id                      226 non-null    int64         
 9   invited_by_user_id          118 non-null

object_id                     226
creation_time                 226
name                           71
email                         226
creation_source                 5
last_session_creation_time    166
opted_in_to_mailing_list        2
enabled_for_marketing_drip      2
org_id                        163
invited_by_user_id            113
date                          166
dtype: int64

## creation_time

In [69]:
# Based on df_users 12000 entries and number of unique values in a given columns.  
# There are duplicated values in creation_time and email columns

print('there are {} unique create_time values'.format(df_users['creation_time'].nunique()))
print('there are {} unique email values'.format(df_users['email'].nunique()))

there are 11996 unique create_time values
there are 11980 unique email values


In [70]:
df_users['creation_time'].value_counts().head(5)

2014-02-11 17:57:53    2
2012-09-01 07:22:09    2
2013-05-25 04:35:52    2
2012-09-14 21:35:03    2
2012-12-23 01:50:01    1
Name: creation_time, dtype: int64

In [156]:
# convert unix timestamp to datetimetime
df_users['date'] = pd.to_datetime(df_users['last_session_creation_time'],unit='s')

df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  object        
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   float64       
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  category      
 8   org_id                      12000 non-null  category      
 9   invited_by_user_id          6417 non-null   float64       
 10  date                        8823 non-null   datetime64[ns]
dtypes: category(2), datetime64[ns](1), float64(2), int64(2

## email

In [96]:
email_2plus = df_users['email'].value_counts()[df_users['email'].value_counts() >= 2]
email_2plus

print("{} email values appear more than once".format(len(email_2plus)))

df_users['email'].value_counts()[df_users['email'].value_counts() >= 2]

20 email values appear more than once


LaerkeMBertelsen@gmail.com       2
UlrikeGerste@gmail.com           2
TomSchiffer@gmail.com            2
JaninaSankt@gmail.com            2
MagnusSWinther@jourrapide.com    2
NicolaiSHolm@yahoo.com           2
MarkoSeiler@yahoo.com            2
LasseLNrgaard@gmail.com          2
KristinKappel@yahoo.com          2
MimirMKarlsen@jourrapide.com     2
MaximilianWalter@gmail.com       2
JacobTye@gmail.com               2
MandySchroder@gustr.com          2
AmandaABach@gmail.com            2
KlausMueller@gustr.com           2
LeonieDuerr@gmail.com            2
KerstinNeudorf@gmail.com         2
ThomasBrandt@gmail.com           2
AlfieLane@yahoo.com              2
PaigeWyatt@gmail.com             2
Name: email, dtype: int64

In [72]:
# isolate domain names. for email values split at '@'.   there are 1184 unique domain names
# example: @yahoo.com
email = df_users['email'].str.split('@',expand=True)
email[1].value_counts()

#email[1].value_counts()[email[1].value_counts() > 1]


gmail.com         3562
yahoo.com         2447
jourrapide.com    1259
cuvox.de          1202
gustr.com         1179
                  ... 
wctom.com            1
korrl.com            1
vlusc.com            1
tdife.com            1
nruyi.com            1
Name: 1, Length: 1184, dtype: int64

## last_session_creation_time

In [124]:
df_users['last_session_creation_time'].nunique()

8821

In [125]:
# missing or NaN values?  3177.

df_users['last_session_creation_time'].isna().sum()

3177

## opted_in_to_mailing_list

In [130]:
# 9006 '0' values
# 2994 '1' values

df_users['opted_in_to_mailing_list'].value_counts()
    

0    9006
1    2994
Name: opted_in_to_mailing_list, dtype: int64

In [135]:
df_users['opted_in_to_mailing_list'].isna().sum()


0

## creation source

In [73]:
# creation source feature values.  five unique types.

print("there are {} missing/NaN values \n".format(df_users['creation_source'].isna().sum()))
print(df_users['creation_source'].value_counts())

df_users['creation_source'].astype('category').describe()


there are 0 missing/NaN values 

ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64


count          12000
unique             5
top       ORG_INVITE
freq            4254
Name: creation_source, dtype: object

## org_id 

In [140]:
# org_id feature values. 417 unique org_id values.
df_users['org_id'] = df_users['org_id'].astype('category')
df_users['org_id'].value_counts()
#df_users['org_id'].describe()


#df_users['org_id'].value_counts().sort_values(ascending=False).plot(figsize=(10,10));

0      319
1      233
2      201
3      168
4      159
      ... 
355      9
400      8
397      8
386      7
416      2
Name: org_id, Length: 417, dtype: int64

In [121]:
# most frequent 'org_id' value is 417.  417 occurs 319 times.
df_users['org_id'].describe()

count     12000
unique      417
top           0
freq        319
Name: org_id, dtype: int64

In [76]:
# rename object_id, name columns to user_id, user_name respectively.

df_users.rename(columns = {'object_id':'user_id', 'name':'user_name'})

Unnamed: 0,user_id,creation_time,user_name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1.398139e+09,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1.363735e+09,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1.369210e+09,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1.358850e+09,0,0,193,5240.0
...,...,...,...,...,...,...,...,...,...,...
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1.378448e+09,0,0,89,8263.0
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1.358275e+09,0,0,200,
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1.398603e+09,1,1,83,8074.0
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1.338638e+09,0,0,6,


## invited_by_user_id

In [160]:
# 2564 unique values

df_users['invited_by_user_id'].nunique()

2564

In [161]:
#  5583 entries have missing or NaN values

df_users['invited_by_user_id'].isna().sum()


5583

In [163]:
# 1467 values appear more than once

df_users['invited_by_user_id'].value_counts()[df_users['invited_by_user_id'].value_counts() > 1]


10741.0    13
2527.0     12
2308.0     11
1525.0     11
11770.0    11
           ..
8292.0      2
5046.0      2
9532.0      2
1747.0      2
9052.0      2
Name: invited_by_user_id, Length: 1467, dtype: int64

## enabled_for_marketing_drip

In [154]:
df_users['enabled_for_marketing_drip']= df_users['enabled_for_marketing_drip'].astype('category')

In [150]:
df_users['enabled_for_marketing_drip'].value_counts()

0    10208
1     1792
Name: enabled_for_marketing_drip, dtype: int64

In [155]:
df_users['enabled_for_marketing_drip'].describe()

count     12000
unique        2
top           0
freq      10208
Name: enabled_for_marketing_drip, dtype: int64