# Capstone Project

## 1. Ingesting data

In [1]:
import pandas as pd
import numpy as np
import time
import logging
import os
import sys
from datetime import datetime

In [2]:
t = time.time()
auth_user = pd.read_csv('data/ebdb_public_auth_user.csv')
print("auth_user loaded: %d lines in %f seconds" % (auth_user.shape[0], time.time() - t))

auth_user loaded: 71638 lines in 0.355475 seconds


In [3]:
t = time.time()
payment_app_product = pd.read_csv('data/ebdb_public_payment_app_product.csv')
print("payment_app_product loaded: %d lines in %f seconds" % (payment_app_product.shape[0], time.time() - t))

payment_app_product loaded: 29 lines in 0.003909 seconds


In [4]:
t = time.time()
payment_app_subscription = pd.read_csv('data/ebdb_public_payment_app_historicalsubscription.csv')
print("payment_app_subscription loaded: %d lines in %f seconds" % (payment_app_subscription.shape[0], time.time() - t))

payment_app_subscription loaded: 42172 lines in 0.410668 seconds


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
t = time.time()
frontend_brazil_pages = pd.read_csv('data/analytics_frontend_brazil_pages.csv')
print("frontend_brazil_pages loaded: %d lines in %f seconds" % (frontend_brazil_pages.shape[0], time.time() - t))

  interactivity=interactivity, compiler=compiler, result=result)


frontend_brazil_pages loaded: 5747993 lines in 113.310169 seconds


In [6]:
frontend_brazil_pages['anonymous_id'].nunique()

1557111

In [7]:
t = time.time()
frontend_brazil_identifies = pd.read_csv('data/analytics_frontend_brazil_identifies.csv')
print("frontend_brazil_identifies loaded: %d lines in %f seconds" % (frontend_brazil_identifies.shape[0], time.time() - t))

frontend_brazil_identifies loaded: 141357 lines in 1.793472 seconds


  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
t = time.time()
frontend_brazil_tracks = pd.read_csv('data/analytics_frontend_brazil_tracks.csv')
print("frontend_brazil_tracks loaded: %d lines in %f seconds" % (frontend_brazil_tracks.shape[0], time.time() - t))

  interactivity=interactivity, compiler=compiler, result=result)


frontend_brazil_tracks loaded: 1343225 lines in 15.711704 seconds


In [9]:
t = time.time()
brazil_events_signup = pd.read_csv('data/analytics_brazil_events_event_sign_up.csv')
print("brazil_events_signup loaded: %d lines in %f seconds" % (brazil_events_signup.shape[0], time.time() - t))

brazil_events_signup loaded: 31030 lines in 0.628085 seconds


  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
t = time.time()
analytics_tables_course_enrollments = pd.read_csv('data/analytics_analytics_tables_course_enrollments_br.csv')
print("analytics_tables_course_enrollments loaded: %d lines in %f seconds" % (analytics_tables_course_enrollments.shape[0], time.time() - t))

analytics_tables_course_enrollments loaded: 187952 lines in 0.708614 seconds


In [11]:
t = time.time()
zendesk_data = pd.read_csv('data/zendesk_export.csv')
print("zendesk_data loaded: %d lines in %f seconds" % (zendesk_data.shape[0], time.time() - t))

zendesk_data loaded: 11829 lines in 0.119366 seconds


In [12]:
t = time.time()
accounts_data = pd.read_csv('data/analytics_analytics_tables_accounts.csv')
print("analytics_tables_accounts loaded: %d lines in %f seconds" % (accounts_data.shape[0], time.time() - t))

  interactivity=interactivity, compiler=compiler, result=result)


analytics_tables_accounts loaded: 5422609 lines in 33.633682 seconds


## 2. Preprocess Features

### 2.1. First step: *email*, *is_paying_student* and *register_date*

In [13]:
paying_students = payment_app_subscription.loc[lambda df: df['status'].isin(['active', 'payment_credit_retry'])]
paying_students.shape

(18692, 43)

In [14]:
paying_students = paying_students.groupby('user_id').first()[['id', 'register_date']]
paying_students.reset_index(level=0, inplace=True)
paying_students.head()

Unnamed: 0,user_id,id,register_date
0,7,670,2016-08-16 18:32:12.533034
1,13,1482,2016-10-08 23:05:24.148811
2,20,3436,2017-01-06 22:07:58.437857 -02:00
3,28,1656,2016-10-20 11:32:45.037242 -02:00
4,41,1397,2016-10-03 20:41:56.352383


In [15]:
all_users = auth_user[['id', 'email', 'first_name', 'last_name', 'date_joined']]
all_users.columns = ['user_id', 'email', 'first_name', 'last_name', 'date_joined']
all_users.head()

Unnamed: 0,user_id,email,first_name,last_name,date_joined
0,39,vinicius.zilinski@gmail.com,Vinicius,Zilinski,2016-06-23 18:10:54.299931
1,6,leeo.apc@gmail.com,Leo,A.c,2016-06-21 23:18:50.152776
2,8,sardinha.dias@gmail.com,Carlos,sardinha dias,2016-06-22 00:22:49.282586
3,10,thiagonv@gmail.com,Thiago,Vilela,2016-06-22 10:57:22.246017
4,11,ygorats@gmail.com,Ygor,Santos,2016-06-22 14:06:45.355805


In [16]:
all_users_us = accounts_data[['user_id', 'email', 'first_name', 'last_name', 'created_at']]

test = all_users.join(all_users_us[['email', 'created_at']].set_index('email'), on='email')
test['date_joined'] = test.apply(lambda row: datetime.strptime(row['date_joined'].split(".")[0], '%Y-%m-%d %H:%M:%S'), axis=1)
test['created_at'] = test['created_at'].fillna(str(datetime.now()))
test['created_at'] = test.apply(lambda row: datetime.strptime(row['created_at'].split(".")[0], '%Y-%m-%d %H:%M:%S'), axis=1)
test['min_date'] = test.apply(lambda row: row['date_joined'] if row['date_joined'] <= row['created_at'] else row['created_at'], axis=1)
test.shape

(72374, 7)

In [17]:
all_users = test[['user_id', 'email', 'first_name', 'last_name', 'min_date']]
all_users.rename(columns={'min_date': 'date_joined'}, inplace=True)
all_users.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


Unnamed: 0,user_id,email,first_name,last_name,date_joined
0,39,vinicius.zilinski@gmail.com,Vinicius,Zilinski,2016-06-21 19:32:27
1,6,leeo.apc@gmail.com,Leo,A.c,2013-12-30 01:16:23
2,8,sardinha.dias@gmail.com,Carlos,sardinha dias,2016-05-22 03:19:23
3,10,thiagonv@gmail.com,Thiago,Vilela,2012-10-22 01:19:56
4,11,ygorats@gmail.com,Ygor,Santos,2016-02-18 21:30:34


In [18]:
all_users.loc[lambda df: df['date_joined'].isnull()].shape[0]

0

In [19]:
def is_paying_student(row):
    if np.isnan(row['subscription_id']):
        return 0
    else:
        return 1

x = all_users.join(paying_students.set_index('user_id'), on='user_id')
x.rename(columns={'id': 'subscription_id'}, inplace=True)
x['is_paying_student'] = x.apply(lambda row: is_paying_student(row), axis=1)
x.head(10)

Unnamed: 0,user_id,email,first_name,last_name,date_joined,subscription_id,register_date,is_paying_student
0,39,vinicius.zilinski@gmail.com,Vinicius,Zilinski,2016-06-21 19:32:27,,,0
1,6,leeo.apc@gmail.com,Leo,A.c,2013-12-30 01:16:23,,,0
2,8,sardinha.dias@gmail.com,Carlos,sardinha dias,2016-05-22 03:19:23,,,0
3,10,thiagonv@gmail.com,Thiago,Vilela,2012-10-22 01:19:56,,,0
4,11,ygorats@gmail.com,Ygor,Santos,2016-02-18 21:30:34,,,0
5,13,lazaro_neto_@hotmail.com,Lazaro Lacir,Moraes Neto,2016-06-16 13:28:27,1482.0,2016-10-08 23:05:24.148811,1
6,15,jonatacw@gmail.com,Jonata,Wieczynski,2016-06-22 17:35:10,,,0
7,17,laertejjunior@gmail.com,Laerte,Mercier,2016-06-13 21:59:01,,,0
8,19,victormalagoni@hotmail.com,Victor,Pires Malagoni,2016-06-22 19:33:19,,,0
9,20,a30673083@gmail.com,Jose,Ramirez,2012-02-07 15:08:46,3436.0,2017-01-06 22:07:58.437857 -02:00,1


### 2.2. Webinar Enrollments

In [20]:
webinar_enrollments = brazil_events_signup[['email', 'enrollment_date']]
webinar_enrollments = webinar_enrollments.groupby('email').count()
webinar_enrollments.reset_index(level=0, inplace=True)
webinar_enrollments.rename(columns={'enrollment_date': 'webinar_enrollments'}, inplace=True)
webinar_enrollments.head(10)

Unnamed: 0,email,webinar_enrollments
0,0.adrian.axel@gmail.com,4
1,0matheus.araujo0@gmail.com,1
2,111@111.com,1
3,1162.eoj@gmail.com,1
4,130577er@gmail.com,1
5,146050u54@gmail.com,1
6,15cocosantos@gmail.com,1
7,15thlima@gmail.com,1
8,17salvador@gmail.com,5
9,1995.guilherme@gmail.com,1


In [21]:
x = x.join(webinar_enrollments.set_index('email'), on='email')

In [22]:
x['webinar_enrollments'] = x['webinar_enrollments'].fillna(0).astype(int)
x.head(10)

Unnamed: 0,user_id,email,first_name,last_name,date_joined,subscription_id,register_date,is_paying_student,webinar_enrollments
0,39,vinicius.zilinski@gmail.com,Vinicius,Zilinski,2016-06-21 19:32:27,,,0,0
1,6,leeo.apc@gmail.com,Leo,A.c,2013-12-30 01:16:23,,,0,1
2,8,sardinha.dias@gmail.com,Carlos,sardinha dias,2016-05-22 03:19:23,,,0,0
3,10,thiagonv@gmail.com,Thiago,Vilela,2012-10-22 01:19:56,,,0,0
4,11,ygorats@gmail.com,Ygor,Santos,2016-02-18 21:30:34,,,0,1
5,13,lazaro_neto_@hotmail.com,Lazaro Lacir,Moraes Neto,2016-06-16 13:28:27,1482.0,2016-10-08 23:05:24.148811,1,0
6,15,jonatacw@gmail.com,Jonata,Wieczynski,2016-06-22 17:35:10,,,0,0
7,17,laertejjunior@gmail.com,Laerte,Mercier,2016-06-13 21:59:01,,,0,0
8,19,victormalagoni@hotmail.com,Victor,Pires Malagoni,2016-06-22 19:33:19,,,0,0
9,20,a30673083@gmail.com,Jose,Ramirez,2012-02-07 15:08:46,3436.0,2017-01-06 22:07:58.437857 -02:00,1,1


### 2.3. Course Enrollments

In [23]:
course_enrollments = analytics_tables_course_enrollments[['user_id', 'course_key']]
course_enrollments = course_enrollments.groupby('user_id').count()
course_enrollments.reset_index(level=0, inplace=True)
course_enrollments = course_enrollments.join(auth_user[['username', 'email']].set_index('username'), on='user_id')
course_enrollments.rename(columns={'course_key': 'free_course_enrollments'}, inplace=True)
course_enrollments = course_enrollments[['email', 'free_course_enrollments']]
course_enrollments.head()

Unnamed: 0,email,free_course_enrollments
0,deivissondine@gmail.com,2
1,,1
2,,1
3,anajuliapc@gmail.com,4
4,,1


In [24]:
x = x.join(course_enrollments.set_index('email'), on='email')
x['free_course_enrollments'] = x['free_course_enrollments'].fillna(0).astype(int)

### 2.4. Visits

In [25]:
visits = frontend_brazil_pages[['anonymous_id', 'id', 'received_at', 'category', 'path', 'referrer', 'context_user_agent']]

In [26]:
visits['anonymous_id'].nunique()

1557111

In [27]:
visits.tail().transpose()

Unnamed: 0,5747988,5747989,5747990,5747991,5747992
anonymous_id,1ea7ab15-0bec-42f8-8a15-51aa84c45c6d,4f179432-4139-4103-8ce8-8644d758d43e,c08bfbb5-1503-4e42-ae06-b46e49c4091b,de214a12-f0c3-45ac-91f2-3682a1b9ad33,6056337d-5fd9-4dcb-9afc-185a194f0dcb
id,ajs-e0d838d947190d2769bb993cb2d961a6,ajs-fa0df758867d00ff765bf44bf9ae0fc8,ajs-99e482818408348e6975526c0df64b4e,ajs-ae8b282af2279c8c48f9d04185965cb6,ajs-af92549d96fde294c755fd83cf309035
received_at,2017-05-03 18:20:56.138,2017-05-03 18:21:06.56,2017-05-03 18:21:23.606,2017-05-03 18:21:39.275,2017-05-03 18:21:47.901
category,Home,,Events,Events,Home
path,/,/course/deep-learning-nanodegree-foundation--n...,/events/details/,/events/details/,/
referrer,,https://br.udacity.com/nanodegree/,,https://outlook.live.com/,https://www.google.com.br/
context_user_agent,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4...,Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...


In [27]:
frontend_brazil_pages.head().transpose()

Unnamed: 0,0,1,2,3,4
id,ajs-db95663aa0b526d08914883781e0627e,ajs-23e34fc46487ade7adfd918ab4581cac,ajs-d94d32c86b9762b6cce09161b821fd5c,ajs-ef355b8706d4d19f4eb338698afedf85,ajs-1ac9964f59ec489ea01f8258be4379d2
received_at,2016-06-26 17:26:11.324,2016-06-26 17:38:04.679,2016-06-26 17:46:06.876,2016-06-26 17:46:18.139,2016-06-26 18:01:39.44
uuid,188,220,144,16,148
uuid_ts,2016-06-26 19:42:03,2016-06-26 19:42:03,2016-06-26 19:42:03,2016-06-26 19:42:03,2016-06-26 19:42:03
anonymous_id,cadb8625-9d0e-4fac-bdb8-a05c22584240,f93df52a-c68e-439a-9b71-3861e7ebb85f,2be62243-beb5-4399-b530-b3b1d204b5a4,2be62243-beb5-4399-b530-b3b1d204b5a4,b26f2036-6e4e-4c81-b20a-77ff61e39851
category,Home,Sign In,Home,Home,Catalog
context_ip,201.27.126.193,179.223.24.175,201.27.126.193,201.27.126.193,81.164.235.85
context_library_name,analytics.js,analytics.js,analytics.js,analytics.js,analytics.js
context_library_version,2.11.1,2.11.1,2.11.1,2.11.1,2.11.1
context_page_path,/,/account/auth/signin/,/,/,/courses/nanodegrees/


In [28]:
visits.head().transpose()

Unnamed: 0,0,1,2,3,4
anonymous_id,cadb8625-9d0e-4fac-bdb8-a05c22584240,f93df52a-c68e-439a-9b71-3861e7ebb85f,2be62243-beb5-4399-b530-b3b1d204b5a4,2be62243-beb5-4399-b530-b3b1d204b5a4,b26f2036-6e4e-4c81-b20a-77ff61e39851
id,ajs-db95663aa0b526d08914883781e0627e,ajs-23e34fc46487ade7adfd918ab4581cac,ajs-d94d32c86b9762b6cce09161b821fd5c,ajs-ef355b8706d4d19f4eb338698afedf85,ajs-1ac9964f59ec489ea01f8258be4379d2
received_at,2016-06-26 17:26:11.324,2016-06-26 17:38:04.679,2016-06-26 17:46:06.876,2016-06-26 17:46:18.139,2016-06-26 18:01:39.44
category,Home,Sign In,Home,Home,Catalog
path,/,/account/auth/signin/,/,/,/courses/nanodegrees/
referrer,,https://br.udacity.com/,,https://br.udacity.com/account/auth/logout/,https://br.udacity.com/courses/all/
context_user_agent,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...


#### 2.4.1. Visit classifiers based on page path

In [29]:
visits['is_home'] = (visits['path'] == '/').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [30]:
visits['is_ndop'] = (visits['path'].str.contains('--nd')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [31]:
visits['is_catalog_all'] = (visits['path'] == '/courses/all/').astype(int)
visits['is_catalog_nanodegrees'] = (visits['path'] == '/courses/nanodegrees/').astype(int)
visits['is_nanodegree_home'] = (visits['path'] == '/nanodegree/').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [32]:
visits['is_fcop_ud'] = (visits['path'].str.contains('--ud')).astype(int)
visits['is_fcop_cs'] = (visits['path'].str.contains('--cs')).astype(int)
visits['is_fcop_st'] = (visits['path'].str.contains('--st')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [33]:
visits['is_signin'] = (visits['path'].str.contains('/signin/')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [34]:
visits['is_event'] = (visits['path'].str.contains('/events/')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [35]:
visits['is_50back'] = (visits['path'] == '/nanodegree/50-back/').astype(int)
visits['is_tech_requirements'] = (visits['path'] == '/tech-requirements//').astype(int)
visits['is_contact'] = (visits['path'] == '/contact/').astype(int)
visits['is_us'] = (visits['path'] == '/us/').astype(int)
visits['is_jobs'] = (visits['path'] == '/jobs/').astype(int)
visits['is_legal'] = (visits['path'] == '/legal/').astype(int)
visits['is_hire_talent'] = (visits['path'] == '/hire-talent/').astype(int)
visits['is_business'] = (visits['path'] == '/business/').astype(int)
visits['is_success'] = (visits['path'] == '/success/').astype(int)
visits['is_payment'] = (visits['path'] == '/payment/').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [36]:
visits['is_android'] = (visits['path'].str.contains('/android/')).astype(int)
visits['is_ai'] = (visits['path'].str.contains('/ai/')).astype(int)
visits['is_drive'] = (visits['path'].str.contains('/drive/')).astype(int)
visits['is_robotics'] = (visits['path'].str.contains('/robotics/')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [37]:
visits['is_checkout'] = (visits['path'].str.contains('/checkout')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [38]:
visits['anonymous_id'].nunique()

1557111

In [39]:
def fix_is_ndop(row):
    if row['is_checkout'] == 1:
        return 0
    else:
        return row['is_ndop']

visits['is_ndop'] = visits.apply(fix_is_ndop, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### 2.4.2. Visit classifier based on mobile/desktop

In [40]:
import re

def is_mobile(row):
    if re.search('/Android|webOS|iPhone|iPad|iPod|BlackBerry|IEMobile|Opera Mini/', str(row['context_user_agent'])):
        return 1
    else:
        return 0

visits['is_mobile'] = visits.apply(is_mobile, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### 2.4.3. Visit classifiers based on date/time

In [41]:
def is_weekday(row):
    datetime_object = datetime.strptime(row['received_at'].split(".")[0], '%Y-%m-%d %H:%M:%S')
    return int(datetime_object.isoweekday() in range(1, 6))

visits['is_weekday'] = visits.apply(is_weekday, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### 2.4.4. Visit classifiers based on referrer

In [42]:
visits['referrer'] = visits['referrer'].fillna('')
visits['is_referrer_google'] = (visits['referrer'].str.contains('.google.')).astype(int)
visits['is_referrer_facebook'] = (visits['referrer'].str.contains('.facebook.')).astype(int)
visits['is_referrer_live'] = (visits['referrer'].str.contains('.live.')).astype(int)
visits['is_referrer_infomoney'] = (visits['referrer'].str.contains('.infomoney.')).astype(int)
visits['is_referrer_catracalivre'] = (visits['referrer'].str.contains('.catracalivre.')).astype(int)
visits['is_referrer_android'] = (visits['referrer'].str.contains('.android.')).astype(int)
visits['is_referrer_anhanguera'] = (visits['referrer'].str.contains('anhanguera.')).astype(int)
visits['is_referrer_linkedin'] = (visits['referrer'].str.contains('.linkedin.')).astype(int)
visits['is_referrer_instagram'] = (visits['referrer'].str.contains('.instagram.')).astype(int)
visits['is_referrer_cbsi'] = (visits['referrer'].str.contains('.cbsi.')).astype(int)
visits['is_referrer_tecmundo'] = (visits['referrer'].str.contains('.tecmundo.')).astype(int)
visits['is_referrer_bing'] = (visits['referrer'].str.contains('.bing.')).astype(int)
visits['is_referrer_computerworld'] = (visits['referrer'].str.contains('.computerworld.')).astype(int)
visits['is_referrer_github'] = (visits['referrer'].str.contains('.github.')).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [43]:
visits.head().transpose()

Unnamed: 0,0,1,2,3,4
anonymous_id,cadb8625-9d0e-4fac-bdb8-a05c22584240,f93df52a-c68e-439a-9b71-3861e7ebb85f,2be62243-beb5-4399-b530-b3b1d204b5a4,2be62243-beb5-4399-b530-b3b1d204b5a4,b26f2036-6e4e-4c81-b20a-77ff61e39851
id,ajs-db95663aa0b526d08914883781e0627e,ajs-23e34fc46487ade7adfd918ab4581cac,ajs-d94d32c86b9762b6cce09161b821fd5c,ajs-ef355b8706d4d19f4eb338698afedf85,ajs-1ac9964f59ec489ea01f8258be4379d2
received_at,2016-06-26 17:26:11.324,2016-06-26 17:38:04.679,2016-06-26 17:46:06.876,2016-06-26 17:46:18.139,2016-06-26 18:01:39.44
category,Home,Sign In,Home,Home,Catalog
path,/,/account/auth/signin/,/,/,/courses/nanodegrees/
referrer,,https://br.udacity.com/,,https://br.udacity.com/account/auth/logout/,https://br.udacity.com/courses/all/
context_user_agent,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
is_home,1,0,1,1,0
is_ndop,0,0,0,0,0
is_catalog_all,0,0,0,0,0


In [44]:
visits['anonymous_id'].nunique()

1557111

In [62]:
# 1. Join with anonymous_id's email
visits = visits.join(frontend_brazil_identifies[['anonymous_id', 'email']].set_index('anonymous_id'), on='anonymous_id')

# 2. Join with 'date_joined' and 'register_date'
visits['received_at'] = visits.apply(lambda row: datetime.strptime(row['received_at'].split(".")[0], '%Y-%m-%d %H:%M:%S'), axis=1)

visits = visits.join(x[['email', 'register_date']].set_index('email'), on='email')
visits['register_date'] = visits['register_date'].fillna(str(datetime.now()))
visits['register_date'] = visits.apply(lambda row: datetime.strptime(row['register_date'].split(".")[0], '%Y-%m-%d %H:%M:%S'), axis=1)

# 3. Cut data after 'register_date'
visits['is_pre_register'] = visits.apply(lambda row: 1 if row['received_at'] <= row['register_date'] else 0, axis=1)
visits = visits.loc[lambda df: df['is_pre_register'] == 1]

visits.drop('email', axis=1, inplace=True)
visits.drop('register_date', axis=1, inplace=True)
visits.drop('is_pre_register', axis=1, inplace=True)

visits.head()

Unnamed: 0,anonymous_id,id,received_at,category,path,referrer,context_user_agent,is_home,is_ndop,is_catalog_all,...,is_referrer_catracalivre,is_referrer_android,is_referrer_anhanguera,is_referrer_linkedin,is_referrer_instagram,is_referrer_cbsi,is_referrer_tecmundo,is_referrer_bing,is_referrer_computerworld,is_referrer_github
0,cadb8625-9d0e-4fac-bdb8-a05c22584240,ajs-db95663aa0b526d08914883781e0627e,2016-06-26 17:26:11,Home,/,,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,f93df52a-c68e-439a-9b71-3861e7ebb85f,ajs-23e34fc46487ade7adfd918ab4581cac,2016-06-26 17:38:04,Sign In,/account/auth/signin/,https://br.udacity.com/,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,f93df52a-c68e-439a-9b71-3861e7ebb85f,ajs-23e34fc46487ade7adfd918ab4581cac,2016-06-26 17:38:04,Sign In,/account/auth/signin/,https://br.udacity.com/,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,f93df52a-c68e-439a-9b71-3861e7ebb85f,ajs-23e34fc46487ade7adfd918ab4581cac,2016-06-26 17:38:04,Sign In,/account/auth/signin/,https://br.udacity.com/,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,f93df52a-c68e-439a-9b71-3861e7ebb85f,ajs-23e34fc46487ade7adfd918ab4581cac,2016-06-26 17:38:04,Sign In,/account/auth/signin/,https://br.udacity.com/,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
visits['anonymous_id'].nunique()

1544595

#### 2.4.5. Grouping visits data by anonymous_id

In [64]:
f = {
    'id': ['count'],
    'received_at': ['first'],
    'is_home': ['sum'],
    'is_ndop': ['sum'],
    'is_catalog_all': ['sum'],
    'is_catalog_nanodegrees': ['sum'],
    'is_nanodegree_home': ['sum'],
    'is_fcop_ud': ['sum'],
    'is_fcop_cs': ['sum'],
    'is_fcop_st': ['sum'],
    'is_signin': ['sum'],
    'is_event': ['sum'],
    'is_50back': ['sum'],
    'is_tech_requirements': ['sum'],
    'is_contact': ['sum'],
    'is_us': ['sum'],
    'is_jobs': ['sum'],
    'is_legal': ['sum'],
    'is_hire_talent': ['sum'],
    'is_business': ['sum'],
    'is_success': ['sum'],
    'is_payment': ['sum'],
    'is_android': ['sum'],
    'is_ai': ['sum'],
    'is_drive': ['sum'],
    'is_robotics': ['sum'],
    'is_checkout': ['sum'],
    'is_mobile': ['sum'],
    'is_weekday': ['sum'],
    'is_referrer_google': ['sum'],
    'is_referrer_facebook': ['sum'],
    'is_referrer_live': ['sum'],
    'is_referrer_infomoney': ['sum'],
    'is_referrer_catracalivre': ['sum'],
    'is_referrer_android': ['sum'],
    'is_referrer_anhanguera': ['sum'],
    'is_referrer_linkedin': ['sum'],
    'is_referrer_instagram': ['sum'],
    'is_referrer_cbsi': ['sum'],
    'is_referrer_tecmundo': ['sum'],
    'is_referrer_bing': ['sum'],
    'is_referrer_computerworld': ['sum'],
    'is_referrer_github': ['sum']
}

grouped = visits.sort_values('received_at').groupby('anonymous_id', as_index=False).agg(f)

In [65]:
grouped.columns = grouped.columns.droplevel(-1)

In [66]:
grouped.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
anonymous_id,00000410-387a-4e84-80b2-1d74100a40c4,000006d2-112b-4ff9-851e-abf573e423ca,000009e0-d6f9-47a4-98e8-8c1c63882a39,00001058-fe00-46b8-abbf-33df36741ff6,00001120-7828-4000-b375-779d41e18165,000016ca-7bb9-478e-a4bd-9e7388588b4c,00001ff4-6466-4103-95b1-6c89c5ac0475,00002646-ee3d-4dd2-a3a0-085fa2bdc25a,000032e6-a79a-40ef-9778-92912006f8ff,00003525-6d74-4b8e-9dcc-c535fefbc938
is_nanodegree_home,0,0,0,0,0,0,0,0,0,0
is_catalog_all,0,0,0,9,0,0,0,0,0,0
is_fcop_ud,0,1,0,5,0,0,0,0,0,0
is_referrer_cbsi,0,1,0,0,0,0,0,0,0,0
is_mobile,0,0,0,0,0,0,1,0,0,0
is_jobs,0,0,0,0,0,0,0,0,0,0
is_referrer_computerworld,0,0,0,0,0,0,0,0,0,0
is_home,0,0,0,0,0,3,1,4,0,0
is_referrer_android,0,0,0,1,0,0,0,0,0,0


In [67]:
anonymous_ids = frontend_brazil_identifies[['anonymous_id', 'email']]
anonymous_ids.head()

Unnamed: 0,anonymous_id,email
0,44199f26-92c9-42b0-be2f-48c46d697787,lfoyoshida+test50@gmail.com
1,0d3bbbd1-aafa-481d-82ad-d5b9aff2eadd,adrianowalmeida+3@gmail.com
2,2c0f22a0-bc8b-471f-8885-3b73bbde39bd,paula.mecatronica@gmail.com
3,c4bf5dac-b078-4a51-bb76-dbb043ef05eb,evandro.fonseca@madeiramadeira.com.br
4,f5ac388e-5e65-4a91-b3e8-ba701f13ad2e,veks@openmailbox.org


In [68]:
grouped = grouped.join(anonymous_ids.set_index('anonymous_id'), on='anonymous_id')

In [69]:
grouped.head()

Unnamed: 0,anonymous_id,is_nanodegree_home,is_catalog_all,is_fcop_ud,is_referrer_cbsi,is_mobile,is_jobs,is_referrer_computerworld,is_home,is_referrer_android,...,is_android,is_tech_requirements,is_contact,is_referrer_infomoney,is_ai,is_fcop_st,is_fcop_cs,is_referrer_tecmundo,is_event,email
0,00000410-387a-4e84-80b2-1d74100a40c4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,000006d2-112b-4ff9-851e-abf573e423ca,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,000009e0-d6f9-47a4-98e8-8c1c63882a39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,00001058-fe00-46b8-abbf-33df36741ff6,0,9,5,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,
4,00001120-7828-4000-b375-779d41e18165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [70]:
grouped['email'] = grouped['email'].fillna('')
grouped = grouped.loc[lambda df: df['email'] != '']

In [71]:
grouped.head().transpose()

Unnamed: 0,10,38,41,48,101
anonymous_id,00003ac2-411b-41c4-a587-ded4d1dab6e7,00013304-5c18-4091-ab5f-445cbbe0034d,00015a78-47ef-45dc-8118-a13be7d35ab7,00018856-ccb7-4dc2-95b7-711856000499,00037ea0-ee64-4f24-9bb6-278e25778a35
is_nanodegree_home,1,4,3,2,1
is_catalog_all,1,5,0,0,0
is_fcop_ud,0,3,0,0,0
is_referrer_cbsi,0,0,0,0,0
is_mobile,0,0,0,0,0
is_jobs,0,0,0,0,0
is_referrer_computerworld,0,0,0,0,0
is_home,4,13,0,1,1
is_referrer_android,0,0,2,0,0


In [75]:
grouped['email'].nunique()

62432

#### 2.4.6. Grouping visits data by email

In [76]:
f = {
    'id': ['sum'],
    'received_at': ['first'],
    'is_home': ['sum'],
    'is_ndop': ['sum'],
    'is_catalog_all': ['sum'],
    'is_catalog_nanodegrees': ['sum'],
    'is_nanodegree_home': ['sum'],
    'is_fcop_ud': ['sum'],
    'is_fcop_cs': ['sum'],
    'is_fcop_st': ['sum'],
    'is_signin': ['sum'],
    'is_event': ['sum'],
    'is_50back': ['sum'],
    'is_tech_requirements': ['sum'],
    'is_contact': ['sum'],
    'is_us': ['sum'],
    'is_jobs': ['sum'],
    'is_legal': ['sum'],
    'is_hire_talent': ['sum'],
    'is_business': ['sum'],
    'is_success': ['sum'],
    'is_payment': ['sum'],
    'is_android': ['sum'],
    'is_ai': ['sum'],
    'is_drive': ['sum'],
    'is_robotics': ['sum'],
    'is_checkout': ['sum'],
    'is_mobile': ['sum'],
    'is_weekday': ['sum'],
    'is_referrer_google': ['sum'],
    'is_referrer_facebook': ['sum'],
    'is_referrer_live': ['sum'],
    'is_referrer_infomoney': ['sum'],
    'is_referrer_catracalivre': ['sum'],
    'is_referrer_android': ['sum'],
    'is_referrer_anhanguera': ['sum'],
    'is_referrer_linkedin': ['sum'],
    'is_referrer_instagram': ['sum'],
    'is_referrer_cbsi': ['sum'],
    'is_referrer_tecmundo': ['sum'],
    'is_referrer_bing': ['sum'],
    'is_referrer_computerworld': ['sum'],
    'is_referrer_github': ['sum']
}

grouped_visits = grouped.groupby('email', as_index=False).agg(f)
grouped_visits.columns = grouped_visits.columns.droplevel(-1)
grouped_visits.rename(columns={'id': 'count_visits'}, inplace=True)

In [81]:
grouped_visits.head().transpose()

Unnamed: 0,0,1,2,3,4
email,!Uúè§ÉvEÀ¶Ø=põÛ`m02·dìzúÇ¿ýá¥¦*ñ¿Nç-)$ë,+557588971838@gmail.com,0.adrian.axel@gmail.com,00001056631508sp@al.educacao.sp.gov.br,00col@bol.com.br
is_nanodegree_home,6,1,7,1,0
is_catalog_all,9,0,0,0,0
is_fcop_ud,18,0,0,0,0
is_referrer_cbsi,0,0,0,0,0
is_mobile,0,0,0,0,0
is_jobs,0,0,0,0,0
is_referrer_computerworld,0,0,0,0,0
is_home,0,0,4,2,0
is_referrer_android,0,2,0,0,0


In [79]:
anonymous_ids[['email', 'anonymous_id']].groupby('email', as_index=False).count().shape

(63677, 2)

In [88]:
grouped_visits.loc[lambda df: df['email'] == 'vinicius.zilinski@gmail.com'].transpose()

Unnamed: 0,59480
email,vinicius.zilinski@gmail.com
is_nanodegree_home,2
is_catalog_all,9
is_fcop_ud,2
is_referrer_cbsi,0
is_mobile,0
is_jobs,0
is_referrer_computerworld,0
is_home,14
is_referrer_android,0


#### 2.4.7. Finally, join group_visits and treat NaNs

In [80]:
x2 = x.join(grouped_visits.set_index('email'), on='email')
x.head()

Unnamed: 0,user_id,email,first_name,last_name,date_joined,subscription_id,register_date,is_paying_student,webinar_enrollments,free_course_enrollments
0,39,vinicius.zilinski@gmail.com,Vinicius,Zilinski,2016-06-21 19:32:27,,,0,0,4
1,6,leeo.apc@gmail.com,Leo,A.c,2013-12-30 01:16:23,,,0,1,20
2,8,sardinha.dias@gmail.com,Carlos,sardinha dias,2016-05-22 03:19:23,,,0,0,5
3,10,thiagonv@gmail.com,Thiago,Vilela,2012-10-22 01:19:56,,,0,0,12
4,11,ygorats@gmail.com,Ygor,Santos,2016-02-18 21:30:34,,,0,1,1


In [90]:
x2.rename(columns={'received_at': 'date_first_visit'}, inplace=True)
x2['subscription_id'] = x2['subscription_id'].fillna(0)
x2.head().transpose()

Unnamed: 0,0,1,2,3,4
user_id,39,6,8,10,11
email,vinicius.zilinski@gmail.com,leeo.apc@gmail.com,sardinha.dias@gmail.com,thiagonv@gmail.com,ygorats@gmail.com
first_name,Vinicius,Leo,Carlos,Thiago,Ygor
last_name,Zilinski,A.c,sardinha dias,Vilela,Santos
date_joined,2016-06-21 19:32:27,2013-12-30 01:16:23,2016-05-22 03:19:23,2012-10-22 01:19:56,2016-02-18 21:30:34
subscription_id,0,0,0,0,0
register_date,,,,,
is_paying_student,0,0,0,0,0
webinar_enrollments,0,1,0,0,1
free_course_enrollments,4,20,5,12,1


In [92]:
x2.columns.values

array(['user_id', 'email', 'first_name', 'last_name', 'date_joined',
       'subscription_id', 'register_date', 'is_paying_student',
       'webinar_enrollments', 'free_course_enrollments',
       'is_nanodegree_home', 'is_catalog_all', 'is_fcop_ud',
       'is_referrer_cbsi', 'is_mobile', 'is_jobs',
       'is_referrer_computerworld', 'is_home', 'is_referrer_android',
       'is_business', 'is_us', 'is_legal', 'is_referrer_google',
       'is_50back', 'is_payment', 'is_catalog_nanodegrees', 'is_drive',
       'is_success', 'is_referrer_linkedin', 'is_referrer_instagram',
       'is_checkout', 'is_referrer_facebook', 'is_referrer_bing',
       'date_first_visit', 'is_referrer_anhanguera', 'is_ndop',
       'is_referrer_github', 'is_weekday', 'is_referrer_catracalivre',
       'is_robotics', 'is_signin', 'is_hire_talent', 'is_referrer_live',
       'count_visits', 'is_android', 'is_tech_requirements', 'is_contact',
       'is_referrer_infomoney', 'is_ai', 'is_fcop_st', 'is_fcop_cs',
   

In [93]:
final_x = x2[['email', 'first_name', 'last_name', 'date_joined', 'date_first_visit', 'count_visits', 
             'webinar_enrollments', 'free_course_enrollments', 'is_home',
       'is_50back', 'is_signin', 'is_business', 'is_success',
       'is_referrer_instagram', 'is_referrer_android',
       'is_referrer_github', 'is_drive', 'is_jobs',
       'is_referrer_computerworld', 'is_us', 'is_referrer_catracalivre',
       'is_weekday', 'is_nanodegree_home', 'is_fcop_ud', 'is_hire_talent',
       'is_catalog_nanodegrees', 'is_mobile', 'is_ai', 'is_legal',
       'is_checkout', 'is_contact', 'is_referrer_live',
       'is_referrer_linkedin', 'is_referrer_google',
       'is_referrer_anhanguera', 'is_referrer_infomoney',
       'is_referrer_cbsi', 'is_catalog_all', 'is_robotics', 'is_event',
       'is_referrer_bing', 'is_payment', 'is_tech_requirements',
       'is_android', 'is_ndop', 'is_referrer_facebook', 'is_fcop_st',
       'is_referrer_tecmundo', 'is_fcop_cs', 'is_paying_student']]
final_x.head().transpose()

Unnamed: 0,0,1,2,3,4
email,vinicius.zilinski@gmail.com,leeo.apc@gmail.com,sardinha.dias@gmail.com,thiagonv@gmail.com,ygorats@gmail.com
first_name,Vinicius,Leo,Carlos,Thiago,Ygor
last_name,Zilinski,A.c,sardinha dias,Vilela,Santos
date_joined,2016-06-21 19:32:27,2013-12-30 01:16:23,2016-05-22 03:19:23,2012-10-22 01:19:56,2016-02-18 21:30:34
date_first_visit,2016-07-03 18:46:44,2017-03-17 22:59:53,2016-07-03 02:35:36,2016-07-21 17:36:04,2016-11-16 10:23:23
count_visits,40,1458,40,25,1
webinar_enrollments,0,1,0,0,1
free_course_enrollments,4,20,5,12,1
is_home,14,456,8,6,0
is_50back,0,0,0,0,0


In [95]:
### RESOLVER PROBLEMA ACIMA: RECOMECAR, MAS FAZER O SORTING DA TABELA VISITS NA LEITURA DO BANCO

71258

In [97]:
final_x.to_csv('features.csv')

In [96]:
final_x.loc[lambda df: ~df['date_first_visit'].isnull()].shape

(60528, 50)