In [3]:
from datalearn19intro import (get_accounts, get_events, get_subscriptions, get_users)

## Data manipulation
import pandas as pd
import numpy as np

## Modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load Data

In [4]:
# Fetch the four raw tables through the datalearn19intro loader functions.
# (Accounts could alternatively be read with pd.read_csv("Data/accounts.csv").)
acc, events, subs, users = (
    get_accounts(),
    get_events(),
    get_subscriptions(),
    get_users(),
)

In [5]:
# Change column names: tag every column with the name of the table it came
# from, so the origin of a column is still visible after tables are merged.
def _prefix_columns_once(frame, prefix):
    """Prepend `prefix` to every column label of `frame`, in place.

    The frame is left untouched when all of its labels already start with
    `prefix`. This makes the cell safe to re-run: without the guard a second
    execution would yield labels such as 'acc_acc_account_id' and break any
    later lookup of 'acc_account_id'.
    """
    if all(str(label).startswith(prefix) for label in frame.columns):
        return
    frame.columns = prefix + frame.columns


_prefix_columns_once(acc, 'acc_')
_prefix_columns_once(events, 'events_')
_prefix_columns_once(subs, 'subs_')
_prefix_columns_once(users, 'users_')

In [6]:
# Work on a copy of the accounts table; per-account user aggregates are
# joined onto it at the end of this cell.
data = acc.copy()

# Attach users to their account. The left join keeps accounts that have no
# matching user rows. The right-hand key duplicates 'acc_account_id', so it
# is dropped after the join.
acc_users = pd.merge(
    acc,
    users,
    left_on='acc_account_id',
    right_on='users_account_id',
    how='left',
)
acc_users = acc_users.drop(columns='users_account_id')

# One row per account: distinct user ids and distinct user cities
# (nunique ignores missing values).
users_by_account = acc_users.groupby(['acc_account_id'])
acc_users_grouped = users_by_account['users_user_id'].nunique().to_frame('unique_users')
acc_users_grouped['unique_cities_of_users'] = users_by_account['users_city'].nunique()
## add more aggregations
acc_users_grouped = acc_users_grouped.reset_index()

# Bring the aggregates back onto the account-level table.
data = pd.merge(
    data,
    acc_users_grouped,
    on='acc_account_id',
    how='left',
)

In [7]:
# Display the accounts table with the per-account user aggregates joined on.
data

Unnamed: 0,acc_account_id,acc_marketing_source,acc_marketing_referrer,acc_created_at,acc_plan_id,acc_trial_start,acc_started_plan_at,acc_signup_box_origin,acc_churn_state,acc_churn_date,...,acc_has_domain,acc_mrr,acc_lead_score,acc_industry.1,acc_team_size,acc_user_goal,acc_user_description,acc_sub_industry,unique_users,unique_cities_of_users
0,2793496,bing,https://www.bing.com/search?q=basecamp login,2019-01-01,,2019-01-01,,,none,,...,0,,0,,,,,,1,1
1,2793497,,,2019-01-01,,2019-01-01,,mobile_app,none,,...,0,,0,,,,,,2,2
2,2793498,adwordsverticals,https://www.google.com/,2019-01-01,,2019-01-01,,,none,,...,0,,0,Other,1,,,,1,1
3,2793499,,,2019-01-01,,2019-01-01,,mobile_app,none,,...,0,,0,,,,,,1,1
4,2793500,adwordsyoutube,https://www.youtube.com/,2019-01-01,,2019-04-04,,,none,,...,0,,0,Design,1,,,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,2794492,facebook,http://m.facebook.com,2019-01-01,,2019-01-01,,,none,,...,1,,0,,,,,,1,0
997,2794493,instagram,http://instagram.com/,2019-01-01,,2019-01-01,,,none,,...,0,,0,,,,,,1,1
998,2794494,,https://monday.com/,2019-01-01,,2019-01-01,,,none,,...,1,,0,Education,1,,,,1,1
999,2794495,instagram,,2019-01-01,,2019-01-01,,,none,,...,1,,0,,,,,,4,1


# Train-test split 

In [13]:
# Show only the float-typed columns of the merged table.
data.select_dtypes(include='float')

Unnamed: 0,acc_plan_id,acc_churn_reason,acc_time_diff,acc_free_users,acc_company_size,acc_last_upgrade_promotion_start,acc_cs_agent,acc_sales_agent,acc_disabled,acc_max_team_size,acc_min_team_size,acc_cancellation_on_renewal_date,acc_partner_id,acc_mrr,acc_user_goal,acc_user_description,acc_sub_industry
0,,,11.0,,,,,,,5.0,2.0,,,,,,
1,,,,,,,,,,5.0,2.0,,,,,,
2,,,-6.0,,,,,,,1.0,1.0,,,,,,
3,,,,,,,,,,,,,,,,,
4,,,-5.0,,,,,,,1.0,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,,,1.0,,5000.0,,,,,0.0,0.0,,200.0,,,,
997,,,-3.0,,,,,,,,,,,,,,
998,,,1.0,,670.0,,,,,1.0,1.0,,,,,,
999,,,,,15.0,,,,,10.0,6.0,,,,,,


In [10]:
# Target: the lead score, kept as a one-column DataFrame.
y = data.loc[:, ['acc_lead_score']]

# Features: every column except the target and the account identifier.
non_feature_columns = ['acc_lead_score', 'acc_account_id']
X = data.loc[:, ~data.columns.isin(non_feature_columns)]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,acc_marketing_source,acc_marketing_referrer,acc_created_at,acc_plan_id,acc_trial_start,acc_started_plan_at,acc_signup_box_origin,acc_churn_state,acc_churn_date,acc_churn_reason,...,acc_pricing_version,acc_has_domain,acc_mrr,acc_industry.1,acc_team_size,acc_user_goal,acc_user_description,acc_sub_industry,unique_users,unique_cities_of_users
250,capterra,https%3a%2f%2fwww.capterra.com%2f,2019-01-01,,2019-01-01,,,none,,,...,3,0,,Project Management,2-5,,,,1,1
940,facebook,http://m.facebook.com,2019-01-01,,2019-01-01,,,none,,,...,3,0,,,,,,,1,1
47,,https%3a%2f%2fmonday.com%2flang%2fes%2f,2019-01-01,,2019-01-01,,,none,,,...,3,1,,,,,,,1,1
969,,https://monday.com/,2019-01-01,,2019-01-14,,,none,,,...,3,1,,Marketing,6-10,,,,15,2
801,adwordssearch,https://www.google.com/,2019-01-01,,2019-01-01,,,none,,,...,3,1,,Consulting,6-10,,,,1,1
