Introduction
At its core, churn prediction is a classification problem, where the classes often are ‘churned’ and ‘active’. The prediction is based on historical data, including customer behavior, demographics, transaction history, and more.

Goal: well-balanced classification model.

Instructions:
Work in Python or R.
Examine the data.
While working through the layers of challenge, please leave comments in your code.
Share your solution in a notebook format with us.

Deliverables:

.ipynb or .R

Rules:
We understand your time is precious and would not want you to spend more than 6 to 8 hours on this over the span of one week max.


In [2]:
import pandas as pd
import numpy as np
import os

# Loading and displaying dataset

In [3]:
# Read the raw churn dataset; the relative path is resolved against the kernel's working directory.
churn_df = pd.read_csv(filepath_or_buffer="../data/task_data_churned.csv")

In [14]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

# Understanding columns


In [13]:
# List every column name to get an overview of the features and the churned_status label.
churn_df.columns

Index(['ws_users_activated', 'ws_users_deactivated', 'ws_users_invited',
       'action_create_project', 'action_export_report',
       'action_api_and_webhooks', 'action_time_entries_via_tracker',
       'action_start_trial', 'action_import_csv', 'action_create_invoice',
       'action_lock_entries', 'action_add_targets',
       'action_connect_quickbooks', 'action_create_expense',
       'action_project_budget', 'action_gps_tracking', 'action_screenshots',
       'action_create_custom_field', 'country', 'value_days_to_purchase',
       'value_number_of_active_months', 'value_transactions_number',
       'value_regular_seats', 'value_kiosk_seats', 'revenue',
       'churned_status'],
      dtype='object')

In [20]:
# Inspect the dtype pandas inferred for each column of the raw data.
churn_df.dtypes

ws_users_activated                   int64
ws_users_deactivated                 int64
ws_users_invited                     int64
action_create_project                int64
action_export_report                 int64
action_api_and_webhooks              int64
action_time_entries_via_tracker      int64
action_start_trial                   int64
action_import_csv                    int64
action_create_invoice                int64
action_lock_entries                  int64
action_add_targets                   int64
action_connect_quickbooks            int64
action_create_expense                int64
action_project_budget                int64
action_gps_tracking                float64
action_screenshots                 float64
action_create_custom_field         float64
country                             object
value_days_to_purchase               int64
value_number_of_active_months        int64
value_transactions_number            int64
value_regular_seats                  int64
value_kiosk

In [21]:
# Preview the first five rows to see what typical records look like.
churn_df.head()

Unnamed: 0,ws_users_activated,ws_users_deactivated,ws_users_invited,action_create_project,action_export_report,action_api_and_webhooks,action_time_entries_via_tracker,action_start_trial,action_import_csv,action_create_invoice,action_lock_entries,action_add_targets,action_connect_quickbooks,action_create_expense,action_project_budget,action_gps_tracking,action_screenshots,action_create_custom_field,country,value_days_to_purchase,value_number_of_active_months,value_transactions_number,value_regular_seats,value_kiosk_seats,revenue,churned_status
0,3,2,0,5,8,0,0,0,0,0,0,0,0,0,0,,,,Canada,2,0,6,3,0,184.925,No
1,6,1,0,35,106,0,33,0,1,0,5,8,0,0,3,,,3.0,United Kingdom,37,9,9,6,0,608.842,No
2,2,0,0,3,3,0,0,0,0,0,10,2,1,0,9,,1.0,,Florida,98,3,12,3,0,395.122,No
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,,Kentucky,0,1,2,1,0,25.974,Yes
4,1,0,0,0,0,1,0,0,0,0,0,0,2,0,0,,,,Ireland,21,2,3,1,0,38.961,Yes


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
churn_df.describe()

Unnamed: 0,ws_users_activated,ws_users_deactivated,ws_users_invited,action_create_project,action_export_report,action_api_and_webhooks,action_time_entries_via_tracker,action_start_trial,action_import_csv,action_create_invoice,action_lock_entries,action_add_targets,action_connect_quickbooks,action_create_expense,action_project_budget,action_gps_tracking,action_screenshots,action_create_custom_field,value_days_to_purchase,value_number_of_active_months,value_transactions_number,value_regular_seats,value_kiosk_seats,revenue
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,876.0,1044.0,443.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,5.619504,0.827738,0.158273,28.043965,22.709432,0.383293,19.479616,0.175859,0.622702,8.494005,1.634293,0.290568,0.081934,10.019185,10.459233,1.371005,1.417625,7.24605,61.286571,4.215827,5.728617,6.067946,0.257794,378.331825
std,11.36413,3.527056,0.784527,80.761092,80.884964,3.089846,114.85605,0.380777,4.770705,52.699928,7.180274,1.319093,0.688108,72.849346,37.851112,0.726969,0.791806,11.577418,85.179584,3.691711,4.893211,11.766325,2.95797,1007.971191
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,0.0,38.961
50%,2.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,24.0,3.0,4.0,2.0,0.0,105.7615
75%,6.0,0.0,0.0,26.0,15.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,2.0,7.0,84.75,7.0,8.0,6.0,0.0,333.45975
max,206.0,73.0,20.0,1923.0,1740.0,127.0,3382.0,1.0,120.0,1405.0,152.0,30.0,27.0,1785.0,829.0,8.0,11.0,106.0,420.0,14.0,90.0,215.0,117.0,27235.156


In [18]:
# Class balance of the target: number of rows per churned_status value.
churn_df['churned_status'].value_counts()

churned_status
No     1703
Yes     799
Name: count, dtype: int64

In [19]:
# Count the missing entries in each column of the raw data.
missing_values = churn_df.isna().sum()
print(missing_values)

ws_users_activated                    0
ws_users_deactivated                  0
ws_users_invited                      0
action_create_project                 0
action_export_report                  0
action_api_and_webhooks               0
action_time_entries_via_tracker       0
action_start_trial                    0
action_import_csv                     0
action_create_invoice                 0
action_lock_entries                   0
action_add_targets                    0
action_connect_quickbooks             0
action_create_expense                 0
action_project_budget                 0
action_gps_tracking                1626
action_screenshots                 1458
action_create_custom_field         2059
country                              84
value_days_to_purchase                0
value_number_of_active_months         0
value_transactions_number             0
value_regular_seats                   0
value_kiosk_seats                     0
revenue                               0


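As we can see here, `action_gps_tracking`, `action_screenshots` and `action_create_custom_field` are empty in most rows (1626, 1458 and 2059 of 2502, i.e. about 65%, 58% and 82%) — apparently a lot of people didn't want their GPS to be tracked, didn't add screenshots and didn't create custom fields. Since each of these columns is empty on more than 50% of the occasions, we can discard them entirely. Also, we can exclude the 84 records (about 3%) that don't have a country, because they make up a really small part of the sample.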

In [22]:
# Drop every feature that is missing in more than half of the rows.
# The list is derived from the data instead of being hard-coded, so the cell is
# idempotent: on a re-run no sparse columns are left and nothing is dropped,
# whereas dropping fixed column names a second time raises KeyError.
# NOTE(review): describe() reports a minimum of 1.0 for the sparse action_*
# columns, so NaN may simply mean "action never performed" -- filling with 0
# could preserve that signal; confirm before discarding these features.
MISSING_SHARE_THRESHOLD = 0.5
missing_share = churn_df.isnull().sum() / len(churn_df)
columns_to_exclude = list(missing_share.index[missing_share > MISSING_SHARE_THRESHOLD])
churn_df = churn_df.drop(columns=columns_to_exclude)

In [23]:
# Keep only the rows whose country is known.
churn_df = churn_df.dropna(subset=['country'])

In [24]:
# Verify that no missing values remain after the clean-up steps.
missing_values = churn_df.isna().sum()
print(missing_values)

ws_users_activated                 0
ws_users_deactivated               0
ws_users_invited                   0
action_create_project              0
action_export_report               0
action_api_and_webhooks            0
action_time_entries_via_tracker    0
action_start_trial                 0
action_import_csv                  0
action_create_invoice              0
action_lock_entries                0
action_add_targets                 0
action_connect_quickbooks          0
action_create_expense              0
action_project_budget              0
country                            0
value_days_to_purchase             0
value_number_of_active_months      0
value_transactions_number          0
value_regular_seats                0
value_kiosk_seats                  0
revenue                            0
churned_status                     0
dtype: int64


In [25]:
# Re-check the column dtypes of the cleaned frame to see which columns are still non-numeric.
churn_df.dtypes

ws_users_activated                   int64
ws_users_deactivated                 int64
ws_users_invited                     int64
action_create_project                int64
action_export_report                 int64
action_api_and_webhooks              int64
action_time_entries_via_tracker      int64
action_start_trial                   int64
action_import_csv                    int64
action_create_invoice                int64
action_lock_entries                  int64
action_add_targets                   int64
action_connect_quickbooks            int64
action_create_expense                int64
action_project_budget                int64
country                             object
value_days_to_purchase               int64
value_number_of_active_months        int64
value_transactions_number            int64
value_regular_seats                  int64
value_kiosk_seats                    int64
revenue                            float64
churned_status                      object
dtype: obje

Now the data doesn't have any missing values. 

Every column is now numeric, except `country` and the target `churned_status`, which are both of type `object`. So let's see how the countries are distributed.
 

In [38]:
# Compute the distinct country labels once, then report the labels and how many there are.
# NOTE: the values mix countries with US states (e.g. 'Florida', 'Kentucky').
unique_countries = churn_df['country'].unique()
print(f"Unique countries:{unique_countries}")
print(f"Number of unique countries:{len(unique_countries)}")


Unique countries:['Canada' 'United Kingdom' 'Florida' 'Kentucky' 'Ireland' 'Philippines'
 'Albania' 'South Africa' 'Latvia' 'Connecticut' 'Kyrgyzstan' 'California'
 'Maryland' 'Slovakia' 'Indonesia' 'Illinois' 'Alabama' 'France' 'Israel'
 'Germany' 'Angola' 'Ukraine' 'Belarus' 'Netherlands' 'Australia' 'Chile'
 'New Zealand' 'Switzerland' 'Singapore' 'Virginia' 'Ohio' 'New Jersey'
 'Seychelles' 'Puerto Rico' 'Arizona' 'North Carolina' 'Texas' 'Brazil'
 'Belgium' 'Barbados' 'Pennsylvania' 'Mexico' 'New York' 'Oregon'
 'Bangladesh' 'Massachusetts' 'Indiana' 'Washington' 'Hungary' 'Finland'
 'Michigan' 'Malaysia' 'India' 'Arkansas' 'Suriname' 'Croatia' 'Colorado'
 'Mississippi' 'Italy' 'Argentina' 'Dominican Republic' 'Spain'
 'Lithuania' 'Delaware' 'Tennessee' 'Austria' 'Bahamas' 'Armenia'
 'Cayman Islands' 'Poland' 'Costa Rica' 'Czechia' 'District of Columbia'
 'Jamaica' 'Georgia' 'Lebanon' 'Romania' 'Serbia' 'French Polynesia'
 'China' 'Alaska' 'Norway' 'Louisiana' 'Turkey' 'Hawaii' 'M

In [27]:
# Number of rows per country value, most frequent first.
country_counts = churn_df['country'].value_counts()

# Print the heading and the counts on separate lines.
print("Value counts of 'country' column:", country_counts, sep="\n")

Value counts of 'country' column:
country
California        247
Australia         156
Canada            146
Florida           125
United Kingdom    112
                 ... 
Qatar               1
Lebanon             1
Sri Lanka           1
Namibia             1
Belize              1
Name: count, Length: 155, dtype: int64


In [36]:
# Quartiles of the per-country row counts (how many rows a typical country
# value has), not of the 'country' column itself.
quartile_levels = [0.25, 0.50, 0.75]
country_quantiles = churn_df['country'].value_counts().quantile(quartile_levels)

# Print the heading (preceded by a blank line) and the quartiles.
print("\nQuantiles of 'country' column:", country_quantiles, sep="\n")


Quantiles of 'country' column:
0.25     2.0
0.50     6.0
0.75    13.5
Name: count, dtype: float64
