In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# keep matplotlib interactive (figures render as interactive widgets in the notebook)
%matplotlib notebook
# static alternative, currently disabled: %matplotlib inline

# use the built-in 'ggplot' style sheet for every figure created below
plt.style.use('ggplot')

# Preface
Before building a model, we first need to make sure categorical data are encoded, and then split our data into a train and a test set. Because of class imbalance, we will need to use upsampling (SMOTE or ADASYN) or downsampling.


### Activity log features

In [2]:
# file names of the pickled activity-log features to load
act_pickle = [
    'song_length_full.p34',
    'song_total_full.p34',
    'song_unique_full.p34',
]

In [3]:
# import our activity features
activity_list = []

for act in act_pickle:
    # NOTE(review): pd.read_pickle can execute arbitrary code while loading;
    # only point this at trusted, locally generated files.
    proc_dir = os.path.join(os.pardir, 'data', 'processed', act)
    df_proc = pd.read_pickle(proc_dir)

    # remove data from February 2017 (last month)
    # Select the most recent value of column level 1 explicitly instead of taking
    # the positionally last one, so the result does not depend on column order.
    # Assumes level 1 holds orderable, datetime-like labels (they are formatted
    # with strftime further down) -- TODO confirm.
    last_month = df_proc.columns.get_level_values(level=1).max()
    df_proc = df_proc.drop(labels=last_month, axis=1, level=1)

    # collect activity dataframe
    activity_list.append(df_proc)

### Transaction features

In [4]:
# load the pickled transaction features from the processed-data folder
trans_proc_dir = os.path.join(
    os.pardir,
    'data',
    'processed',
    'transactions_February2017.p34',
)
df_trans = pd.read_pickle(trans_proc_dir)

In [5]:
# cast the boolean pay_id column to 64-bit integers
df_trans['pay_id'] = df_trans['pay_id'].astype('int64')

Encode plan duration and payment ID. Plan duration is an object type, so it is one-hot encoded below. Payment ID is a boolean, so it was cast to an integer (0/1) above instead of being one-hot encoded.

In [6]:
# one-hot encode the plan duration column; drop_first=True drops the first level
# of each encoded column
df_trans = pd.get_dummies(data=df_trans, drop_first=True)

In [7]:
# queue the transaction features for the column-wise concatenation below
# NOTE: not idempotent -- re-running this cell appends df_trans a second time
activity_list.append(df_trans)

### Prepare input matrix for model building

In [8]:
# add target values
# eligible users are provided in a csv file, indexed by user id (msno)
train_dir = os.path.join(os.pardir, 'data', 'raw', 'train.csv')
# Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed
# in 1.24, where it raises AttributeError. It was only ever an alias of bool.
s_users = pd.read_csv(train_dir, index_col='msno', dtype={'is_churn': bool})

In [9]:
# queue the churn target so it is concatenated as the last column
# NOTE: not idempotent -- re-running this cell appends s_users a second time
activity_list.append(s_users)

In [10]:
# place every collected frame (activity, transactions, target) side by side,
# aligned on the row index
df_full = pd.concat(objs=activity_list, axis=1)

In [11]:
# Eligible users without any activity features end up with NaN after the
# concatenation; replace every NaN with 0.
# (Per the original note, all eligible users are present in the transaction features.)
df_full = df_full.fillna(0)

In [12]:
# flatten column labels: tuples coming from the MultiIndex become
# '<level0>_<Mon>_<YYYY>', every other label is kept unchanged
col_names = df_full.columns
name_formatted = []
for col in col_names:
    if isinstance(col, tuple):
        month_tag = col[1].strftime('%b_%Y')
        name_formatted.append('_'.join([col[0], month_tag]))
    else:
        name_formatted.append(col)
df_full.columns = name_formatted

In [13]:
# preview the first rows of the combined feature table
df_full.head()

Unnamed: 0,num_25_Sep_2016,num_25_Oct_2016,num_25_Nov_2016,num_25_Dec_2016,num_25_Jan_2017,num_50_Sep_2016,num_50_Oct_2016,num_50_Nov_2016,num_50_Dec_2016,num_50_Jan_2017,...,auto_renew_ratio,uninterrupted_days,pay_id,plan_duration_180 - 269,plan_duration_270 - 364,plan_duration_30 - 89,plan_duration_365 - 485,plan_duration_8 - 29,plan_duration_90 - 179,is_churn
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0.0,0.0,4.266667,6.225806,4.354839,0.0,0.0,2.366667,2.83871,2.225806,...,1.0,91,1,0,0,1,0,0,0,False
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1.766667,1.032258,1.966667,1.387097,3.483871,0.933333,0.387097,0.766667,0.806452,1.16129,...,1.0,181,0,0,0,1,0,0,0,False
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,5.466667,6.096774,3.666667,5.354839,9.322581,1.966667,2.709677,1.4,2.354839,5.645161,...,1.0,762,1,0,0,1,0,0,0,False
++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,337,1,0,0,1,0,0,0,False
++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,181,0,0,0,1,0,0,0,False


In [14]:
# list every column with its non-null count and dtype
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 992931 entries, +++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o= to zzzN9thH22os1dRS0VHReY/8FTfGHOi86//d+wGGFsQ=
Data columns (total 68 columns):
num_25_Sep_2016            992931 non-null float64
num_25_Oct_2016            992931 non-null float64
num_25_Nov_2016            992931 non-null float64
num_25_Dec_2016            992931 non-null float64
num_25_Jan_2017            992931 non-null float64
num_50_Sep_2016            992931 non-null float64
num_50_Oct_2016            992931 non-null float64
num_50_Nov_2016            992931 non-null float64
num_50_Dec_2016            992931 non-null float64
num_50_Jan_2017            992931 non-null float64
num_75_Sep_2016            992931 non-null float64
num_75_Oct_2016            992931 non-null float64
num_75_Nov_2016            992931 non-null float64
num_75_Dec_2016            992931 non-null float64
num_75_Jan_2017            992931 non-null float64
num_985_Sep_2016           992931 non-null float

# Train-Test split

In [15]:
from sklearn.model_selection import train_test_split

# separate the churn target from the feature matrix (both as NumPy arrays)
y = df_full['is_churn'].values
X = df_full.drop(labels='is_churn', axis=1).values

# hold out 10% of the rows as a test set, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=7,
    stratify=y,
)

In [16]:
# shapes of the train and test feature matrices, one per line
print(X_train.shape, X_test.shape, sep='\n')

(893637, 67)
(99294, 67)


# Logistic regression with regularization

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

In [None]:
# imbalanced-learn renamed SMOTE's `ratio` argument to `sampling_strategy` and later
# removed `ratio`; use whichever name the installed version exposes.
smote = SMOTE(random_state=7)
smote_strategy = 'sampling_strategy' if 'sampling_strategy' in smote.get_params() else 'ratio'

# scale -> oversample the minority class -> classify; the imblearn pipeline applies
# the sampler only while fitting
log_clf = imbPipeline([('scaler', MinMaxScaler()),
                       ('imb', smote),
                       # liblinear supports both the l1 and l2 penalties searched below
                       # (the default solver of recent scikit-learn releases rejects l1);
                       # random_state makes the fit reproducible
                       ('clf', LogisticRegression(solver='liblinear', random_state=7)),
                       ])

# use regular SMOTE, we only need k_neighbors and the resampling strategy
parameters = {'imb__' + smote_strategy: ['minority'],
              'imb__k_neighbors': [5, 10, 15],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [0.1, 0.3, 1, 3],
              }

# set up grid search using all of my cores (5-fold CV, scored with F1)
gs_clf = GridSearchCV(log_clf, param_grid=parameters, cv=5, scoring='f1', n_jobs=-1, verbose=1)

In [None]:
# run the cross-validated grid search over the pipeline on the training split
_ = gs_clf.fit(X=X_train, y=y_train)

In [None]:
# score of the fitted grid search (its configured 'f1' scoring) on the training
# split, then on the held-out test split
train_score = gs_clf.score(X_train, y_train)
print('Train score = ', train_score)
test_score = gs_clf.score(X_test, y_test)
print('Test score = ', test_score)

In [None]:
# predict on the held-out test split and print the per-class report
y_pred = gs_clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# ROC curve of the tuned pipeline on the held-out test split
# column 1 of predict_proba is the probability of the positive (churn) class
y_pred_prob = gs_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

figroc, axroc = plt.subplots(figsize=(8, 5))
# dashed diagonal = performance of a random classifier
axroc.plot([0, 1], [0, 1], 'k--', label='Chance')
axroc.plot(fpr, tpr, label='Logistic Regression')
_ = axroc.set_xlabel('False Positive Rate')
_ = axroc.set_ylabel('True Positive Rate')
_ = axroc.set_title('Logistic Regression ROC Curve')
# the labels passed to plot() only become visible once a legend is drawn
_ = axroc.legend(loc='lower right')

In [None]:
# get column names except target
names = df_full.drop(labels='is_churn', axis=1).columns
# get coefficients of the best logistic regression model found by the grid search
# (despite the variable name, the selected penalty may be l1 or l2)
l1_coef = gs_clf.best_estimator_.named_steps['clf'].coef_
# flatten to one value per feature for plotting and ranking
coef_flat = l1_coef.ravel()

# make plot
figimp, aximp = plt.subplots(figsize=(8, 5))
_ = aximp.plot(range(len(names)), coef_flat)
_ = plt.xticks(range(len(names)), names, rotation=60)
_ = aximp.set_ylabel('Coefficients')
figimp.tight_layout()

# Rank by magnitude: a strongly negative coefficient is as influential as a strongly
# positive one, so a plain argmax() on signed values would overlook it. Features are
# min-max scaled in the pipeline, which keeps the magnitudes comparable.
print('Most important feature =', names[np.abs(coef_flat).argmax()])