In [194]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [195]:
train = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")

train.shape, test.shape

((17499636, 19), (4393179, 19))

In [196]:
train.head()

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
0,200,M,Shlok,paid,Johnson,1749042,1538352001000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21
992,200,M,Shlok,paid,Johnson,1749042,1538352525000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21
1360,200,M,Shlok,paid,Johnson,1749042,1538352703000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21
1825,200,M,Shlok,paid,Johnson,1749042,1538352935000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21
2366,200,M,Shlok,paid,Johnson,1749042,1538353200000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21


In [213]:
train['page'].unique()

array(['NextSong', 'Downgrade', 'Help', 'Home', 'Thumbs Up', 'Add Friend',
       'Thumbs Down', 'Add to Playlist', 'Logout', 'About', 'Settings',
       'Save Settings', 'Cancel', 'Cancellation Confirmation',
       'Submit Downgrade', 'Roll Advert', 'Upgrade', 'Error',
       'Submit Upgrade'], dtype=object)

In [197]:
train.columns

Index(['status', 'gender', 'firstName', 'level', 'lastName', 'userId', 'ts',
       'auth', 'page', 'sessionId', 'location', 'itemInSession', 'userAgent',
       'method', 'length', 'song', 'artist', 'time', 'registration'],
      dtype='object')

In [198]:
train.dtypes

status                    int64
gender                   object
firstName                object
level                    object
lastName                 object
userId                   object
ts                        int64
auth                     object
page                     object
sessionId                 int64
location                 object
itemInSession             int64
userAgent                object
method                   object
length                  float64
song                     object
artist                   object
time             datetime64[us]
registration     datetime64[us]
dtype: object

In [199]:
train['time'].min(), train['time'].max()

(Timestamp('2018-10-01 00:00:01'), Timestamp('2018-11-20 00:00:00'))

**Conceptual Outline**:

Our key columns are userId, time and page. We want features that describe how the user behaved before the prediction point, not after, to prevent leakage from the future. 

Thus, we define a prediction time for each user. We want to predict churn in the next 10 days, thus we need a moment in time $T_\text{pred}$ at which we pretend to make the prediction. 

Then, there are two options. 1) The user churns (has a "cancellation confirmation" event), then we set $T_\text{pred}$ 10 days before $T_\text{churn}$. 2) The user does not churn in the training set, then we set the prediction time 10 days before the last observed event. $T_\text{pred} = T_\text{pred} - 10 \text{days}$.

Everythin we compute as features must only ever use events with time before $T_\text{pred}$, otherwise we have data leakage.

**Feature 1**:

I propose "recency of activity" , i.e. days since last event, as a feature predicting churn. Users who have not done anything for a long time are more likely to churn. For each user, we look at all events with timestamp before $T_\text{pred}$ and find the last event before $T_\text{pred}$.

A possible interpretation would be: Small value: The user was active right before the prediction point and has lower churn risk. Large value: The user was inactive for many days, higher churn risk.

**Feature 2:**

I propose short-term activity level as a predictor of churn. 

We define a seven day lookback window $[T_\text{pred} - 7 \text{days}, T_\text{pred}]$. Then, we count all events in this window and construct the feature "events_last_7d". 

Rationale: Many events in the last seven days before $T_\text{pred}$ indicates that the user likes the service, uses it regularly and thus is unlikely to churn. If the user barely uses the servide, he is more likely to cancel the subscription.

**Feature 2.1:**

A very similar feature with a seven days lookback window can be included with the sole difference that we only consider songs played as events in the window. In the end, users use the service to listen to songs and thus the number of songs played in the last seven days before prediction_ts might be a better indicator of churn inclination than the total amount of events. 

In [200]:
df = train.copy()

In [230]:
# create feature for observation window
def create_features(df, observation_start, observation_end):
    # Filter data for the observation window
    obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
    df_obs = df[obs_mask]
    
    # Initialize feature DataFrame
    user_ids = df_obs['userId'].unique()
    features = pd.DataFrame({'userId': user_ids})
    
    # Feature 1: Total events in observation window
    total_events = df_obs.groupby('userId').size().reset_index(name='total_events')
    features = features.merge(total_events, on='userId', how='left')
    
    # Feature 2: number of songs played
    songs_played = df_obs[df_obs['page'] == 'NextSong'].groupby('userId').size().reset_index(name='songs_played')
    features = features.merge(songs_played, on='userId', how='left')

    # Feature 3: Number of ads seen
    ads_seen = df_obs[df_obs['page'] == 'Roll Advert'].groupby('userId').size().reset_index(name='ads_seen')
    features = features.merge(ads_seen, on='userId', how='left')

    # Feature 4: Total duration of songs played
    song_duration = df_obs[df_obs['page'] == 'NextSong'].groupby('userId')['length'].sum().reset_index(name='total_song_duration')
    features = features.merge(song_duration, on='userId', how='left')
    
    # Fill NaN values with 0
    features.fillna(0, inplace=True)
    
    return features

In [229]:
# create one function for feature engineering
def feature_engineering(df):
    # global timestamps
    min_ts = df['time'].min()
    max_ts = df['time'].max()
    # define time deltas
    delta_7d = pd.Timedelta(days=7)
    delta_10d = pd.Timedelta(days=10)
    delta_14d = pd.Timedelta(days=14)

    # 14 DAYS WINDOW
    # first estimation window: observe 14 days after min_ts, predict churn in next 10 days
    observation_start = min_ts
    observation_end = observation_start + delta_14d
    prediction_end = observation_end + delta_10d

    # create mask for observation and prediction windows
    obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
    pred_mask = (df['time'] >= observation_end) & (df['time'] < prediction_end)

    # check whether churn occurred in prediction window
    churn_in_pred_window = df[pred_mask & (df['page'] == 'Cancellation Confirmation')]

    # create subset of users present in observation window
    users_in_obs_window = df[obs_mask]['userId'].unique()
    df = df[df['userId'].isin(users_in_obs_window)]

    # df contains users active in observation window
    # restrict dataset to prediction window
    df_pred_window = df[pred_mask]

    # add churn flag per user
    churned_users = churn_in_pred_window['userId'].unique()

    # user level table: userId and binary churn flag
    user_data = pd.DataFrame({'userId': users_in_obs_window})
    user_data['churned'] = np.where(
        user_data['userId'].isin(churned_users), 1, 0
    )

    # create features
    features_1 = create_features(df, observation_start, observation_end)

    # 7 DAYS WINDOW
    observation_start = min_ts
    observation_end = observation_start + delta_7d
    features_2 = create_features(df, observation_start, observation_end)

    # merge features to user_data
    user_data = user_data.merge(features_1, on='userId', how='left', suffixes=('', '_14d'))
    user_data = user_data.merge(features_2, on='userId', how='left', suffixes=('', '_7d'))

    # create ratio features: 14d / 7d
    user_data['event_ratio_14d_7d'] = user_data['total_events'] / (user_data['total_events_7d'] + 1) # avoid division by zero
    user_data['song_ratio_14d_7d'] = user_data['songs_played'] / (user_data['songs_played_7d'] + 1) # avoid division by zero

    # create difference features: 14d - 7d for song_duration
    user_data['song_duration_diff_14d_7d'] = user_data['total_song_duration'] - user_data['total_song_duration_7d']

    # return the final user_data DataFrame
    return user_data

In [231]:
user_data_train = feature_engineering(train)
user_data_train.head()

  df_pred_window = df[pred_mask]


Unnamed: 0,userId,churned,total_events,songs_played,ads_seen,total_song_duration,total_events_7d,songs_played_7d,ads_seen_7d,total_song_duration_7d,event_ratio_14d_7d,song_ratio_14d_7d,song_duration_diff_14d_7d
0,1749042,1,1129,954.0,0.0,238954.25147,535.0,460.0,0.0,114813.70156,2.106343,2.069414,124140.54991
1,1563081,0,28,25.0,0.0,6304.7201,28.0,25.0,0.0,6304.7201,0.965517,0.961538,0.0
2,1697168,0,429,344.0,18.0,85487.28599,301.0,247.0,8.0,60571.55164,1.42053,1.387097,24915.73435
3,1222580,0,1704,1424.0,7.0,356373.32347,873.0,745.0,0.0,190437.43526,1.949657,1.908847,165935.88821
4,1714398,0,540,461.0,2.0,118792.69868,335.0,281.0,2.0,71528.3101,1.607143,1.634752,47264.38858


**First Logit Baseline:**

Our dataset has 19140 user, of which 4271 are churners (22%). Thus, the data is class-imbalanced and naive accuracy is not reliable. E.g., we can compute balanced accuracy as a performance measure. 

First, we must create a train/validation split since we are not allowed to touch the test data in any of our steps before the final model is decided on. The train/validation split enables us to train a model on 80% of the train set users and evaluate generalization on the remaining 20%. By observing the validation set performance, we can tune hyperparameters without touching the final test set. As our data is user-level aggregated and we assume the behavior of the users to be independent, a random split is tenable. All features were computed solely based on data before $T_\text{pred}$, so no time leakage occurs.

As a first baseline, we use a logistic regression classifier that models 

$P(\text{churn} = 1 | X) = \sigma ( \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ...)$

In our case we believe days_since_last_event to have a positive impact on churn probability since the users are inactive and events_last_7d and songs_last_7d to have negative coefficients since more recent interactions indicate satisfaction of the user and a lower likelihood to cancel the plan.

We use logistic regression as a first baseline since it is fast to estimate, easy to interpret and can provide calibrated probabilities. 

**Why use the balanced accuracy?**

Since we have 78% non-churners in our data, predicting non-churn for all users would yield an accuracy of 78% despite the model being worthless. The balanced accuracy is

$\text{Balanced Accuracy} = \frac{\text{TPR} + \text{TNR}}{2}$

Thus, it gives equal weight to predicting churners and non-churners and penalizes models that ignore the minority, i.e., churners in our case.

**Procedure:**

We select the features days_since_last_event, events_last_7d and songs_last_7d and the target vector churned. Then, we perform a train/validation split with val_size=0.2 and stratification. This means that the 22% churn proportion is preserved both in the test and the validation set. 

Then, we fit the LogisticRegression, optionally with scaled features or regularization but first in the most simple specification. Using this model, we predict the churn on the validation set and compute as performance metrics the accuracy and the balanced accuracy. 

**OUTDATED**

**Export the user_base data**:

Now, we created a dataset that is user-aggregated and contains the different features without time leakage and the binary target variable "churned". We export this as a csv file such that we do not have to re-run the feature engineering code each time before fitting different classifier models.

**END OF OUTDATED**

In [233]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
# AutoML style implementation
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# select features and target variable
features = ['total_events', 'songs_played', 'total_events_7d', 'songs_played_7d',
            'event_ratio_14d_7d', 'song_ratio_14d_7d', 'ads_seen', 'ads_seen_7d',
            'total_song_duration', 'total_song_duration_7d', 'song_duration_diff_14d_7d']
target = 'churned'

X = user_data_train[features]
y = user_data_train[target]

# train validation split with stratification
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, 
    stratify=y,  # keeps churn proportion in both sets
    random_state=42
)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")

# use data and train-validation split from above
# define a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),  # we'll allow passthrough
    ('clf', LogisticRegression())  # will be replaced
])

param_distributions = [
    # Logistic Regression with regularization
    {
        'scaler': [StandardScaler(), 'passthrough'],
        'clf': [LogisticRegression(max_iter=1000, class_weight='balanced')],
        'clf__C': np.logspace(-2, 2, 10),
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs', 'liblinear']
    },
    # Random Forest
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [RandomForestClassifier(class_weight='balanced', random_state=42)],
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 5, 10, 20],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4]
    },
    # Gradient Boosting
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [GradientBoostingClassifier(random_state=42)],
        'clf__n_estimators': [100, 200, 300],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__max_depth': [3, 5, 7]
    },
    # LightGBM
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [LGBMClassifier(objective='binary', boosting_type='gbdt', class_weight='balanced', random_state=42)],
        'clf__num_leaves': [15, 31, 63, 127],
        'clf__max_depth': [-1, 5, 8, 12],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__n_estimators': [100, 200, 400],
        'clf__min_child_samples': [10, 20, 50],
        'clf__subsample': [0.6, 0.8, 1.0],
        'clf__colsample_bytree': [0.6, 0.8, 1.0]
    },
    # AdaBoost with Decision Tree base estimator
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [AdaBoostClassifier(
            estimator=DecisionTreeClassifier(class_weight='balanced', max_depth=1),
            random_state=42
        )],
        'clf__n_estimators': [50, 100, 200, 400],
        'clf__learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
    }
]

# Configure AutoML Search
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=50,  # number of parameter settings sampled
    scoring='balanced_accuracy',
    n_jobs=-1,
    cv=cv,
    verbose=2,
    random_state=42
)
# refit=True by default, the best hyperparameters are used to refit the model on the whole training set

# run the search
search.fit(X_train, y_train)

print("Best CV balanced acuracy:", search.best_score_)
print("Best parameters:")
for k, v in search.best_params_.items():
    print(f"  {k}: {v}")
best_model = search.best_estimator_

# predict the valdation set
y_pred = best_model.predict(X_val)

# performance metrics
acc = accuracy_score(y_val, y_pred)
bacc = balanced_accuracy_score(y_val, y_pred)

print("Classification report:")
print(classification_report(y_val, y_pred))
print(f"Accuracy: {acc:.4f}")
print(f"Balanced Accuracy: {bacc:.4f}")

Training set size: 13016 samples
Validation set size: 3255 samples
Fitting 5 folds for each of 50 candidates, totalling 250 fits


20 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\juliu\miniconda3\envs\pyenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\juliu\miniconda3\envs\pyenv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\juliu\miniconda3\envs\pyenv\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^

[LightGBM] [Info] Number of positive: 661, number of negative: 12355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2429
[LightGBM] [Info] Number of data points in the train set: 13016, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best CV balanced acuracy: 0.6369157022094252
Best parameters:
  scaler: passthrough
  clf__subsample: 1.0
  clf__num_leaves: 15
  clf__n_estimators: 100
  clf__min_child_samples: 10
  clf__max_depth: 12
  clf__learning_rate: 0.01
  clf__colsample_bytree: 0.6
  clf: LGBMClassifier(class_weight='balanced', objective='binary', random_state=42)
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.72      0.83      3090
           1       0.09      0.51  

In [234]:
best_model

0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,15
,max_depth,12
,learning_rate,0.01
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


**Test Estimation for Kaggle upload**

We run our best model on the so far unseen test data, compute the binary predictions and upload this attempt to kaggle.

In [235]:
# estimation for test set
def feature_engineering_test(df):
    # global timestamps
    min_ts = df['time'].min()
    max_ts = df['time'].max()

    delta_7d = pd.Timedelta(days=7)
    delta_14d = pd.Timedelta(days=14)

    # 14 DAYS WINDOW
    # observation window
    observation_start = max_ts - delta_14d
    observation_end = max_ts

    # create mask for observation window
    obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
    # create subset of users present in observation window
    users_in_obs_window = df[obs_mask]['userId'].unique()
    df_obs = df[df['userId'].isin(users_in_obs_window)]

    # user level table:
    user_data_obs = pd.DataFrame({'userId': users_in_obs_window})

    # create and merge features
    features_1 = create_features(df_obs, observation_start, observation_end)

    # 7 DAYS WINDOW
    observation_start = max_ts - delta_7d
    features_2 = create_features(df_obs, observation_start, observation_end)

    # merge features to user_data_obs
    user_data_obs = user_data_obs.merge(features_1, on='userId', how='left', suffixes=('', '_14d'))
    user_data_obs = user_data_obs.merge(features_2, on='userId', how='left', suffixes=('', '_7d'))
    # create ratio features: 14d / 7d
    user_data_obs['event_ratio_14d_7d'] = user_data_obs['total_events'] / (user_data_obs['total_events_7d'] + 1) # avoid division by zero
    user_data_obs['song_ratio_14d_7d'] = user_data_obs['songs_played'] / (user_data_obs['songs_played_7d'] + 1) # avoid division by zero

    # create difference features: 14d - 7d for song_duration
    user_data_obs['song_duration_diff_14d_7d'] = user_data_obs['total_song_duration'] - user_data_obs['total_song_duration_7d']
    
    # create set of all userIds
    all_users = df['userId'].unique()
    # merge user_data_obs with all_users
    user_base_test = pd.DataFrame({'userId': all_users})
    user_base_test = user_base_test.merge(user_data_obs, on='userId', how='left')

    return user_base_test

In [236]:
user_data_test = feature_engineering_test(test)
# fill missing values with 0
user_data_test.fillna(0, inplace=True)
user_data_test.head(10)

Unnamed: 0,userId,total_events,songs_played,ads_seen,total_song_duration,total_events_7d,songs_played_7d,ads_seen_7d,total_song_duration_7d,event_ratio_14d_7d,song_ratio_14d_7d,song_duration_diff_14d_7d
0,1465194,426.0,351.0,7.0,86564.85627,272.0,224.0,0.0,55449.83794,1.56044,1.56,31115.01833
1,1261737,166871.0,0.0,0.0,0.0,82289.0,0.0,0.0,0.0,2.027841,0.0,0.0
2,1527155,351.0,301.0,0.0,76267.38353,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1507202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1429412,2288.0,1923.0,0.0,473765.05769,1614.0,1357.0,0.0,334307.48585,1.416718,1.416053,139457.57184
5,1778785,1016.0,857.0,1.0,209068.33306,209.0,173.0,0.0,41526.7787,4.838095,4.925287,167541.55436
6,1776591,407.0,341.0,1.0,87213.03724,44.0,38.0,0.0,9214.38932,9.044444,8.74359,77998.64792
7,1937373,2345.0,1983.0,2.0,486856.08711,1244.0,1060.0,1.0,263746.54971,1.883534,1.868992,223109.5374
8,1959334,62.0,51.0,0.0,12820.74421,50.0,43.0,0.0,10908.76743,1.215686,1.159091,1911.97678
9,1138878,23.0,21.0,0.0,6604.32029,23.0,21.0,0.0,6604.32029,0.958333,0.954545,0.0


In [237]:
# select features 
print("Features:", features)
X = user_data_test[features]

# predict in two cases: churn (0) if no data available for user
missing_mask = X.isna().any(axis=1)

# initialize predictions array: churn prediction is zero
y_pred = np.zeros(X.shape[0], dtype=int)
# predict for users with data

# users with data
X_valid = X[~missing_mask]
y_pred_valid = best_model.predict(X_valid)

# write predictions in at correct positions
y_pred[~missing_mask] = y_pred_valid


# create dataframe with userId and predictions
predictions_df = pd.DataFrame({
    'id': user_data_test['userId'],
    'target': y_pred
})

predictions_df.head(10)

Features: ['total_events', 'songs_played', 'total_events_7d', 'songs_played_7d', 'event_ratio_14d_7d', 'song_ratio_14d_7d', 'ads_seen', 'ads_seen_7d', 'total_song_duration', 'total_song_duration_7d', 'song_duration_diff_14d_7d']


Unnamed: 0,id,target
0,1465194,1
1,1261737,0
2,1527155,0
3,1507202,0
4,1429412,1
5,1778785,1
6,1776591,1
7,1937373,1
8,1959334,0
9,1138878,0


In [238]:
X.shape, y_pred.shape

((2904, 11), (2904,))

In [239]:
# export as csv
predictions_df.to_csv("churn_predictions.csv", index=False)

In [240]:
predictions_df.shape

(2904, 2)

In [241]:
# count values of predictions_df target
predictions_df['target'].mean()

np.float64(0.3650137741046832)