In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
train = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")

train.shape, test.shape

((17499636, 19), (4393179, 19))

In [3]:
train.head()

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
0,200,M,Shlok,paid,Johnson,1749042,1538352001000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21
992,200,M,Shlok,paid,Johnson,1749042,1538352525000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21
1360,200,M,Shlok,paid,Johnson,1749042,1538352703000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21
1825,200,M,Shlok,paid,Johnson,1749042,1538352935000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21
2366,200,M,Shlok,paid,Johnson,1749042,1538353200000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21


In [4]:
train.columns

Index(['status', 'gender', 'firstName', 'level', 'lastName', 'userId', 'ts',
       'auth', 'page', 'sessionId', 'location', 'itemInSession', 'userAgent',
       'method', 'length', 'song', 'artist', 'time', 'registration'],
      dtype='object')

In [5]:
train.dtypes

status                    int64
gender                   object
firstName                object
level                    object
lastName                 object
userId                   object
ts                        int64
auth                     object
page                     object
sessionId                 int64
location                 object
itemInSession             int64
userAgent                object
method                   object
length                  float64
song                     object
artist                   object
time             datetime64[us]
registration     datetime64[us]
dtype: object

In [6]:
train['time'].min(), train['time'].max()

(Timestamp('2018-10-01 00:00:01'), Timestamp('2018-11-20 00:00:00'))

**Conceptual Outline**:

Our key columns are userId, time and page. We want features that describe how the user behaved before the prediction point, not after, to prevent leakage from the future. 

Thus, we define a prediction time for each user. We want to predict churn in the next 10 days, thus we need a moment in time $T_\text{pred}$ at which we pretend to make the prediction. 

Then, there are two options. 1) The user churns (has a "cancellation confirmation" event), then we set $T_\text{pred}$ 10 days before $T_\text{churn}$. 2) The user does not churn in the training set, then we set the prediction time 10 days before the last observed event. $T_\text{pred} = T_\text{pred} - 10 \text{days}$.

Everythin we compute as features must only ever use events with time before $T_\text{pred}$, otherwise we have data leakage.

**Feature 1**:

I propose "recency of activity" , i.e. days since last event, as a feature predicting churn. Users who have not done anything for a long time are more likely to churn. For each user, we look at all events with timestamp before $T_\text{pred}$ and find the last event before $T_\text{pred}$.

A possible interpretation would be: Small value: The user was active right before the prediction point and has lower churn risk. Large value: The user was inactive for many days, higher churn risk.

**Feature 2:**

I propose short-term activity level as a predictor of churn. 

We define a seven day lookback window $[T_\text{pred} - 7 \text{days}, T_\text{pred}]$. Then, we count all events in this window and construct the feature "events_last_7d". 

Rationale: Many events in the last seven days before $T_\text{pred}$ indicates that the user likes the service, uses it regularly and thus is unlikely to churn. If the user barely uses the servide, he is more likely to cancel the subscription.

**Feature 2.1:**

A very similar feature with a seven days lookback window can be included with the sole difference that we only consider songs played as events in the window. In the end, users use the service to listen to songs and thus the number of songs played in the last seven days before prediction_ts might be a better indicator of churn inclination than the total amount of events. 

In [7]:
df = train.copy()

In [8]:
# global timestamps
min_ts = df['time'].min()
max_ts = df['time'].max()
# define time deltas 
delta_7d = pd.Timedelta(days=7)
delta_10d = pd.Timedelta(days=10)

# first estimation window: observe 7 days after min_ts, predict churn in next 10 days
observation_start = min_ts
observation_end = observation_start + delta_7d
prediction_end = observation_end + delta_10d

# create mask for observation and prediction windows
obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
pred_mask = (df['time'] >= observation_end) & (df['time'] < prediction_end)

# check whether churn occurred in prediction window
churn_in_pred_window = df[pred_mask & (df['page'] == 'Cancellation Confirmation')]

# create subset of users present in observation window
users_in_obs_window = df[obs_mask]['userId'].unique()
df = df[df['userId'].isin(users_in_obs_window)]

# df contains users active in observation window
# restrict dataset to prediction window
df_pred_window = df[pred_mask]

# add churn flag per user
churned_users = churn_in_pred_window['userId'].unique()

# user level table: userId and binary churn flag
user_data = pd.DataFrame({'userId': users_in_obs_window})
user_data['churned'] = np.where(
    user_data['userId'].isin(churned_users), 1, 0
)

  df_pred_window = df[pred_mask]


In [9]:
user_data['churned'].value_counts()

churned
0    12654
1      854
Name: count, dtype: int64

In [10]:
user_data

Unnamed: 0,userId,churned
0,1749042,0
1,1563081,0
2,1697168,0
3,1222580,0
4,1714398,0
...,...,...
13503,1740270,0
13504,1486999,0
13505,1216336,0
13506,1775697,0


In [11]:
# create feature for observation window
def create_features(df, observation_start, observation_end):
    # Filter data for the observation window
    obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
    df_obs = df[obs_mask]
    
    # Initialize feature DataFrame
    user_ids = df_obs['userId'].unique()
    features = pd.DataFrame({'userId': user_ids})
    
    # Feature 1: Total events in observation window
    total_events = df_obs.groupby('userId').size().reset_index(name='total_events')
    features = features.merge(total_events, on='userId', how='left')
    
    # Feature 2: number of songs played
    songs_played = df_obs[df_obs['page'] == 'NextSong'].groupby('userId').size().reset_index(name='songs_played')
    features = features.merge(songs_played, on='userId', how='left')
    

    # New features: counts of specific page events
    page_features = {
        # Negative experiences
        'roll_advert': 'Roll Advert',
        'downgrade': 'Downgrade',
        'thumbs_down': 'Thumbs Down',
        # Positive experiences
        'upgrade': 'Upgrade',
        'add_playlist': 'Add to Playlist',
        'add_friend': 'Add Friend',
        'thumbs_up': 'Thumbs Up',
    }
    
    for col_name, page_name in page_features.items():
        page_counts = (
            df_obs[df_obs['page'] == page_name]
            .groupby('userId')
            .size()
            .reset_index(name=col_name)
        )
        features = features.merge(page_counts, on='userId', how='left')
    
    # Fill NaN values with 0 (users who never triggered a given page)
    features.fillna(0, inplace=True)
    
    return features

In [12]:
# create features for first observation window, then merge to user_data
features_1 = create_features(df, observation_start, observation_end)
user_data = user_data.merge(features_1, on='userId', how='left')

In [13]:
user_data

Unnamed: 0,userId,churned,total_events,songs_played,roll_advert,downgrade,thumbs_down,upgrade,add_playlist,add_friend,thumbs_up
0,1749042,0,535,460.0,0.0,2.0,7.0,0.0,13.0,5.0,19.0
1,1563081,0,28,25.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1697168,0,301,247.0,8.0,4.0,3.0,2.0,6.0,3.0,10.0
3,1222580,0,873,745.0,0.0,10.0,8.0,0.0,23.0,8.0,34.0
4,1714398,0,335,281.0,2.0,1.0,4.0,0.0,10.0,8.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...
13503,1740270,0,24,22.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
13504,1486999,0,27,21.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
13505,1216336,0,24,15.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0
13506,1775697,0,17,13.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0


In [14]:
# create one function for feature engineering
def feature_engineering(df):
    # global timestamps
    min_ts = df['time'].min()
    max_ts = df['time'].max()
    # define time deltas
    delta_7d = pd.Timedelta(days=7)
    delta_10d = pd.Timedelta(days=10)
    delta_14d = pd.Timedelta(days=14)

    # first estimation window: observe 14 days after min_ts, predict churn in next 10 days
    observation_start = min_ts
    observation_end = observation_start + delta_14d
    prediction_end = observation_end + delta_10d

    # create mask for observation and prediction windows
    obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
    pred_mask = (df['time'] >= observation_end) & (df['time'] < prediction_end)

    # check whether churn occurred in prediction window
    churn_in_pred_window = df[pred_mask & (df['page'] == 'Cancellation Confirmation')]

    # create subset of users present in observation window
    users_in_obs_window = df[obs_mask]['userId'].unique()
    df = df[df['userId'].isin(users_in_obs_window)]

    # df contains users active in observation window
    # restrict dataset to prediction window
    df_pred_window = df[pred_mask]

    # add churn flag per user
    churned_users = churn_in_pred_window['userId'].unique()

    # user level table: userId and binary churn flag
    user_data = pd.DataFrame({'userId': users_in_obs_window})
    user_data['churned'] = np.where(
        user_data['userId'].isin(churned_users), 1, 0
    )

    # create and merge features
    features_1 = create_features(df, observation_start, observation_end)
    user_data = user_data.merge(features_1, on='userId', how='left')

    # return the final user_data DataFrame
    return user_data

In [15]:
user_data_train = feature_engineering(train)
user_data_train.head()

  df_pred_window = df[pred_mask]


Unnamed: 0,userId,churned,total_events,songs_played,roll_advert,downgrade,thumbs_down,upgrade,add_playlist,add_friend,thumbs_up
0,1749042,1,1129,954.0,0.0,7.0,9.0,0.0,31.0,18.0,46.0
1,1563081,0,28,25.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1697168,0,429,344.0,18.0,4.0,5.0,2.0,8.0,3.0,16.0
3,1222580,0,1704,1424.0,7.0,15.0,15.0,4.0,40.0,25.0,68.0
4,1714398,0,540,461.0,2.0,2.0,5.0,0.0,13.0,10.0,24.0


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
# AutoML style implementation
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# select features and target variable
features = [
    'total_events',
    'songs_played',
    'roll_advert',
    'downgrade',
    'thumbs_down',
    'upgrade',
    'add_playlist',
    'add_friend',
    'thumbs_up',
]
target = 'churned'

X = user_data_train[features]
y = user_data_train[target]

# train validation split with stratification
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, 
    stratify=y,  # keeps churn proportion in both sets
    random_state=42
)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")

# use data and train-validation split from above
# define a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),  # we'll allow passthrough
    ('clf', LogisticRegression())  # will be replaced
])

param_distributions = [
    # Logistic Regression with regularization
    {
        'scaler': [StandardScaler(), 'passthrough'],
        'clf': [LogisticRegression(max_iter=1000, class_weight='balanced')],
        'clf__C': np.logspace(-2, 2, 10),
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs', 'liblinear']
    },
    # Random Forest
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [RandomForestClassifier(class_weight='balanced', random_state=42)],
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 5, 10, 20],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4]
    },
    # Gradient Boosting
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [GradientBoostingClassifier(random_state=42)],
        'clf__n_estimators': [100, 200, 300],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__max_depth': [3, 5, 7]
    },
    # LightGBM
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [LGBMClassifier(objective='binary', boosting_type='gbdt', class_weight='balanced', random_state=42)],
        'clf__num_leaves': [15, 31, 63, 127],
        'clf__max_depth': [-1, 5, 8, 12],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__n_estimators': [100, 200, 400],
        'clf__min_child_samples': [10, 20, 50],
        'clf__subsample': [0.6, 0.8, 1.0],
        'clf__colsample_bytree': [0.6, 0.8, 1.0]
    },
    # AdaBoost with Decision Tree base estimator
    {
        'scaler': ['passthrough'],  # trees do not need scaling
        'clf': [AdaBoostClassifier(
            estimator=DecisionTreeClassifier(class_weight='balanced', max_depth=1),
            random_state=42
        )],
        'clf__n_estimators': [50, 100, 200, 400],
        'clf__learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
    }
]

# Configure AutoML Search
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=50,  # number of parameter settings sampled
    scoring='balanced_accuracy',
    n_jobs=-1,
    cv=cv,
    verbose=2,
    random_state=42
)
# refit=True by default, the best hyperparameters are used to refit the model on the whole training set

# run the search
search.fit(X_train, y_train)

print("Best CV balanced acuracy:", search.best_score_)
print("Best parameters:")
for k, v in search.best_params_.items():
    print(f"  {k}: {v}")
best_model = search.best_estimator_

# predict the valdation set
y_pred = best_model.predict(X_val)

# performance metrics
acc = accuracy_score(y_val, y_pred)
bacc = balanced_accuracy_score(y_val, y_pred)

print("Classification report:")
print(classification_report(y_val, y_pred))
print(f"Accuracy: {acc:.4f}")
print(f"Balanced Accuracy: {bacc:.4f}")

Training set size: 13016 samples
Validation set size: 3255 samples
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Number of positive: 529, number of negative: 9884
[LightGBM] [Info] Number of positive: 528, number of negative: 9884
[LightGBM] [Info] Number of positive: 529, number of negative: 9884
[LightGBM] [Info] Number of positive: 529, number of negative: 9884
[LightGBM] [Info] Number of positive: 529, number of negative: 9884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 884
[LightGBM] [Info] Number of data points in the train set: 10413, number of used features: 9
[LightGBM] [Info] Number of positive: 528, number



[CV] END clf=LGBMClassifier(class_weight='balanced', objective='binary', random_state=42), clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=5, clf__min_child_samples=10, clf__n_estimators=200, clf__num_leaves=31, clf__subsample=0.6, scaler=passthrough; total time=   5.3s




[CV] END clf=LGBMClassifier(class_weight='balanced', objective='binary', random_state=42), clf__colsample_bytree=0.8, clf__learning_rate=0.01, clf__max_depth=-1, clf__min_child_samples=50, clf__n_estimators=400, clf__num_leaves=15, clf__subsample=1.0, scaler=passthrough; total time=   8.2s




[CV] END clf=LGBMClassifier(class_weight='balanced', objective='binary', random_state=42), clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=-1, clf__min_child_samples=50, clf__n_estimators=100, clf__num_leaves=31, clf__subsample=0.8, scaler=passthrough; total time=   3.7s
[CV] END clf=LGBMClassifier(class_weight='balanced', objective='binary', random_state=42), clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=5, clf__min_child_samples=10, clf__n_estimators=200, clf__num_leaves=31, clf__subsample=0.6, scaler=passthrough; total time=   5.6s




[CV] END clf=LGBMClassifier(class_weight='balanced', objective='binary', random_state=42), clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=-1, clf__min_child_samples=50, clf__n_estimators=100, clf__num_leaves=31, clf__subsample=0.8, scaler=passthrough; total time=   3.7s
[LightGBM] [Info] Number of positive: 528, number of negative: 9884
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 884
[LightGBM] [Info] Number of data points in the train set: 10412, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[CV] END clf=LGBMClassifier(class_weight='balanced', objective='binary', random_state=42), clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=

In [17]:
best_model

**Test Estimation for Kaggle upload**

We run our best model on the so far unseen test data, compute the binary predictions and upload this attempt to kaggle.

In [18]:
# estimation for train
def feature_engineering_test(df):
    # global timestamps
    min_ts = df['time'].min()
    max_ts = df['time'].max()

    delta_7d = pd.Timedelta(days=7)
    delta_14d = pd.Timedelta(days=14)

    # observation window
    observation_start = max_ts - delta_14d
    observation_end = max_ts

    # create mask for observation window
    obs_mask = (df['time'] >= observation_start) & (df['time'] < observation_end)
    # create subset of users present in observation window
    users_in_obs_window = df[obs_mask]['userId'].unique()
    df_obs = df[df['userId'].isin(users_in_obs_window)]

    # user level table:
    user_data_obs = pd.DataFrame({'userId': users_in_obs_window})

    # create and merge features
    features_1 = create_features(df_obs, observation_start, observation_end)
    user_data_obs = user_data_obs.merge(features_1, on='userId', how='left')
    # create set of all userIds
    all_users = df['userId'].unique()
    # merge user_data_obs with all_users
    user_base_test = pd.DataFrame({'userId': all_users})
    user_base_test = user_base_test.merge(user_data_obs, on='userId', how='left')

    return user_base_test

In [19]:
user_data_test = feature_engineering_test(test)
# fill missing values with 0
user_data_test.fillna(0, inplace=True)
user_data_test.head(10)

Unnamed: 0,userId,total_events,songs_played,roll_advert,downgrade,thumbs_down,upgrade,add_playlist,add_friend,thumbs_up
0,1465194,426.0,351.0,7.0,3.0,3.0,2.0,10.0,8.0,19.0
1,1261737,166871.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1527155,351.0,301.0,0.0,2.0,7.0,0.0,5.0,5.0,13.0
3,1507202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1429412,2288.0,1923.0,0.0,17.0,17.0,0.0,53.0,42.0,108.0
5,1778785,1016.0,857.0,1.0,10.0,9.0,0.0,24.0,29.0,27.0
6,1776591,407.0,341.0,1.0,4.0,6.0,0.0,8.0,10.0,19.0
7,1937373,2345.0,1983.0,2.0,23.0,17.0,0.0,45.0,41.0,89.0
8,1959334,62.0,51.0,0.0,3.0,0.0,0.0,2.0,0.0,3.0
9,1138878,23.0,21.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [20]:
# select features 
features = [
    'total_events',
    'songs_played',
    'roll_advert',
    'downgrade',
    'thumbs_down',
    'upgrade',
    'add_playlist',
    'add_friend',
    'thumbs_up',
]
X = user_data_test[features]

# predict in two cases: churn (0) if no data available for user
missing_mask = X.isna().any(axis=1)

# initialize predictions array: churn prediction is zero
y_pred = np.zeros(X.shape[0], dtype=int)
# predict for users with data

# users with data
X_valid = X[~missing_mask]
y_pred_valid = best_model.predict(X_valid)

# write predictions in at correct positions
y_pred[~missing_mask] = y_pred_valid


# create dataframe with userId and predictions
predictions_df = pd.DataFrame({
    'id': user_data_test['userId'],
    'target': y_pred
})

predictions_df.head(10)

Unnamed: 0,id,target
0,1465194,1
1,1261737,0
2,1527155,1
3,1507202,0
4,1429412,1
5,1778785,1
6,1776591,1
7,1937373,1
8,1959334,0
9,1138878,0


In [21]:
X.shape, y_pred.shape

((2904, 9), (2904,))

In [22]:
# export as csv
predictions_df.to_csv("churn_predictions.csv", index=False)

In [23]:
predictions_df.shape

(2904, 2)

In [24]:
# count values of predictions_df target
predictions_df['target'].mean()

0.41735537190082644