In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

def import_and_transform(data):
    """Import and basic preprocessing only."""
    if isinstance(data, str):
        df = pd.read_parquet(data)
    else:
        df = data
    
    df = df[df['userId'] != '']
    df['userId'] = df['userId'].astype(int)
    df["gender"] = df["gender"].map({'F': 0, 'M': 1})
    df["level"] = df["level"].map({'free': 0, 'paid': 1})
    df['ts'] = pd.to_datetime(df['ts'], unit='ms')
    df['registration'] = pd.to_datetime(df['registration'])
    
    df['session_length'] = df.groupby(['userId', 'sessionId'])['ts'].transform(
        lambda x: (x.max() - x.min()).total_seconds()
    )
    df['song_played'] = (df['page'] == 'NextSong').astype(int)
    
    return df


def aggregate_features(data, observation_end):
    """Calculate features using only data up to observation_end."""
    observation_end = pd.Timestamp(observation_end)
    
    user_df = data.groupby('userId').agg({
        'gender': 'first',
        'registration': 'first',
        'level': lambda x: x.mode().iloc[0] if not x.mode().empty else 0,
        'sessionId': 'nunique',
        'itemInSession': 'max',
        'ts': ['min', 'max'],
        'session_length': 'mean',
        'song_played': 'sum',
        'artist': pd.Series.nunique,
        'length': 'sum'
    }).reset_index()
    
    user_df.columns = ['userId', 'gender', 'registration', 'level',
                       'num_sessions', 'max_item_in_session', 'ts_min', 'ts_max', 
                       'avg_session_length_seconds', 'num_songs_played', 
                       'unique_artists', 'total_length']
    
    user_df['days_active'] = (observation_end - user_df['ts_min']).dt.days
    user_df['membership_length'] = (observation_end - user_df['registration']).dt.days
    user_df['days_since_last_activity'] = (observation_end - user_df['ts_max']).dt.days
    user_df['songs_per_day'] = user_df['num_songs_played'] / (user_df['days_active'] + 1)
    user_df['sessions_per_day'] = user_df['num_sessions'] / (user_df['days_active'] + 1)
    
    user_df = user_df.fillna(0)
    user_df.set_index('userId', inplace=True)
    
    return user_df


def get_churned_users(df, start_date, end_date):
    """Get users who churned between start_date and end_date."""
    start = pd.Timestamp(start_date)
    end = pd.Timestamp(end_date)
    
    cancellations = df[df['page'] == 'Cancellation Confirmation']
    churned = cancellations[(cancellations['ts'] > start) & 
                           (cancellations['ts'] <= end)]['userId'].unique()
    return set(churned)


# Load training data
df_train = import_and_transform('Data/train.parquet')

# Prepare test data
df_test = import_and_transform('Data/test.parquet')



Obs date: 2018-10-15, Users: 16271, Churn rate: 5.08%
Obs date: 2018-10-20, Users: 17347, Churn rate: 4.48%
Obs date: 2018-10-25, Users: 17888, Churn rate: 4.49%
Obs date: 2018-10-30, Users: 18271, Churn rate: 4.46%
Obs date: 2018-11-04, Users: 18592, Churn rate: 3.78%


In [33]:
# Create multiple training samples with sliding window
training_dates = pd.date_range('2018-10-15', '2018-11-05', freq='5D')

X_train_list = []
y_train_list = []

for obs_date in training_dates:
    # Features from data up to obs_date
    df_obs = df_train[df_train['ts'] <= obs_date]
    features = aggregate_features(df_obs, obs_date)
    
    # Labels from next 10 days
    window_end = obs_date + pd.Timedelta(days=10)
    churned_users = get_churned_users(df_train, obs_date, window_end)
    
    # Convert to Series with same index as features
    labels = pd.Series(
        features.index.isin(churned_users).astype(int),
        index=features.index,
        name='churned'
    )
    
    X_train_list.append(features)
    y_train_list.append(labels)
    
    print(f"Obs date: {obs_date.date()}, Users: {len(features)}, Churn rate: {labels.mean():.2%}")

# Combine all training samples
X_train_combined = pd.concat(X_train_list)
y_train_combined = pd.concat(y_train_list)

# Drop non-feature columns
feature_cols = X_train_combined.select_dtypes(include=[np.number]).columns
feature_cols = [c for c in feature_cols if c not in ['registration', 'ts_min', 'ts_max', 'total_length']]
X_train_final = X_train_combined[feature_cols]

test_features = aggregate_features(df_test, '2018-11-20')
X_test = test_features[feature_cols]


Obs date: 2018-10-15, Users: 16271, Churn rate: 5.08%
Obs date: 2018-10-20, Users: 17347, Churn rate: 4.48%
Obs date: 2018-10-25, Users: 17888, Churn rate: 4.49%
Obs date: 2018-10-30, Users: 18271, Churn rate: 4.46%
Obs date: 2018-11-04, Users: 18592, Churn rate: 3.78%


In [54]:

# Train model
model = RandomForestClassifier(n_estimators=100, max_depth=13, random_state=42, class_weight='balanced')
model.fit(X_train_final, y_train_combined)

print(f"\nTotal training samples: {len(X_train_final)}")
print(f"Overall churn rate: {y_train_combined.mean():.2%}")


Total training samples: 88369
Overall churn rate: 4.44%


In [57]:
from utils import evaluate_model

evaluate_model(model, X_test, p=0.4, file_out='frog2.csv')

Base predicted churn: 8.33%
Predicted churn at 0.4 threshold: 28.17%
Submission saved to frog2.csv
