In [10]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.preprocessing import StandardScaler

def prepare_survival_data(data: pd.DataFrame, observation_end=None, ts_unit=None):
    """
    Prepare data for survival analysis.

    Collapses an event-level log into one row per user carrying a survival
    duration, a churn event indicator and engagement features.

    Args:
        data: Event-level DataFrame. The columns read are 'userId',
            'sessionId', 'ts', 'registration', 'page', 'gender', 'level',
            'itemInSession', 'artist' and 'length'. The caller's frame is
            never modified.
        observation_end: Reference date for the temporal features (anything
            `pd.Timestamp` accepts). Defaults to the latest parsed 'ts' in
            `data`. Events are not filtered by this date.
        ts_unit: Optional unit (e.g. 'ms' or 's') forwarded to
            `pd.to_datetime` when parsing 'ts' and 'registration'. Leave as
            None for strings or datetimes. pandas reads bare numbers as
            nanoseconds, so numeric epoch columns need the matching unit.
            NOTE(review): confirm the unit of the stored timestamps.

    Returns:
        DataFrame ready for survival modeling: one row per userId, with
        'duration' (days, at least 1) and 'event' (1 = churned).
    """
    # Build the derived columns on a new frame; plain column assignment here
    # would write the parsed values back into the caller's DataFrame.
    data = data.assign(
        ts=pd.to_datetime(data['ts'], unit=ts_unit),
        registration=pd.to_datetime(data['registration'], unit=ts_unit),
        song_played=(data['page'] == 'NextSong').astype(int),
    )

    # Resolve the reference date only after parsing, so the default is the
    # chronological maximum rather than the maximum of the raw values.
    if observation_end is None:
        observation_end = data['ts'].max()
    else:
        observation_end = pd.Timestamp(observation_end)

    # Session length in seconds, repeated on every event row of the session.
    # The sort also fixes which row the 'first' aggregations below pick up.
    data = data.sort_values(['userId', 'sessionId', 'ts'])
    session_ts = data.groupby(['userId', 'sessionId'])['ts']
    data['session_length'] = (
        session_ts.transform('max') - session_ts.transform('min')
    ).dt.total_seconds()

    # A user counts as churned if they ever reached the cancellation page.
    churned_users = set(
        data.loc[data['page'] == 'Cancellation Confirmation', 'userId'].unique()
    )
    data['churned'] = data['userId'].isin(churned_users).astype(int)

    def _most_common(values):
        """Return the most frequent value, or None when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    # Named aggregation ties each output column to its source and statistic,
    # instead of relying on the flattening order of a MultiIndex.
    # The session_length statistics are taken over event rows, so sessions
    # with more events weigh more.
    user_df = data.groupby('userId').agg(
        gender=('gender', 'first'),
        registration=('registration', 'first'),
        level=('level', _most_common),
        num_sessions=('sessionId', 'nunique'),
        max_item_in_session=('itemInSession', 'max'),
        avg_item_in_session=('itemInSession', 'mean'),
        ts_min=('ts', 'min'),
        ts_max=('ts', 'max'),
        avg_session_length_seconds=('session_length', 'mean'),
        std_session_length=('session_length', 'std'),
        max_session_length=('session_length', 'max'),
        num_songs_played=('song_played', 'sum'),
        unique_artists=('artist', 'nunique'),
        total_length=('length', 'sum'),
        avg_song_length=('length', 'mean'),
        churned=('churned', 'max'),
    ).reset_index()

    # Encode categoricals; values outside the maps (including missing) become 0.
    user_df['gender'] = user_df['gender'].map({'M': 1, 'F': 0}).fillna(0).astype(int)
    user_df['level'] = user_df['level'].map({'paid': 1, 'free': 0}).fillna(0).astype(int)

    # Duration: whole days from registration to the last observed event,
    # floored at 1.
    user_df['duration'] = (user_df['ts_max'] - user_df['registration']).dt.days
    user_df['duration'] = user_df['duration'].clip(lower=1)

    # Event indicator
    user_df['event'] = user_df['churned']

    # Temporal features, all measured back from observation_end
    user_df['days_active'] = (observation_end - user_df['ts_min']).dt.days
    user_df['membership_length'] = (observation_end - user_df['registration']).dt.days
    user_df['days_since_last_activity'] = (observation_end - user_df['ts_max']).dt.days

    # Engagement features (the +1 keeps every denominator non-zero)
    active_days = user_df['days_active'] + 1
    user_df['songs_per_day'] = user_df['num_songs_played'] / active_days
    user_df['sessions_per_day'] = user_df['num_sessions'] / active_days
    user_df['songs_per_session'] = user_df['num_songs_played'] / (user_df['num_sessions'] + 1)

    # Diversity and behavior metrics
    user_df['artist_diversity_ratio'] = user_df['unique_artists'] / (user_df['num_songs_played'] + 1)
    user_df['avg_listening_time_per_day'] = user_df['total_length'] / active_days
    user_df['inactivity_ratio'] = user_df['days_since_last_activity'] / (user_df['membership_length'] + 1)
    user_df['session_length_cv'] = user_df['std_session_length'] / (user_df['avg_session_length_seconds'] + 1)

    # Remaining gaps (e.g. the std of a single-row user) become 0.
    user_df = user_df.fillna(0)

    print(f"Processed {len(user_df)} users")
    print(f"Churn rate: {user_df['churned'].mean():.2%}")
    print(f"Censored rate: {(1 - user_df['event']).mean():.2%}")
    print(f"Median duration: {user_df['duration'].median()} days")

    return user_df


In [9]:
# Raw event-level training log, read from the project's Data/ directory.
train_data = pd.read_parquet(path='Data/train.parquet')


In [11]:
# Collapse the training event log into one survival row per user.
train_survival = prepare_survival_data(train_data)

# Columns handed to the Cox model. Datetime and redundant columns are left
# out; 'duration' and 'event' stay in because the fitter reads them by name.
feature_cols = [
    # user attributes
    'gender',
    'level',
    # session volume and depth
    'num_sessions',
    'max_item_in_session',
    'avg_item_in_session',
    # session length statistics
    'avg_session_length_seconds',
    'std_session_length',
    'max_session_length',
    # listening totals
    'num_songs_played',
    'unique_artists',
    'avg_song_length',
    # engagement rates
    'songs_per_day',
    'sessions_per_day',
    'songs_per_session',
    # diversity and behavior ratios
    'artist_diversity_ratio',
    'avg_listening_time_per_day',
    'inactivity_ratio',
    'session_length_cv',
    'days_since_last_activity',
    # survival target
    'duration',
    'event',
]



Processed 19140 users
Churn rate: 22.31%
Censored rate: 77.69%
Median duration: 1.0 days


In [17]:
# `feature_cols` comes from the data-preparation cell above. The identical
# list is not repeated here, so the two copies cannot drift apart.
train_cox = train_survival[feature_cols].copy()

# Additional check: remove any remaining low-variance columns.
# NOTE: the 0.01 cutoff is applied to the raw (unscaled) columns, so whether
# a feature survives depends on its units.
variance = train_cox.drop(columns=['duration', 'event']).var()
low_variance_cols = variance[variance < 0.01].index.tolist()
if low_variance_cols:
    print(f"Removing low-variance columns: {low_variance_cols}")
    train_cox = train_cox.drop(columns=low_variance_cols)

# Fit the model. CoxPHFitter is already imported in the notebook's import
# block, so it is not re-imported here.
cph = CoxPHFitter(penalizer=0.1)
cph.fit(train_cox, duration_col='duration', event_col='event',
        show_progress=True)

print("\nCox Model Summary:")
print(cph.summary)
print(f"\nConcordance Index: {cph.concordance_index_:.3f}")




Removing low-variance columns: ['avg_session_length_seconds', 'std_session_length', 'max_session_length', 'inactivity_ratio', 'session_length_cv', 'days_since_last_activity']
Iteration 1: norm_delta = 6.28e-02, step_size = 0.9500, log_lik = -41593.65349, newton_decrement = 1.50e+01, seconds_since_start = 0.0
Iteration 2: norm_delta = 3.28e-03, step_size = 0.9500, log_lik = -41578.78565, newton_decrement = 3.05e-02, seconds_since_start = 0.0
Iteration 3: norm_delta = 1.65e-04, step_size = 0.9500, log_lik = -41578.75526, newton_decrement = 7.77e-05, seconds_since_start = 0.0
Iteration 4: norm_delta = 7.32e-09, step_size = 1.0000, log_lik = -41578.75518, newton_decrement = 1.66e-13, seconds_since_start = 0.0
Convergence success after 4 iterations.

Cox Model Summary:
                                    coef  exp(coef)      se(coef)  \
covariate                                                           
gender                      2.667034e-02   1.027029  2.547269e-02   
level             

In [18]:

# For test set predictions
test_data = pd.read_parquet('Data/test.parquet')
test_survival = prepare_survival_data(test_data)

# Score exactly the covariates the model was fitted on. `feature_cols` still
# lists the low-variance columns that were dropped from the training frame,
# so selecting by it would hand the model columns it never saw.
test_cox = test_survival[list(cph.params_.index)]

# Get risk scores (higher = higher risk of churning)
risk_scores = cph.predict_partial_hazard(test_cox)

# Convert to binary predictions (top 50% = churn for balanced test set).
# NOTE(review): the 50th-percentile cutoff assumes a balanced test set; the
# training churn rate printed earlier in this notebook is much lower.
threshold = np.percentile(risk_scores, 50)
y_pred = (risk_scores >= threshold).astype(int)

print(f"\nPredicted churn rate: {y_pred.mean():.2%}")

# Create submission (rows line up through the shared default integer index)
submission = pd.DataFrame({
    'id': test_survival['userId'],
    'target': y_pred
})

submission.to_csv('survival_submission.csv', index=False)
print("Submission saved!")

Processed 2904 users
Churn rate: 0.00%
Censored rate: 100.00%
Median duration: 1.0 days

Predicted churn rate: 50.00%
Submission saved!
