In [2]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Plot styling: seaborn's dark-grid theme together with the "deep" colour palette
import seaborn as sns
sns.set_theme(style="darkgrid", palette="deep")

# Open the SQLite file that holds the raw tables
conn = sqlite3.connect('papcorns.sqlite')

# Read both tables in full; created_at is parsed into datetimes as part of
# the same expression (assign keeps the column in its original position)
users_df = pd.read_sql_query("SELECT * FROM users;", conn).assign(
    created_at=lambda table: pd.to_datetime(table['created_at'])
)
events_df = pd.read_sql_query("SELECT * FROM user_events;", conn).assign(
    created_at=lambda table: pd.to_datetime(table['created_at'])
)

# Feature extraction per user
# One output row per user_id; every column is a named aggregation over that
# user's rows in events_df.
tracked_events = (
    'trial_started',
    'trial_cancelled',
    'subscription_started',
    'subscription_renewed',
    'subscription_cancelled',
)

aggregations = {
    # Sum of amount_usd over the user's events
    'total_amount_paid': ('amount_usd', 'sum'),
    # Whole days between the user's first and last event, plus one
    'days_active': ('created_at', lambda stamps: (stamps.max() - stamps.min()).days + 1),
    # Number of non-null event names
    'total_events': ('event_name', 'count'),
    # One "<event>_count" column per tracked event name. `wanted=event_type`
    # binds the name at definition time so each lambda counts its own event.
    **{
        f'{event_type}_count': (
            'event_name',
            lambda names, wanted=event_type: (names == wanted).sum(),
        )
        for event_type in tracked_events
    },
}

events_by_user = events_df.groupby('user_id')
user_features = events_by_user.agg(**aggregations).reset_index()

# Attach each user's categorical attributes from the users table, then
# expand those attributes into one-hot indicator columns.
categorical_columns = ['country', 'attribution_source']

user_features = pd.merge(
    user_features,
    users_df[['id', *categorical_columns]],
    how='inner',  # pandas' default join type, spelled out
    left_on='user_id',
    right_on='id',
)
user_features = pd.get_dummies(user_features, columns=categorical_columns)

# Prepare model data
# Predictors are every column except the two identifier columns and the target.
target_column = 'total_amount_paid'
y = user_features[target_column]
X = user_features.drop(['user_id', 'id', target_column], axis=1)

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
# (X_test / y_test are not used further in this cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit a linear regression on the training portion
model = LinearRegression().fit(X_train, y_train)

# --- Prediction for Bruce Wayne ---
# Rebuild, for one user, the same feature columns the model was fitted on,
# then score that single row.

TARGET_USER_ID = 1001  # Bruce Wayne

# Get Bruce Wayne's user row
user_1001 = users_df[users_df['id'] == TARGET_USER_ID]
if user_1001.empty:
    # `.values[0]` below would otherwise fail with an opaque IndexError.
    raise ValueError(f"User #{TARGET_USER_ID} not found in users_df")
user_1001_events = events_df[events_df['user_id'] == TARGET_USER_ID]

# Feature extraction
# For a user without any dated events max() - min() is NaT; treat that as
# zero active days instead of letting a missing value reach model.predict().
event_span = user_1001_events['created_at'].max() - user_1001_events['created_at'].min()
days_active = 0 if pd.isna(event_span) else event_span.days + 1

event_names = user_1001_events['event_name']
user_1001_features = {
    'days_active': days_active,
    # Regression target, kept for reference only: it is not a column of
    # X_train, so the reindex below removes it before prediction.
    'total_amount_paid': user_1001_events['amount_usd'].sum(),
    'country': user_1001['country'].values[0],
    'attribution_source': user_1001['attribution_source'].values[0],
    # Count non-null event names, exactly like the ('event_name', 'count')
    # aggregation behind the training feature; shape[0] would also count
    # rows whose event_name is missing.
    'total_events': event_names.count(),
    'trial_started_count': (event_names == 'trial_started').sum(),
    'trial_cancelled_count': (event_names == 'trial_cancelled').sum(),
    'subscription_started_count': (event_names == 'subscription_started').sum(),
    'subscription_renewed_count': (event_names == 'subscription_renewed').sum(),
    'subscription_cancelled_count': (event_names == 'subscription_cancelled').sum()
}

# Convert to DataFrame and encode
user_1001_df = pd.DataFrame([user_1001_features])
user_1001_encoded = pd.get_dummies(user_1001_df, columns=['country', 'attribution_source'])

# Align with training features
# Columns the model expects but this row lacks (other countries / sources)
# are filled with 0; columns the model was not trained on are discarded.
user_1001_encoded = user_1001_encoded.reindex(columns=X_train.columns, fill_value=0)

# Predict pLTV
pltv_prediction = model.predict(user_1001_encoded)
print(f"Predicted pLTV for Bruce Wayne (User #{TARGET_USER_ID}): ${pltv_prediction[0]:.2f}")


Predicted pLTV for Bruce Wayne (User #1001): $10.81
