In [3]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Configure seaborn in a single call: dark grid background, 'deep' color palette
sns.set_theme(style="darkgrid", palette='deep')

# Load the raw tables from the SQLite database.
# The connection is only needed while the two queries run, so it is closed in
# a `finally` clause even when a query fails. (Using the connection as a
# context manager would only commit/roll back; it would not close it.)
conn = sqlite3.connect('papcorns.sqlite')
try:
    # Load users table
    users_df = pd.read_sql_query("SELECT * FROM users;", conn)

    # Load events table
    events_df = pd.read_sql_query("SELECT * FROM user_events;", conn)
finally:
    conn.close()


# Parse the 'created_at' column of both tables into pandas datetimes
for frame in (events_df, users_df):
    frame['created_at'] = pd.to_datetime(frame['created_at'])

# Create the churn tag per event: 1 when the event is a cancellation
# ('trial_cancelled' or 'subscription_cancelled'), 0 for every other event
cancellation_events = ['trial_cancelled', 'subscription_cancelled']
is_cancellation = events_df['event_name'].isin(cancellation_events)
events_df['churn'] = np.where(is_cancellation, 1, 0)


# Build one row of features per user from the event log
events_by_user = events_df.groupby('user_id')
event_window = events_by_user['created_at'].agg(['min', 'max'])
user_features = pd.DataFrame({
    # 1 if the user has at least one cancellation event, else 0
    'churn': events_by_user['churn'].max(),
    # Whole days between the user's first and last recorded event.
    # NOTE(review): the last event can be the cancellation itself, so this
    # feature may leak the churn label into the model - confirm intent.
    'days_to_subscribe': (event_window['max'] - event_window['min']).dt.days,
    # Sum of 'amount_usd' over all of the user's events
    'total_amount_paid': events_by_user['amount_usd'].sum(),
}).reset_index()

# Attach each user's country and attribution source from the users table
# (default inner join: users without a matching 'id' are dropped)
user_attributes = users_df[['id', 'country', 'attribution_source']]
user_features = user_features.merge(
    user_attributes,
    left_on='user_id',
    right_on='id',
)

# One-hot encode the categorical columns
categorical_features = ['country', 'attribution_source']
user_features = pd.get_dummies(user_features, columns=categorical_features)

# Assemble the design matrix: every column except the label and the two
# identifier columns; the label is the per-user churn flag
non_feature_columns = {'churn', 'user_id', 'id'}
X = user_features[[col for col in user_features.columns if col not in non_feature_columns]]
y = user_features['churn']

# Hold out 30% of the users for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic regression with scikit-learn's default settings on the training split
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out users
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report accuracy (two decimals), then the confusion matrix
print(f"Accuracy: {accuracy:.2f}", "Confusion Matrix:", conf_matrix, sep="\n")

# Score a single user (Clark Kent) with the trained model.
# NOTE: this user is not excluded from the train/test split above, so the
# model may already have seen this row during training.
target_user_id = 1002

# Getting Clark Kent's profile row and event history
user_1002 = users_df[users_df['id'] == target_user_id]
user_1002_events = events_df[events_df['user_id'] == target_user_id]

# Fail early with a clear message instead of an opaque IndexError from
# `.values[0]` (missing user) or a NaN feature reaching the model (no events)
if user_1002.empty:
    raise ValueError(f"User #{target_user_id} not found in the users table")
if user_1002_events.empty:
    raise ValueError(f"User #{target_user_id} has no events; cannot build features")

# Rebuild the same features that were computed for the training data
event_times = user_1002_events['created_at']
user_1002_features = {
    'days_to_subscribe': (event_times.max() - event_times.min()).days,
    'total_amount_paid': user_1002_events['amount_usd'].sum(),
    'country': user_1002['country'].values[0],
    'attribution_source': user_1002['attribution_source'].values[0]
}
user_1002_features_df = pd.DataFrame([user_1002_features])

# Get the column names of the training dataset with one-hot encoding applied
encoded_columns = X_train.columns

# One-hot encode the single row; this only produces the dummy columns for
# this user's own country and attribution source
user_1002_encoded = pd.get_dummies(user_1002_features_df, columns=['country', 'attribution_source'])

# Align with the training columns: dummy columns the model expects but this
# row lacks are added as 0, columns unknown to the model are dropped, and the
# column order matches the training data
user_1002_encoded = user_1002_encoded.reindex(columns=encoded_columns, fill_value=0)

# Predicting: column 1 of predict_proba is the probability of class 1 (churn)
churn_probability = model.predict_proba(user_1002_encoded)[:, 1]

print(f"User #{target_user_id} (Clark Kent) churn probability: {churn_probability[0]:.2f}")





Accuracy: 0.99
Confusion Matrix:
[[127   2]
 [  1 171]]
User #1002 (Clark Kent) churn probability: 0.01
