In [2]:
import numpy as np 
import pandas as pd 
#importing required libs

In [None]:
np.random.seed() 
#set random seed


In [4]:
n = 5000 
#creating 5000 fake players

In [5]:
user_ids = np.arange(1, n+1) 
#creating player ID 1 - 5000

In [6]:
sessions_last_7d = np.random.poisson(lam=3, size=n)
#simulates how many sessions each user played in the last week
#poisson is a distribution that gives a probability of something happening.

In [7]:
avg_bet_size = np.random.gamma(shape=2, scale=20, size =n) 
#simulates avg bet size using gamma distribution
#gamma distribution is used to model how long it takes for something to happen

In [8]:
countries = np.random.choice(["AU", "NZ", "CA","UK","BR"], size=n) 
#randomly assigns each player to a country|

In [9]:
device = np.random.choice(["iOS","Android","Desktop"],size=n) 
#assigns device type

In [10]:
churn_prob = 1 / (1 + np.exp(-( -1 +
                               0.3 * (sessions_last_7d < 2) + 
                               0.01 * avg_bet_size)))
#probability of churn based on behavioural rules. people with <2 sessions are more likely to churn, people with high bet sizes are less likely to churn
#logisitic function to convert to 0-1 probabilities (like a real churn model)

In [11]:
churned = np.random.binomial(1,churn_prob)
#turns probabilities into 0/1 outcomes

In [12]:
df = pd.DataFrame({"user_id": user_ids,
                   "sessions_last_7d": sessions_last_7d,
                   "avg_bet_size": avg_bet_size,
                   "country": countries,
                   "device": device,
                   "churned": churned
                   })

In [13]:
df.head()

Unnamed: 0,user_id,sessions_last_7d,avg_bet_size,country,device,churned
0,1,2,7.706689,NZ,Android,0
1,2,5,34.583118,UK,iOS,0
2,3,2,42.84973,UK,iOS,0
3,4,4,21.621683,UK,Android,0
4,5,2,59.904635,AU,Android,0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#imports tools required

In [15]:
X = df.drop("churned", axis=1)
y = df["churned"]
#features (X), labels (y)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#splits 80% training, 20% testing

In [17]:
categorical = ["country", "device"]
numeric = ["sessions_last_7d", "avg_bet_size"]
#columns to be processed

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric)
    ]
)
#hot encodes categorical data

In [26]:
model = Pipeline(steps=[
    ("prep", preprocessor),
    ("lr", LogisticRegression(max_iter=1000))
])
#creating a model that preprocessing data, and trains logisitic regression all into one object

In [27]:
model.fit(X_train, y_train)
#practicing enough with entire df so model knows what works

0,1,2
,steps,"[('prep', ...), ('lr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [28]:
from sklearn.metrics import classification_report

preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.64      0.96      0.77       636
           1       0.49      0.07      0.12       364

    accuracy                           0.64      1000
   macro avg       0.57      0.51      0.45      1000
weighted avg       0.59      0.64      0.53      1000



In [29]:
#model is very bad at identifying loyal customers - recall is 0.07, meaning 7% churners were caught
#dataset is possibly imbalanced and maybe use different algo that is resistent to imbalance

In [30]:
model = Pipeline(steps=[
    ("prep", preprocessor),
    ("lr", LogisticRegression(max_iter=1000, class_weight='balanced'))
])

In [31]:
model.fit(X_train, y_train)

preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.69      0.67      0.68       636
           1       0.45      0.48      0.47       364

    accuracy                           0.60      1000
   macro avg       0.57      0.58      0.57      1000
weighted avg       0.61      0.60      0.60      1000



In [None]:
#1 recall significantly better but still not good enough, now it catches 48% of churners, lets try a different model.

In [32]:
from sklearn.ensemble import RandomForestClassifier

rf_model = Pipeline(steps=[
    ("prep", preprocessor),
    ("rf", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        class_weight="balanced",  # handles churn imbalance
        random_state=42
    ))
])

# --------------------------
# 5. TRAIN/TEST SPLIT
# --------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# 6. TRAIN
# --------------------------
rf_model.fit(X_train, y_train)

# --------------------------
# 7. EVALUATE
# --------------------------
y_pred = rf_model.predict(X_test)

print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred))



CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.64      0.68      0.66       636
           1       0.38      0.34      0.36       364

    accuracy                           0.56      1000
   macro avg       0.51      0.51      0.51      1000
weighted avg       0.55      0.56      0.55      1000

CONFUSION MATRIX:
[[435 201]
 [242 122]]


In [33]:
#34% of churners caught - not as good as the BALANCED logisitic regression

In [36]:
from sklearn.ensemble import GradientBoostingClassifier

# --------------------------
# SAMPLE DATA (example)
# --------------------------
df = pd.DataFrame({
    "age": np.random.randint(18, 70, 1000),
    "region": np.random.choice(["VIC", "NSW", "QLD", "WA"], 1000),
    "tenure_months": np.random.randint(1, 48, 1000),
    "sessions_last_7d": np.random.poisson(lam=3, size=1000),
    "churn": np.random.choice([0, 1], 1000, p=[0.7, 0.3])  # imbalanced
})

# --------------------------
# SPLIT FEATURES / TARGET
# --------------------------
X = df.drop("churn", axis=1)
y = df["churn"]

# --------------------------
# PREPROCESSING
# --------------------------
numeric_features = ["age", "tenure_months", "sessions_last_7d"]
categorical_features = ["region"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# --------------------------
# GRADIENT BOOSTING PIPELINE
# --------------------------
gb_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("gb", GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.9,       # adds stochasticity, prevents overfitting
        random_state=42
    ))
])

# --------------------------
# TRAIN/TEST SPLIT
# --------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# TRAIN
# --------------------------
gb_model.fit(X_train, y_train)

# --------------------------
# EVALUATE
# --------------------------
y_pred = gb_model.predict(X_test)

print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred))



CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.70      0.89      0.78       139
           1       0.35      0.13      0.19        61

    accuracy                           0.66       200
   macro avg       0.52      0.51      0.49       200
weighted avg       0.59      0.66      0.60       200

CONFUSION MATRIX:
[[124  15]
 [ 53   8]]
