In [10]:
# similute data
#decide on features and labels
#decide train test split
#figure out which model to use
#test on new new data

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# 1. Simulate UFC 5 player data
np.random.seed(42)
n_samples = 500

data = pd.DataFrame({
    'player_id': range(n_samples),
    'total_matches': np.random.randint(5, 200, n_samples),
    'win_rate': np.round(np.random.uniform(0.1, 1.0, n_samples), 2),
    'last_login_days_ago': np.random.randint(0, 60, n_samples),
    'rank_level': np.random.randint(1, 10, n_samples),
    'avg_fight_duration': np.round(np.random.uniform(1.5, 15.0, n_samples), 2),
    'churn': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])  # 0 = retained, 1 = churned
})

# 2. Features and labels
features = ['total_matches', 'win_rate', 'last_login_days_ago', 'rank_level', 'avg_fight_duration']
X = data[features]
y = data['churn']

# 3. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train)
print(y_train)
print(X_test)
print(y_test)


#we want to figure out the unknown uknowns of what will make the user churn using the model
# When I say "Your model is biased towards predicting non-churners," I mean:

# The model is more likely to predict a player as a non-churner (class 0) than as a churner (class 1).

# It predicts the non-churner class more accurately and confidently.

# Because of that, it misses a lot of actual churners (low recall for churners) and ends up with many false negatives (players who actually churned but the model said "no churn").

# So yes, the model tends to say "the player won’t churn" more often, which makes it good at identifying non-churners but bad at catching churners.

# If you want to focus on performance across all classes evenly (like in imbalance cases), look at macro avg.
# If you want a sense of overall performance considering class distribution, look at weighted avg.

# When Macro Average is better:
# You want to treat all classes equally, regardless of how many samples each class has.

# You care about how well the model performs on the minority class (e.g., predicting churners accurately, even if they’re fewer).

# You want to ensure your model is not just good at the majority class, but fair across all classes.

# Use case: If your main goal is to detect churners (the minority class) well and avoid ignoring them, focus on macro avg metrics.

# When Weighted Average is better:
# You want an overall measure of model performance on your dataset, reflecting the real class distribution.

# You care about the majority class accuracy because it represents most of the data.

# Your dataset is heavily imbalanced and you want the metric to reflect that.

# Use case: If your business needs a good overall performance and the majority class is very important, weighted avg is more representative.

# Summary
# Criterion	Macro Avg	Weighted Avg
# Treats classes equally	✅	❌ (weights by support)
# Sensitive to minority class	✅	Less sensitive
# Reflects dataset distribution	❌	✅
# Good for imbalanced data	✅	Depends on business priority

# Practical advice:
# Check both!
# Look at macro avg to understand fairness across classes and weighted avg for overall accuracy.

# For churn prediction, because the churn class is often small but very important, macro avg is usually more meaningful.

     total_matches  win_rate  last_login_days_ago  rank_level  \
249            199      0.51                   28           2   
433            184      0.13                   43           7   
19               6      0.55                   20           1   
322            149      0.77                   15           3   
332            113      0.31                   15           4   
..             ...       ...                  ...         ...   
106            140      0.84                    7           6   
270             94      0.35                   57           8   
348             39      0.88                   15           1   
435             74      0.79                   15           3   
102             75      0.19                   50           6   

     avg_fight_duration  
249                6.73  
433                2.17  
19                 4.68  
322                7.08  
332                3.67  
..                  ...  
106                5.28  
270        

In [7]:
# 4. Decision Tree Classifier
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [8]:
# 5. Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

#classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.64
Confusion Matrix:
[[61  5]
 [31  3]]
              precision    recall  f1-score   support

           0       0.66      0.92      0.77        66
           1       0.38      0.09      0.14        34

    accuracy                           0.64       100
   macro avg       0.52      0.51      0.46       100
weighted avg       0.57      0.64      0.56       100



In [9]:
# 6. Predict churn for current players
new_players = pd.DataFrame({
    'total_matches': [80, 15],
    'win_rate': [0.75, 0.25],
    'last_login_days_ago': [3, 27],
    'rank_level': [7, 2],
    'avg_fight_duration': [6.2, 9.0]
})
predictions = clf.predict(new_players)
print("\nPredicted churn (1 = likely to leave):", predictions)


Predicted churn (1 = likely to leave): [0 0]
