In [53]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv("Assignment_data.csv")

# Convert Unix Epoch Time to standard datetime
data['time'] = pd.to_datetime(data['time'], unit='s')

# Sort data by device and time
data = data.sort_values(by=['device', 'time'])

# Create a session identifier by grouping consecutive gameplays
data['session_id'] = (data['device'] != data['device'].shift()) | (data['time'].diff() > pd.Timedelta(hours=1))
data['session_id'] = data['session_id'].cumsum()

# Calculate Average Score per Session
session_scores = data.groupby(['device', 'session_id'])['score'].sum().reset_index()
avg_score_per_session = session_scores.groupby('device')['score'].mean().rename('avg_score_per_session')

# Calculate Total Sessions per Player
total_sessions = session_scores.groupby('device').size().rename('total_sessions')

# Calculate Session Frequency (average time between sessions)
session_times = data.groupby(['device', 'session_id'])['time'].max().reset_index()
session_times['time_diff'] = session_times.groupby('device')['time'].diff().dt.total_seconds()
session_frequency = session_times.groupby('device')['time_diff'].mean().rename('session_frequency')

# Calculate Max Score in a Session
max_score_per_session = session_scores.groupby('device')['score'].max().rename('max_score_per_session')

# Calculate Recency of Last Session
last_session_time = session_times.groupby('device')['time'].max()
data_end_time = data['time'].max()
recency_of_last_session = (data_end_time - last_session_time).dt.days.rename('recency_of_last_session')

# Combine all new features into a single dataframe
new_features = pd.concat([
    avg_score_per_session,
    total_sessions,
    session_frequency,
    max_score_per_session,
    recency_of_last_session
], axis=1).reset_index()

# Merge new features back into the original data
data = data.merge(new_features, on='device', how='left')

# Drop intermediate columns that are no longer needed
data = data.drop(columns=['session_id'], errors='ignore')

# Create the churn feature
churn_period = pd.Timedelta(days=14)
data['last_activity'] = data.groupby('device')['time'].transform('max')
data['churn'] = (data['last_activity'] + churn_period < data['time'].max()).astype(int)


### Handling missing data

In [54]:

# Check missing values before imputation
print("Missing values before imputation:")
print(data[['avg_score_per_session', 'total_sessions', 'session_frequency', 
           'max_score_per_session', 'recency_of_last_session']].isnull().sum())

# Handle missing values
imputer = SimpleImputer(strategy='mean')
data[['avg_score_per_session', 'total_sessions', 'session_frequency', 
      'max_score_per_session', 'recency_of_last_session']] = imputer.fit_transform(
    data[['avg_score_per_session', 'total_sessions', 'session_frequency', 
          'max_score_per_session', 'recency_of_last_session']]
)

# Check missing values after imputation
print("\nMissing values after imputation:")
print(data[['avg_score_per_session', 'total_sessions', 'session_frequency', 
           'max_score_per_session', 'recency_of_last_session']].isnull().sum())


Missing values before imputation:
avg_score_per_session          9
total_sessions                 9
session_frequency          39425
max_score_per_session          9
recency_of_last_session        9
dtype: int64

Missing values after imputation:
avg_score_per_session      0
total_sessions             0
session_frequency          0
max_score_per_session      0
recency_of_last_session    0
dtype: int64


### Train/test split

In [55]:

# Prepare features and target variable
features = ['avg_score_per_session', 'total_sessions', 'session_frequency', 
            'max_score_per_session', 'recency_of_last_session']
X = data[features]
y = data['churn']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


## SMOTE

In [56]:

# Print percentage of churners in the training set before SMOTE
churners_before = y_train.sum() / len(y_train) * 100
print(f"Percentage of churners before SMOTE: {churners_before:.2f}%")

# Address class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print percentage of churners in the training set after SMOTE
churners_after = y_train_resampled.sum() / len(y_train_resampled) * 100
print(f"Percentage of churners after SMOTE: {churners_after:.2f}%")


Percentage of churners before SMOTE: 97.26%
Percentage of churners after SMOTE: 50.00%


## Logistic Regression

In [57]:

# Train a logistic regression model with 5-fold cross-validation
model = LogisticRegression(random_state=42, max_iter=1000)

# Perform Stratified 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=skf, scoring='accuracy')

# Print Cross-Validation Results
print("\nCross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

# Train the model on the full resampled training data
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model on the test set
print("\nAccuracy Score on Test Set:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix on Test Set:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report on Test Set:\n", classification_report(y_test, y_pred))



Cross-Validation Accuracy Scores: [0.98370628 0.98499416 0.98430231 0.985519   0.98270391]
Mean Cross-Validation Accuracy: 0.9842451334480821

Accuracy Score on Test Set: 0.9734078260681263

Confusion Matrix on Test Set:
 [[ 1265     3]
 [ 1225 43686]]

Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.51      1.00      0.67      1268
           1       1.00      0.97      0.99     44911

    accuracy                           0.97     46179
   macro avg       0.75      0.99      0.83     46179
weighted avg       0.99      0.97      0.98     46179



## Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random Forest: Define parameter grid
rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Random Forest: Initialize model and grid search
rf_model = RandomForestClassifier(random_state=42)
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1
)

# Fit Random Forest with grid search
rf_grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and accuracy for Random Forest
rf_best_params = rf_grid_search.best_params_
rf_best_score = rf_grid_search.best_score_
print("Random Forest Best Parameters:", rf_best_params)
print("Random Forest Best CV Accuracy:", rf_best_score)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Random Forest Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Best CV Accuracy: 1.0


## XGBoost

In [60]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# XGBoost: Define parameter grid
xgb_params = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [2, 3, 4, 6],
    # "subsample": [0.8, 1.0],
    # "colsample_bytree": [0.8, 1.0]
}

# XGBoost: Initialize model and grid search
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_params,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1
)

# Fit XGBoost with grid search
xgb_grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and accuracy for XGBoost
xgb_best_params = xgb_grid_search.best_params_
xgb_best_score = xgb_grid_search.best_score_
print("XGBoost Best Parameters:", xgb_best_params)
print("XGBoost Best CV Accuracy:", xgb_best_score)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



KeyboardInterrupt: 