In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
import warnings 
warnings.filterwarnings('ignore')

In [12]:
df = pd.read_csv('spotify_churn_dataset.csv')

In [13]:
df

Unnamed: 0,user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
0,1,Female,54,CA,Free,26,23,0.20,Desktop,31,0,1
1,2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
2,3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
3,4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
4,5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,7996,Other,44,DE,Student,237,36,0.30,Mobile,0,1,1
7996,7997,Male,34,AU,Premium,61,64,0.59,Mobile,0,1,0
7997,7998,Female,17,US,Free,81,62,0.33,Desktop,5,0,0
7998,7999,Female,34,IN,Student,245,94,0.27,Desktop,0,1,0


In [14]:
df.dtypes

user_id                    int64
gender                    object
age                        int64
country                   object
subscription_type         object
listening_time             int64
songs_played_per_day       int64
skip_rate                float64
device_type               object
ads_listened_per_week      int64
offline_listening          int64
is_churned                 int64
dtype: object

In [15]:
df.isnull().sum()

user_id                  0
gender                   0
age                      0
country                  0
subscription_type        0
listening_time           0
songs_played_per_day     0
skip_rate                0
device_type              0
ads_listened_per_week    0
offline_listening        0
is_churned               0
dtype: int64

In [16]:
# one hot encoding on gender column

In [17]:
gender_dummies = pd.get_dummies(df['gender'],prefix='gender',dtype=int) 
df = pd.concat([df, gender_dummies], axis=1) 
df.drop('gender', axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,user_id,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned,gender_Female,gender_Male,gender_Other
0,1,54,CA,Free,26,23,0.2,Desktop,31,0,1,1,0,0
1,2,33,DE,Family,141,62,0.34,Web,0,1,0,0,0,1
2,3,38,AU,Premium,199,38,0.04,Mobile,0,1,1,0,1,0
3,4,22,CA,Student,36,2,0.31,Mobile,0,1,0,1,0,0
4,5,29,US,Family,250,57,0.36,Mobile,0,1,1,0,0,1


In [19]:
df.dtypes

user_id                    int64
age                        int64
country                   object
subscription_type         object
listening_time             int64
songs_played_per_day       int64
skip_rate                float64
device_type               object
ads_listened_per_week      int64
offline_listening          int64
is_churned                 int64
gender_Female              int64
gender_Male                int64
gender_Other               int64
dtype: object

In [20]:
# now similarly applying encoding on other data of object datatype

In [23]:
df['country'].value_counts()

country
AU    1034
US    1032
DE    1015
IN    1011
PK     999
FR     989
UK     966
CA     954
Name: count, dtype: int64

In [24]:
country_dummies = pd.get_dummies(df['country'],prefix='country',dtype=int) 
df = pd.concat([df, country_dummies], axis=1) 
df.drop('country', axis=1, inplace=True)

In [25]:
df.head()

Unnamed: 0,user_id,age,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned,...,gender_Male,gender_Other,country_AU,country_CA,country_DE,country_FR,country_IN,country_PK,country_UK,country_US
0,1,54,Free,26,23,0.2,Desktop,31,0,1,...,0,0,0,1,0,0,0,0,0,0
1,2,33,Family,141,62,0.34,Web,0,1,0,...,0,1,0,0,1,0,0,0,0,0
2,3,38,Premium,199,38,0.04,Mobile,0,1,1,...,1,0,1,0,0,0,0,0,0,0
3,4,22,Student,36,2,0.31,Mobile,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,5,29,Family,250,57,0.36,Mobile,0,1,1,...,0,1,0,0,0,0,0,0,0,1


In [26]:
df.dtypes

user_id                    int64
age                        int64
subscription_type         object
listening_time             int64
songs_played_per_day       int64
skip_rate                float64
device_type               object
ads_listened_per_week      int64
offline_listening          int64
is_churned                 int64
gender_Female              int64
gender_Male                int64
gender_Other               int64
country_AU                 int64
country_CA                 int64
country_DE                 int64
country_FR                 int64
country_IN                 int64
country_PK                 int64
country_UK                 int64
country_US                 int64
dtype: object

In [27]:
subscription_type_dummies = pd.get_dummies(df['subscription_type'],prefix='subscription_type',dtype=int) 
df = pd.concat([df, subscription_type_dummies], axis=1) 
df.drop('subscription_type', axis=1, inplace=True)

In [28]:
device_type_dummies = pd.get_dummies(df['device_type'],prefix='device_type',dtype=int) 
df = pd.concat([df, device_type_dummies], axis=1) 
df.drop('device_type', axis=1, inplace=True)

In [29]:
df.dtypes

user_id                        int64
age                            int64
listening_time                 int64
songs_played_per_day           int64
skip_rate                    float64
ads_listened_per_week          int64
offline_listening              int64
is_churned                     int64
gender_Female                  int64
gender_Male                    int64
gender_Other                   int64
country_AU                     int64
country_CA                     int64
country_DE                     int64
country_FR                     int64
country_IN                     int64
country_PK                     int64
country_UK                     int64
country_US                     int64
subscription_type_Family       int64
subscription_type_Free         int64
subscription_type_Premium      int64
subscription_type_Student      int64
device_type_Desktop            int64
device_type_Mobile             int64
device_type_Web                int64
dtype: object

In [30]:
df['skip_rate'].value_counts()

skip_rate
0.34    155
0.15    153
0.56    151
0.19    151
0.10    150
       ... 
0.02    115
0.23    114
0.18    113
0.60     72
0.00     67
Name: count, Length: 61, dtype: int64

In [31]:
# applying minmax scaling on skip rate (0 or 1)
from sklearn.preprocessing import MinMaxScaler 
scaler = MinMaxScaler()

In [32]:
skip_rate_data = df[['skip_rate']] 
df['skip_rate_scaled'] = scaler.fit_transform(skip_rate_data)

In [33]:
df.head()

Unnamed: 0,user_id,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,is_churned,gender_Female,gender_Male,...,country_UK,country_US,subscription_type_Family,subscription_type_Free,subscription_type_Premium,subscription_type_Student,device_type_Desktop,device_type_Mobile,device_type_Web,skip_rate_scaled
0,1,54,26,23,0.2,31,0,1,1,0,...,0,0,0,1,0,0,1,0,0,0.333333
1,2,33,141,62,0.34,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0.566667
2,3,38,199,38,0.04,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0.066667
3,4,22,36,2,0.31,0,1,0,1,0,...,0,0,0,0,0,1,0,1,0,0.516667
4,5,29,250,57,0.36,0,1,1,0,0,...,0,1,1,0,0,0,0,1,0,0.6


In [34]:
df.dtypes

user_id                        int64
age                            int64
listening_time                 int64
songs_played_per_day           int64
skip_rate                    float64
ads_listened_per_week          int64
offline_listening              int64
is_churned                     int64
gender_Female                  int64
gender_Male                    int64
gender_Other                   int64
country_AU                     int64
country_CA                     int64
country_DE                     int64
country_FR                     int64
country_IN                     int64
country_PK                     int64
country_UK                     int64
country_US                     int64
subscription_type_Family       int64
subscription_type_Free         int64
subscription_type_Premium      int64
subscription_type_Student      int64
device_type_Desktop            int64
device_type_Mobile             int64
device_type_Web                int64
skip_rate_scaled             float64
d

In [35]:
# now applying standard scaling 

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
columns_to_scale = ['age', 'listening_time', 'songs_played_per_day'] 
scaler = StandardScaler() 
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [38]:
df.head()

Unnamed: 0,user_id,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,is_churned,gender_Female,gender_Male,...,country_UK,country_US,subscription_type_Family,subscription_type_Free,subscription_type_Premium,subscription_type_Student,device_type_Desktop,device_type_Mobile,device_type_Web,skip_rate_scaled
0,1,1.282452,-1.524434,-0.953574,0.2,31,0,1,1,0,...,0,0,0,1,0,0,1,0,0,0.333333
1,2,-0.365956,-0.155555,0.417349,0.34,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0.566667
2,3,0.026522,0.534836,-0.426296,0.04,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0.066667
3,4,-1.229408,-1.405401,-1.691763,0.31,0,1,0,1,0,...,0,0,0,0,0,1,0,1,0,0.516667
4,5,-0.679939,1.141904,0.24159,0.36,0,1,1,0,0,...,0,1,1,0,0,0,0,1,0,0.6


In [55]:
# Insert a new cell here (e.g., Cell 39_A)

# 1. Scale 'ads_listened_per_week' using the existing StandardScaler
df['ads_listened_per_week'] = scaler.fit_transform(df[['ads_listened_per_week']])

# 2. Drop the redundant/non-predictive columns
# Note: Keeping the scaled version: 'skip_rate_scaled'
# Drop one dummy from each set to avoid multicollinearity:
columns_to_drop = [
    'user_id', 
    'skip_rate', 
    'gender_Other',           # Drop one gender column
    'country_CA',             # Drop one country column
    'subscription_type_Free', # Drop one subscription column
    'device_type_Web'         # Drop one device column
]

df_final = df.drop(columns=columns_to_drop, axis=1)

# Now, update your X and y variables for the next cells (40-42)
X = df_final.drop("is_churned", axis=1)
y = df_final['is_churned']

In [56]:
# model creation 

In [57]:
# X = df.drop("is_churned",axis=1) 
# y= df['is_churned']

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)

In [60]:
from sklearn.metrics import accuracy_score,f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [61]:
# Modify Cell 44: Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

models = {
    # Added class_weight='balanced'
    "logistic Regression": LogisticRegression(class_weight='balanced', random_state=42),
    "KNN": KNeighborsClassifier(), 
    # Naive Bayes does not support class weights
    "Naive Bayes": GaussianNB(), 
    # Added class_weight='balanced'
    "Decision Tree" : DecisionTreeClassifier(class_weight='balanced', random_state=42),
    # Added class_weight='balanced'
    "SVM (RBF Kernel)": SVC(probability=True, class_weight='balanced', random_state=42),
    # Added a powerful, imbalanced-friendly model
    "Random Forest (Balanced)": RandomForestClassifier(class_weight='balanced', random_state=42)
}

# Cell 45, 47, 48 (Results): Keep these cells as-is to see the improvement!

In [62]:
result=[]

In [63]:
for name , model in models.items():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    result.append({
        'model': name,
        'Accuracy': round(acc,4), 
        'f1 score': round(f1, 4)
    })

In [64]:
result

[{'model': 'logistic Regression', 'Accuracy': 0.5219, 'f1 score': 0.362},
 {'model': 'KNN', 'Accuracy': 0.6906, 'f1 score': 0.1567},
 {'model': 'Naive Bayes', 'Accuracy': 0.75, 'f1 score': 0.0},
 {'model': 'Decision Tree', 'Accuracy': 0.6381, 'f1 score': 0.2605},
 {'model': 'SVM (RBF Kernel)', 'Accuracy': 0.5319, 'f1 score': 0.327},
 {'model': 'Random Forest (Balanced)', 'Accuracy': 0.7431, 'f1 score': 0.0096}]

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

# --- 1. Define the Model and the Search Space ---

# Initialize the Logistic Regression model.
# Crucially, we keep the class_weight='balanced' to handle the imbalance.
log_reg = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)

# Define the parameter grid to search over.
# C: Inverse of regularization strength. Smaller C means stronger regularization.
# The search will test 10 values logarithmically spaced between 0.001 and 100.
param_grid = {
    'C': np.logspace(-3, 2, 10),  # e.g., [0.001, 0.0046, ..., 21.54, 100.0]
    'penalty': ['l1', 'l2']      # Test both L1 (Lasso) and L2 (Ridge) regularization
}

# --- 2. Initialize GridSearchCV ---

# We use the F1 score as the metric for optimization ('scoring').
# We use Stratified K-Fold Cross-Validation (default for classification in GridSearchCV)
# to ensure each fold has the same proportion of churned users.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='f1',
    cv=5,                 # Use 5-fold cross-validation
    verbose=1,            # Print updates during the search
    n_jobs=-1             # Use all available CPU cores
)

# --- 3. Run the Grid Search ---

# X_train and y_train are assumed to be defined from your train_test_split
print("Starting Grid Search...")
grid_search.fit(X_train, y_train)
print("Grid Search Complete.")

# --- 4. Evaluate the Best Model ---

# Get the best parameters found
print("\nBest Parameters found by Grid Search:")
print(grid_search.best_params_)

# Get the best cross-validated F1 score
print("\nBest Cross-Validated F1 Score:")
print(f"{grid_search.best_score_:.4f}")

# Retrieve the best model
best_log_reg = grid_search.best_estimator_

# Test the best model on the unseen test set
y_pred_best = best_log_reg.predict(X_test)
final_f1 = f1_score(y_test, y_pred_best)
final_acc = best_log_reg.score(X_test, y_test)

print(f"\nFinal F1 Score on Test Set: {final_f1:.4f}")
print(f"Final Accuracy on Test Set: {final_acc:.4f}")

# View the confusion matrix for a deeper look
print("\nConfusion Matrix (Best Model on Test Set):")
print(confusion_matrix(y_test, y_pred_best))

Starting Grid Search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Grid Search Complete.

Best Parameters found by Grid Search:
{'C': np.float64(0.001), 'penalty': 'l2'}

Best Cross-Validated F1 Score:
0.3440

Final F1 Score on Test Set: 0.3577
Final Accuracy on Test Set: 0.4794

Confusion Matrix (Best Model on Test Set):
[[535 665]
 [168 232]]


In [66]:
pip install xgboost





In [67]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import numpy as np

# --- 1. Calculate the Class Imbalance Ratio ---

# Count the number of negative (0, Non-Churn) and positive (1, Churn) cases in the training data
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)

# Calculate the scale_pos_weight: Ratio of (Negative Samples / Positive Samples)
# This parameter tells XGBoost how much more important the minority class is.
scale_pos_weight_value = neg_count / pos_count

print(f"Non-Churn Samples (0): {neg_count}")
print(f"Churn Samples (1): {pos_count}")
print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.2f}")


# --- 2. Initialize and Train the XGBoost Model ---

# We use the XGBClassifier with the calculated weight and key parameters for imbalanced data
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',      # Standard objective for binary classification
    eval_metric='logloss',            # Metric for evaluation during training
    use_label_encoder=False,          # Suppress warning for an older setting
    scale_pos_weight=scale_pos_weight_value, # CRITICAL: Balances the classes
    n_estimators=1000,                # Number of boosting rounds (trees)
    learning_rate=0.01,               # Slower learning rate for better convergence
    max_depth=5,                      # Limit tree depth to prevent simple overfitting
    random_state=42,
    n_jobs=-1                         # Use all cores
)

print("\nStarting XGBoost Training...")
# Fit the model to the training data
xgb_model.fit(X_train, y_train)
print("XGBoost Training Complete.")


# --- 3. Evaluate the Model on the Test Set ---

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1] # Get probabilities for future tuning

# Calculate metrics
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_cm = confusion_matrix(y_test, y_pred_xgb)

print("\n--- XGBoost Results (Scale Position Weight Applied) ---")
print(f"Accuracy: {xgb_acc:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

print("\nConfusion Matrix:")
print(xgb_cm)

Non-Churn Samples (0): 4729
Churn Samples (1): 1671
Calculated scale_pos_weight: 2.83

Starting XGBoost Training...
XGBoost Training Complete.

--- XGBoost Results (Scale Position Weight Applied) ---
Accuracy: 0.5800
F1 Score: 0.3198

Confusion Matrix:
[[770 430]
 [242 158]]


In [68]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import f1_score
import numpy as np

# Assuming X_train, y_train are defined.

# 1. Re-calculate scale_pos_weight (just to be safe, should be 2.83)
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)
scale_pos_weight_value = neg_count / pos_count

# 2. Define the Model and the Search Space
xgb_base = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    # Fix the critical class imbalance parameter
    scale_pos_weight=scale_pos_weight_value,
    n_estimators=100,  # Use fewer trees for faster search
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

# Define the parameter grid to search over
param_grid = {
    # Test depths from 3 to 7
    'max_depth': [3, 5, 7],
    # Test gamma values (0=no regularization, 0.5 and 1.0 are common)
    'gamma': [0, 0.5, 1.0],
    # Test two common subsample rates
    'subsample': [0.8, 1.0]
}

# 3. Initialize GridSearchCV
# We keep the scoring metric as 'f1'
xgb_grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# 4. Run the Grid Search
print("Starting XGBoost Grid Search...")
xgb_grid_search.fit(X_train, y_train)
print("XGBoost Grid Search Complete.")

# 5. Evaluate the Best Model
best_xgb = xgb_grid_search.best_estimator_

print("\nBest Parameters found by XGBoost Grid Search:")
print(xgb_grid_search.best_params_)

print("\nBest Cross-Validated F1 Score:")
print(f"{xgb_grid_search.best_score_:.4f}")

# Test the best model on the unseen test set
y_pred_best_xgb = best_xgb.predict(X_test)
final_f1_xgb = f1_score(y_test, y_pred_best_xgb)
final_acc_xgb = accuracy_score(y_test, y_pred_best_xgb)

print(f"\nFinal F1 Score on Test Set: {final_f1_xgb:.4f}")
print(f"Final Accuracy on Test Set: {final_acc_xgb:.4f}")

print("\nConfusion Matrix (Best XGBoost Model on Test Set):")
print(confusion_matrix(y_test, y_pred_best_xgb))

Starting XGBoost Grid Search...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
XGBoost Grid Search Complete.

Best Parameters found by XGBoost Grid Search:
{'gamma': 1.0, 'max_depth': 3, 'subsample': 0.8}

Best Cross-Validated F1 Score:
0.3201

Final F1 Score on Test Set: 0.3221
Final Accuracy on Test Set: 0.5344

Confusion Matrix (Best XGBoost Model on Test Set):
[[678 522]
 [223 177]]


In [69]:
import joblib
from sklearn.preprocessing import StandardScaler
# Assuming 'best_log_reg' is your final trained Logistic Regression model
# Assuming 'scaler' is the StandardScaler object used for age, listening_time, etc.


# 1. Save the model object
joblib.dump(best_log_reg, 'churn_model.pkl')

# 2. Save the scaler object used for age, listening_time, songs_played_per_day, etc.
# Note: You must ensure you have a fit StandardScaler object.
# If you didn't save it before, you need to re-fit one on the training data.
# For simplicity, we assume 'scaler' is available:
joblib.dump(scaler, 'feature_scaler.pkl')

print("Model and Scaler saved successfully.")

Model and Scaler saved successfully.


In [70]:
print(model.feature_names_in_)

['age' 'listening_time' 'songs_played_per_day' 'ads_listened_per_week'
 'offline_listening' 'gender_Female' 'gender_Male' 'country_AU'
 'country_DE' 'country_FR' 'country_IN' 'country_PK' 'country_UK'
 'country_US' 'subscription_type_Family' 'subscription_type_Premium'
 'subscription_type_Student' 'device_type_Desktop' 'device_type_Mobile'
 'skip_rate_scaled']
