In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

# Load the Spotify churn dataset from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/spotify-dataset-for-churn-analysis/spotify_churn_dataset.csv'
)

# Preview the first five rows (the head() default) as plain text.
print(df.head(n=5))

In [None]:
# Print pandas' default descriptive statistics for the frame.
summary_stats = df.describe()
print(summary_stats)

# Data Preprocessing

In [None]:
# Count the missing entries in every column (isna() is the same check as isnull()).
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# List the dtype pandas inferred for each column.
# The bullet in the label had been corrupted by a UTF-8 -> cp1252 round trip;
# it is restored here to the intended small-blue-diamond character.
print("\n🔹 Data Types:\n", df.dtypes)


# EDA

In [None]:
# Frequency table for every string-typed (object dtype) column.
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\nValue Counts for {col}:")
    print(level_counts)

In [None]:
# Histogram of user ages (20 bins) with a kernel-density overlay.
plt.figure(figsize=(7, 4))
sns.histplot(data=df, x='age', bins=20, kde=True, color="skyblue")
plt.title("Age Distribution of Users")
plt.show()

In [None]:
# Number of users in each gender category.
plt.figure(figsize=(5, 4))
sns.countplot(data=df, x="gender", palette="Set2")
plt.title("Gender Distribution")
plt.show()

In [None]:
# Bar chart of the ten most frequent countries.
# value_counts() sorts by count descending, so head(10) keeps the top ten.
top_countries = df['country'].value_counts().head(10)

plt.figure(figsize=(10, 5))
top_countries.plot(kind='bar', color="orange")
plt.title("Top 10 Countries by User Count")
plt.xlabel("Country")
plt.ylabel("Users")
plt.show()

In [None]:
# Number of users on each subscription plan.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x="subscription_type", palette="coolwarm")
plt.title("Subscription Type Distribution")
plt.show()

In [None]:
# Histogram of listening_time (30 bins) with a kernel-density overlay.
plt.figure(figsize=(7, 4))
sns.histplot(data=df, x='listening_time', bins=30, kde=True, color="purple")
plt.title("Listening Time per Day (Minutes)")
plt.show()

In [None]:
# Number of users per device type.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x="device_type", palette="muted")
plt.title("Device Type Distribution")
plt.show()

In [None]:
# Histogram of weekly ad counts (20 bins); no density curve is drawn here.
plt.figure(figsize=(7, 4))
sns.histplot(data=df, x='ads_listened_per_week', bins=20, kde=False, color="green")
plt.title("Ads Listened per Week")
plt.show()

In [None]:
# Number of users for each value of the offline_listening flag.
plt.figure(figsize=(5, 4))
sns.countplot(data=df, x="offline_listening", palette="Set1")
plt.title("Offline Listening Usage")
plt.show()

# EDA for Churn


In [None]:
# Class balance of the target: how many users churned versus stayed.
plt.figure(figsize=(5, 4))
sns.countplot(data=df, x="is_churned", palette="Set2")
plt.title("Churn vs Active Users")
# Relabel tick positions 0 and 1 with readable names
# (assumes is_churned only takes the values 0 and 1 — confirm against the data).
plt.xticks([0, 1], ["Active (0)", "Churned (1)"])
plt.show()

In [None]:
# Compare the age distribution of active and churned users.
plt.figure(figsize=(7, 4))
sns.boxplot(data=df, x="is_churned", y="age", palette="Set3")
plt.title("Age Distribution by Churn")
plt.xticks([0, 1], ["Active", "Churned"])
plt.show()

In [None]:
# User counts per gender, split into bars by churn status.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x="gender", hue="is_churned", palette="coolwarm")
plt.title("Gender Distribution by Churn")
plt.show()

In [None]:
# User counts per subscription plan, split into bars by churn status.
plt.figure(figsize=(7, 4))
sns.countplot(data=df, x="subscription_type", hue="is_churned", palette="muted")
plt.title("Subscription Type vs Churn")
plt.show()

In [None]:
# Compare listening_time between active and churned users.
plt.figure(figsize=(7, 4))
sns.boxplot(data=df, x="is_churned", y="listening_time", palette="Set2")
plt.title("Daily Listening Time vs Churn")
plt.xticks([0, 1], ["Active", "Churned"])
plt.show()

In [None]:
# Compare songs_played_per_day between active and churned users.
plt.figure(figsize=(7, 4))
sns.boxplot(data=df, x="is_churned", y="songs_played_per_day", palette="Set1")
plt.title("Songs Played per Day vs Churn")
plt.xticks([0, 1], ["Active", "Churned"])
plt.show()

In [None]:
# Compare skip_rate between active and churned users.
plt.figure(figsize=(7, 4))
sns.boxplot(data=df, x="is_churned", y="skip_rate", palette="coolwarm")
plt.title("Skip Rate vs Churn")
plt.xticks([0, 1], ["Active", "Churned"])
plt.show()

In [None]:
# User counts per device type, split into bars by churn status.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x="device_type", hue="is_churned", palette="Set3")
plt.title("Device Type vs Churn")
plt.show()

In [None]:
# User counts per offline_listening value, split into bars by churn status.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x="offline_listening", hue="is_churned", palette="Set1")
plt.title("Offline Listening vs Churn")
plt.show()

In [None]:
# Pairwise correlation between the selected numeric columns and the churn flag.
corr_columns = [
    'age',
    'listening_time',
    'songs_played_per_day',
    'skip_rate',
    'ads_listened_per_week',
    'is_churned',
]
corr_matrix = df[corr_columns].corr()

plt.figure(figsize=(8, 6))
# annot=True writes each coefficient into its cell.
sns.heatmap(corr_matrix, annot=True, cmap="Blues")
plt.title("Correlation with Churn")
plt.show()

# ML for Churn Analysis

In [None]:
# One-hot encode the nominal columns. drop_first=True drops one level per
# column so the resulting dummy columns are not perfectly collinear.
cat_cols = ['gender', 'country', 'subscription_type', 'device_type']

# Encode only the columns that are still present. This cell overwrites `df`,
# so on a second run the raw columns are already gone and an unconditional
# get_dummies call would raise KeyError; with the guard a re-run is a no-op.
cols_to_encode = [col for col in cat_cols if col in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)

In [None]:
# Train/test split
from sklearn.model_selection import train_test_split

# Target is the churn flag; only that column is dropped to form the features.
# NOTE(review): every other column of df becomes a feature. If the CSV carries
# an identifier column (e.g. a user id), it is included too — confirm.
y = df['is_churned']
X = df.drop(columns=['is_churned'])

# Hold out 20% for testing; stratifying on y keeps the churn ratio the same
# in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Logistic regression baseline. class_weight="balanced" reweights the classes
# inversely to their frequency; fit() returns the estimator, so it is chained.
# NOTE(review): the features are passed in unscaled, and convergence warnings
# are hidden by the notebook-wide warnings filter — confirm the fit converges.
lr = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Report test-set performance.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_confusion = confusion_matrix(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", lr_confusion)

In [None]:
# Decision Tree
# DecisionTreeClassifier is not among the notebook's top-level imports, so it
# is imported here; without this the cell fails with NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier

# class_weight="balanced" reweights the classes inversely to their frequency;
# the fixed seed makes the fitted tree repeatable.
dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")

dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Report test-set performance.
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

In [None]:
# Random Forest with balanced class weights and a fixed seed.
# fit() returns the estimator, so construction and training are chained.
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Report test-set performance.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))

In [None]:
# Support vector classifier
from sklearn.svm import SVC

# probability=True makes predict_proba available on the fitted model; the
# evaluation cells of this notebook call it. fit() returns the estimator.
# NOTE(review): the features are passed in unscaled — confirm that is intended.
svm = SVC(
    class_weight="balanced",
    probability=True,
    random_state=42,
).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Report test-set performance.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))

In [None]:
# Test-set predictions produced by the model cells, keyed by display name.
predictions = {
    "Logistic Regression": y_pred_lr,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "SVM": y_pred_svm,
}

# Accuracy of each model on the held-out test set.
results = {
    label: accuracy_score(y_test, preds)
    for label, preds in predictions.items()
}

print("Model Comparison:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

In [None]:
# ROC Comparison

from sklearn.metrics import roc_curve, roc_auc_score

# Classifiers fitted in the earlier cells (lr, dt, rf, svm), keyed by display name.
models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "Random Forest": rf,
    "SVM": svm
}

plt.figure(figsize=(8,6))

for name, model in models.items():
    # Column 1 of predict_proba is the probability of the second class label,
    # i.e. the churned class when is_churned is coded 0/1.
    y_prob = model.predict_proba(X_test)[:,1]

    # ROC curve points and the area under the curve.
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    # Stored as roc_auc rather than auc so the sklearn.metrics.auc function
    # imported at the top of the notebook is not overwritten by a float.
    roc_auc = roc_auc_score(y_test, y_prob)

    # One curve per model, with its AUC in the legend.
    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# Diagonal reference line: the ROC of random guessing.
plt.plot([0,1], [0,1], 'k--')

plt.title("ROC Curve Comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Classifiers fitted in the earlier cells, keyed by display name.
models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "Random Forest": rf,
    "SVM": svm
}

# One row per model: [name, accuracy, precision, recall, f1, auc].
results = []

for name, model in models.items():
    # Hard predictions for the threshold metrics, and the probability of the
    # second class label (column 1) for the ranking metric.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 makes the "nothing predicted / nothing to find" case an
    # explicit 0 instead of relying on a warning that the notebook-wide
    # warnings filter hides. The returned values are unchanged.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    # Stored as roc_auc rather than auc so the sklearn.metrics.auc function
    # imported at the top of the notebook is not overwritten by a float.
    roc_auc = roc_auc_score(y_test, y_prob)

    results.append([name, acc, prec, rec, f1, roc_auc])

# Assemble the comparison table.
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "AUC"])

print(results_df)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Classifiers fitted in the earlier cells, keyed by display name.
models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "Random Forest": rf,
    "SVM": svm,
}

# --- 1. Cross Validation (5-fold) ---
# Each model is scored by accuracy on 5 folds of the full X, y (not just the
# training split). Per the scikit-learn docs, cross_val_score fits its own
# copies of the estimator, so the fitted models above are left untouched.
print("==== Cross Validation Results ====")
for name, model in models.items():
    cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
    mean_accuracy = cv_scores.mean()
    print(f"{name}: Mean Accuracy = {mean_accuracy:.4f}, Scores = {cv_scores}")

# --- 2. Confusion Matrices ---
# One heatmap per model, computed from the already-fitted model's predictions
# on the held-out test set.
print("\n==== Confusion Matrices on Test Set ====")
for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(4, 3))
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Overall Insights:
Accuracy is misleading → a high accuracy does not mean that churn is being detected.

Random Forest → best overall accuracy, but it almost ignores the churn class.

Decision Tree / Logistic Regression / SVM → sometimes detect churn, but their results are inconsistent.

Cross-validation confirms → Random Forest is stable in accuracy; the other models fluctuate.