In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
import warnings 
warnings.filterwarnings('ignore')

In [12]:
df = pd.read_csv('spotify_churn_dataset.csv')

In [13]:
df

Unnamed: 0,user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
0,1,Female,54,CA,Free,26,23,0.20,Desktop,31,0,1
1,2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
2,3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
3,4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
4,5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,7996,Other,44,DE,Student,237,36,0.30,Mobile,0,1,1
7996,7997,Male,34,AU,Premium,61,64,0.59,Mobile,0,1,0
7997,7998,Female,17,US,Free,81,62,0.33,Desktop,5,0,0
7998,7999,Female,34,IN,Student,245,94,0.27,Desktop,0,1,0


In [14]:
df.dtypes

user_id                    int64
gender                    object
age                        int64
country                   object
subscription_type         object
listening_time             int64
songs_played_per_day       int64
skip_rate                float64
device_type               object
ads_listened_per_week      int64
offline_listening          int64
is_churned                 int64
dtype: object

In [15]:
df.isnull().sum()

user_id                  0
gender                   0
age                      0
country                  0
subscription_type        0
listening_time           0
songs_played_per_day     0
skip_rate                0
device_type              0
ads_listened_per_week    0
offline_listening        0
is_churned               0
dtype: int64

In [16]:
# one hot encoding on gender column

In [17]:
gender_dummies = pd.get_dummies(df['gender'],prefix='gender',dtype=int) 
df = pd.concat([df, gender_dummies], axis=1) 
df.drop('gender', axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,user_id,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned,gender_Female,gender_Male,gender_Other
0,1,54,CA,Free,26,23,0.2,Desktop,31,0,1,1,0,0
1,2,33,DE,Family,141,62,0.34,Web,0,1,0,0,0,1
2,3,38,AU,Premium,199,38,0.04,Mobile,0,1,1,0,1,0
3,4,22,CA,Student,36,2,0.31,Mobile,0,1,0,1,0,0
4,5,29,US,Family,250,57,0.36,Mobile,0,1,1,0,0,1


In [19]:
df.dtypes

user_id                    int64
age                        int64
country                   object
subscription_type         object
listening_time             int64
songs_played_per_day       int64
skip_rate                float64
device_type               object
ads_listened_per_week      int64
offline_listening          int64
is_churned                 int64
gender_Female              int64
gender_Male                int64
gender_Other               int64
dtype: object

In [20]:
# now similarly applying encoding on other data of object datatype

In [23]:
df['country'].value_counts()

country
AU    1034
US    1032
DE    1015
IN    1011
PK     999
FR     989
UK     966
CA     954
Name: count, dtype: int64

In [24]:
country_dummies = pd.get_dummies(df['country'],prefix='country',dtype=int) 
df = pd.concat([df, country_dummies], axis=1) 
df.drop('country', axis=1, inplace=True)

In [25]:
df.head()

Unnamed: 0,user_id,age,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned,...,gender_Male,gender_Other,country_AU,country_CA,country_DE,country_FR,country_IN,country_PK,country_UK,country_US
0,1,54,Free,26,23,0.2,Desktop,31,0,1,...,0,0,0,1,0,0,0,0,0,0
1,2,33,Family,141,62,0.34,Web,0,1,0,...,0,1,0,0,1,0,0,0,0,0
2,3,38,Premium,199,38,0.04,Mobile,0,1,1,...,1,0,1,0,0,0,0,0,0,0
3,4,22,Student,36,2,0.31,Mobile,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,5,29,Family,250,57,0.36,Mobile,0,1,1,...,0,1,0,0,0,0,0,0,0,1


In [26]:
df.dtypes

user_id                    int64
age                        int64
subscription_type         object
listening_time             int64
songs_played_per_day       int64
skip_rate                float64
device_type               object
ads_listened_per_week      int64
offline_listening          int64
is_churned                 int64
gender_Female              int64
gender_Male                int64
gender_Other               int64
country_AU                 int64
country_CA                 int64
country_DE                 int64
country_FR                 int64
country_IN                 int64
country_PK                 int64
country_UK                 int64
country_US                 int64
dtype: object

In [27]:
subscription_type_dummies = pd.get_dummies(df['subscription_type'],prefix='subscription_type',dtype=int) 
df = pd.concat([df, subscription_type_dummies], axis=1) 
df.drop('subscription_type', axis=1, inplace=True)

In [28]:
device_type_dummies = pd.get_dummies(df['device_type'],prefix='device_type',dtype=int) 
df = pd.concat([df, device_type_dummies], axis=1) 
df.drop('device_type', axis=1, inplace=True)

In [29]:
df.dtypes

user_id                        int64
age                            int64
listening_time                 int64
songs_played_per_day           int64
skip_rate                    float64
ads_listened_per_week          int64
offline_listening              int64
is_churned                     int64
gender_Female                  int64
gender_Male                    int64
gender_Other                   int64
country_AU                     int64
country_CA                     int64
country_DE                     int64
country_FR                     int64
country_IN                     int64
country_PK                     int64
country_UK                     int64
country_US                     int64
subscription_type_Family       int64
subscription_type_Free         int64
subscription_type_Premium      int64
subscription_type_Student      int64
device_type_Desktop            int64
device_type_Mobile             int64
device_type_Web                int64
dtype: object

In [30]:
df['skip_rate'].value_counts()

skip_rate
0.34    155
0.15    153
0.56    151
0.19    151
0.10    150
       ... 
0.02    115
0.23    114
0.18    113
0.60     72
0.00     67
Name: count, Length: 61, dtype: int64

In [31]:
# applying minmax scaling on skip rate (0 or 1)
from sklearn.preprocessing import MinMaxScaler 
scaler = MinMaxScaler()

In [32]:
skip_rate_data = df[['skip_rate']] 
df['skip_rate_scaled'] = scaler.fit_transform(skip_rate_data)

In [33]:
df.head()

Unnamed: 0,user_id,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,is_churned,gender_Female,gender_Male,...,country_UK,country_US,subscription_type_Family,subscription_type_Free,subscription_type_Premium,subscription_type_Student,device_type_Desktop,device_type_Mobile,device_type_Web,skip_rate_scaled
0,1,54,26,23,0.2,31,0,1,1,0,...,0,0,0,1,0,0,1,0,0,0.333333
1,2,33,141,62,0.34,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0.566667
2,3,38,199,38,0.04,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0.066667
3,4,22,36,2,0.31,0,1,0,1,0,...,0,0,0,0,0,1,0,1,0,0.516667
4,5,29,250,57,0.36,0,1,1,0,0,...,0,1,1,0,0,0,0,1,0,0.6


In [34]:
df.dtypes

user_id                        int64
age                            int64
listening_time                 int64
songs_played_per_day           int64
skip_rate                    float64
ads_listened_per_week          int64
offline_listening              int64
is_churned                     int64
gender_Female                  int64
gender_Male                    int64
gender_Other                   int64
country_AU                     int64
country_CA                     int64
country_DE                     int64
country_FR                     int64
country_IN                     int64
country_PK                     int64
country_UK                     int64
country_US                     int64
subscription_type_Family       int64
subscription_type_Free         int64
subscription_type_Premium      int64
subscription_type_Student      int64
device_type_Desktop            int64
device_type_Mobile             int64
device_type_Web                int64
skip_rate_scaled             float64
d

In [35]:
# now applying standard scaling 

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
columns_to_scale = ['age', 'listening_time', 'songs_played_per_day'] 
scaler = StandardScaler() 
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [38]:
df.head()

Unnamed: 0,user_id,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,is_churned,gender_Female,gender_Male,...,country_UK,country_US,subscription_type_Family,subscription_type_Free,subscription_type_Premium,subscription_type_Student,device_type_Desktop,device_type_Mobile,device_type_Web,skip_rate_scaled
0,1,1.282452,-1.524434,-0.953574,0.2,31,0,1,1,0,...,0,0,0,1,0,0,1,0,0,0.333333
1,2,-0.365956,-0.155555,0.417349,0.34,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0.566667
2,3,0.026522,0.534836,-0.426296,0.04,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0.066667
3,4,-1.229408,-1.405401,-1.691763,0.31,0,1,0,1,0,...,0,0,0,0,0,1,0,1,0,0.516667
4,5,-0.679939,1.141904,0.24159,0.36,0,1,1,0,0,...,0,1,1,0,0,0,0,1,0,0.6


In [39]:
# model creation 

In [40]:
X = df.drop("is_churned",axis=1) 
y= df['is_churned']

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)

In [43]:
from sklearn.metrics import accuracy_score,f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [44]:
models = {
    "logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(), 
    "Naive Bayes": GaussianNB(), 
    "Decision Tree" : DecisionTreeClassifier() ,
    "SVM (RBF Kernel)": SVC(probability=True)
}

In [45]:
result=[]

In [47]:
for name , model in models.items():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    result.append({
        'model': name,
        'Accuracy': round(acc,4), 
        'f1 score': round(f1, 4)
    })

In [48]:
result

[{'model': 'logistic Regression', 'Accuracy': 0.75, 'f1 score': 0.0},
 {'model': 'KNN', 'Accuracy': 0.6944, 'f1 score': 0.1525},
 {'model': 'Naive Bayes', 'Accuracy': 0.75, 'f1 score': 0.0},
 {'model': 'Decision Tree', 'Accuracy': 0.62, 'f1 score': 0.2675},
 {'model': 'SVM (RBF Kernel)', 'Accuracy': 0.75, 'f1 score': 0.0}]