In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [2]:
# Location of the raw churn CSV. Set the NETFLIX_CHURN_CSV environment variable to
# run the notebook on another machine; when it is unset the original local path is used.
DATA_PATH = os.environ.get(
    "NETFLIX_CHURN_CSV",
    r"D:\Ultimate Programming\Data Bases\Machine Learning Datasets\Classification\netflix_customer_churn.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first two rows of the raw frame.
df.iloc[:2]

Unnamed: 0,customer_id,age,gender,subscription_type,watch_hours,last_login_days,region,device,monthly_fee,churned,payment_method,number_of_profiles,avg_watch_time_per_day,favorite_genre
0,a9b75100-82a8-427a-a208-72f24052884a,51,Other,Basic,14.73,29,Africa,TV,8.99,1,Gift Card,1,0.49,Action
1,49a5dfd9-7e69-4022-a6ad-0a1b9767fb5b,47,Other,Standard,0.7,19,Europe,Mobile,13.99,1,Gift Card,5,0.03,Sci-Fi


In [4]:
# Remove the customer_id column before modelling.
# `df` is rebound instead of mutated in place, and errors='ignore' makes a second
# execution of this cell a no-op rather than a KeyError on the already-dropped column.
df = df.drop(columns='customer_id', errors='ignore')

In [5]:
# Confirm customer_id is gone by showing the first remaining row.
df.iloc[:1]

Unnamed: 0,age,gender,subscription_type,watch_hours,last_login_days,region,device,monthly_fee,churned,payment_method,number_of_profiles,avg_watch_time_per_day,favorite_genre
0,51,Other,Basic,14.73,29,Africa,TV,8.99,1,Gift Card,1,0.49,Action


In [6]:
# Frequency of each payment method, most common first.
df.loc[:, 'payment_method'].value_counts()

payment_method
Debit Card     1030
PayPal         1026
Crypto          995
Gift Card       976
Credit Card     973
Name: count, dtype: int64

In [7]:
# Total number of missing cells across all columns (per-column sums, then summed).
print(df.isna().sum(axis=0).sum())

0


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,watch_hours,last_login_days,monthly_fee,churned,number_of_profiles,avg_watch_time_per_day
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,43.8474,11.64945,30.0898,13.6834,0.503,3.0244,0.8748
std,15.501128,12.014654,17.536078,3.692062,0.500041,1.415841,2.619824
min,18.0,0.01,0.0,8.99,0.0,1.0,0.0
25%,30.0,3.3375,15.0,8.99,0.0,2.0,0.11
50%,44.0,8.0,30.0,13.99,1.0,3.0,0.29
75%,58.0,16.03,45.0,17.99,1.0,4.0,0.72
max,70.0,110.4,60.0,17.99,1.0,5.0,98.42


In [9]:
# Separate the target label from the feature columns.
y = df['churned']                   # target: the `churned` column
x = df.drop(columns=['churned'])    # features: every other column

In [10]:
# First feature row, to verify the target column is no longer present.
x.iloc[:1]

Unnamed: 0,age,gender,subscription_type,watch_hours,last_login_days,region,device,monthly_fee,payment_method,number_of_profiles,avg_watch_time_per_day,favorite_genre
0,51,Other,Basic,14.73,29,Africa,TV,8.99,Gift Card,1,0.49,Action


In [11]:
# Dimensions of the target series.
y.shape

(5000,)

In [12]:
# Dimensions of the feature frame (rows, columns).
x.shape

(5000, 12)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [14]:
# List the feature column names referenced by the encoders below.
x.columns

Index(['age', 'gender', 'subscription_type', 'watch_hours', 'last_login_days',
       'region', 'device', 'monthly_fee', 'payment_method',
       'number_of_profiles', 'avg_watch_time_per_day', 'favorite_genre'],
      dtype='object')

In [15]:
# Encoding stage: turns the categorical feature columns into numbers and passes
# the remaining (numeric) columns through unchanged.
#
# Transformer order determines the output column order, so it is kept as-is.
trf1 = ColumnTransformer(transformers=[
    ('gender', OneHotEncoder(drop='first', sparse_output=False), ['gender']),
    # Subscription tiers have a natural rank. Without explicit categories the
    # encoder sorts labels alphabetically (Basic < Premium < Standard), which
    # scrambles that rank. Assumes the plan labels are exactly
    # Basic/Standard/Premium; fit() raises on any other label, so confirm with
    # x['subscription_type'].unique() if that happens.
    ('subs_type',
     OrdinalEncoder(categories=[['Basic', 'Standard', 'Premium']]),
     ['subscription_type']),
    ('region', OneHotEncoder(drop='first', sparse_output=False), ['region']),
    # NOTE(review): device looks nominal, so the integer codes impose an arbitrary
    # order. Switching to one-hot would change the output width, so it is left
    # unchanged here.
    ('device', OrdinalEncoder(), ['device']),
    ('pay_meth', OneHotEncoder(drop='first', sparse_output=False), ['payment_method']),
    ('fav_genre', OneHotEncoder(drop='first', sparse_output=False), ['favorite_genre'])
], remainder='passthrough')

In [16]:
# Exploratory check only: encode the full feature frame to inspect the resulting
# matrix width. Despite the name, x_sc is encoded but not scaled.
x_sc = trf1.fit(x).transform(x)
x_sc.shape

(5000, 25)

In [17]:
# Scaling stage: min-max scale every column produced by the encoding stage.
# The scaler is applied to the whole matrix rather than to a fixed column slice,
# so it keeps covering every feature if the encoder's output width changes.
trf2 = MinMaxScaler()

In [18]:
from sklearn.linear_model import LogisticRegression
# Classifier with library-default hyperparameters.
lr = LogisticRegression()

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [19]:
from sklearn.pipeline import make_pipeline, Pipeline

In [20]:
# Chain the three stages into a single estimator so that preprocessing is
# fitted and applied together with the model.
pipeline = make_pipeline(
    trf1,  # categorical encoding
    trf2,  # min-max scaling
    lr,    # logistic-regression classifier
)

In [21]:
# Fit all pipeline stages on the training split only.
pipeline.fit(X=x_train, y=y_train)

In [22]:
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when they are displayed.
set_config(display='diagram')

In [23]:
# Persist the fitted pipeline (preprocessing + classifier) to the working directory.
joblib.dump(value=pipeline, filename='model.pkl')

['model.pkl']