In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the streaming dataset from its hard-coded CSV path and preview the first rows.
data_path = '/content/streaming_data.csv'
Churn_df = pd.read_csv(filepath_or_buffer=data_path)
Churn_df.head()

In [None]:
# Print the frame's columns, non-null counts and dtypes.
Churn_df.info()

In [None]:
# Descriptive statistics of the frame, using the pandas defaults.
Churn_df.describe()

In [None]:
# Number of missing values in each column.
Churn_df.isna().sum()

In [None]:
# Distribution of the 'Age' column. Missing values are dropped explicitly so
# the bins are computed from observed values only, and the figure is labelled
# so it can be read on its own.
plt.hist(Churn_df['Age'].dropna())
plt.title('Age distribution')
plt.xlabel('Age')
plt.ylabel('Number of rows')
plt.show()

In [None]:
# Total of 'Churned' per subscription type. Only the 'Churned' column is
# aggregated: summing every column of the frame would also try to add up
# identifier and text columns, which is meaningless and either fails or
# concatenates strings depending on the pandas version.
df_aux = Churn_df.groupby('Subscription_type')[['Churned']].sum()

In [None]:
# Bar chart of the 'Churned' total for each group currently held in df_aux.
# The grouping key is read from the index name so the labels always match
# whatever groupby produced df_aux.
group_name = str(df_aux.index.name)
plt.bar(df_aux.index, df_aux['Churned'])
plt.title(f'Churned total by {group_name}')
plt.xlabel(group_name)
plt.ylabel('Sum of Churned')
plt.show()

In [None]:
# Total of 'Churned' per gender. Only the 'Churned' column is aggregated:
# summing every column of the frame would also try to add up identifier and
# text columns, which is meaningless and either fails or concatenates strings
# depending on the pandas version.
df_aux = Churn_df.groupby('Gender')[['Churned']].sum()

In [None]:
# Bar chart of the 'Churned' total for each group currently held in df_aux.
# The grouping key is read from the index name so the labels always match
# whatever groupby produced df_aux.
group_name = str(df_aux.index.name)
plt.bar(df_aux.index, df_aux['Churned'])
plt.title(f'Churned total by {group_name}')
plt.xlabel(group_name)
plt.ylabel('Sum of Churned')
plt.show()

In [None]:
# Total of 'Churned' per number of connected devices. Only the 'Churned'
# column is aggregated: summing every column of the frame would also try to
# add up identifier and text columns, which is meaningless and either fails
# or concatenates strings depending on the pandas version.
df_aux = Churn_df.groupby('Devices_connected')[['Churned']].sum()

In [None]:
# Bar chart of the 'Churned' total for each group currently held in df_aux.
# The grouping key is read from the index name so the labels always match
# whatever groupby produced df_aux.
group_name = str(df_aux.index.name)
plt.bar(df_aux.index, df_aux['Churned'])
plt.title(f'Churned total by {group_name}')
plt.xlabel(group_name)
plt.ylabel('Sum of Churned')
plt.show()

In [None]:
# Distribution of the 'Avg_rating' column. Missing values are dropped
# explicitly so the bins are computed from observed values only, and the
# figure is labelled so it can be read on its own.
plt.hist(Churn_df['Avg_rating'].dropna())
plt.title('Avg_rating distribution')
plt.xlabel('Avg_rating')
plt.ylabel('Number of rows')
plt.show()

In [None]:
# Mean 'Avg_rating' for each value of 'Churned' (one-column frame indexed by 'Churned').
df_aux = Churn_df.groupby('Churned')[['Avg_rating']].mean()

In [None]:
# Bar chart of the mean 'Avg_rating' for each group currently held in df_aux.
# The grouping key is read from the index name so the labels always match
# whatever groupby produced df_aux.
group_name = str(df_aux.index.name)
plt.bar(df_aux.index, df_aux['Avg_rating'])
plt.title(f'Mean Avg_rating by {group_name}')
plt.xlabel(group_name)
plt.ylabel('Mean Avg_rating')
plt.show()

In [None]:
# Distribution of the 'Num_streaming_services' column. Missing values are
# dropped explicitly so the bins are computed from observed values only, and
# the figure is labelled so it can be read on its own.
plt.hist(Churn_df['Num_streaming_services'].dropna())
plt.title('Num_streaming_services distribution')
plt.xlabel('Num_streaming_services')
plt.ylabel('Number of rows')
plt.show()

In [None]:
# Replace missing 'Time_on_platform' values with 0.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas and leaves the frame unchanged under
# Copy-on-Write.
Churn_df['Time_on_platform'] = Churn_df['Time_on_platform'].fillna(0)

In [None]:
# Replace missing 'Num_streaming_services' values with 0.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas and leaves the frame unchanged under
# Copy-on-Write.
Churn_df['Num_streaming_services'] = Churn_df['Num_streaming_services'].fillna(0)

In [None]:
# Replace missing 'Churned' values with 0.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas and leaves the frame unchanged under
# Copy-on-Write.
Churn_df['Churned'] = Churn_df['Churned'].fillna(0)

In [None]:
# Replace missing 'Avg_rating' values with 0, keeping the column in place.
Churn_df = Churn_df.assign(Avg_rating=Churn_df['Avg_rating'].fillna(0))

In [None]:
# Replace missing 'Devices_connected' values with 0, keeping the column in place.
Churn_df = Churn_df.assign(Devices_connected=Churn_df['Devices_connected'].fillna(0))

In [None]:
# Drop every row that still has at least one missing value.
# The row index is not reset, so it keeps gaps where rows were removed.
Churn_df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-count missing values per column after filling and dropping.
Churn_df.isna().sum()

In [None]:
# Map the textual labels 'yes' / 'no' in 'Churned' to 1 / 0; other values are left as they are.
churn_label_map = {'yes': 1, 'no': 0}
Churn_df['Churned'] = Churn_df['Churned'].replace(churn_label_map)

In [None]:
# Print the frame's columns, non-null counts and dtypes at this point of the notebook.
Churn_df.info()

In [None]:
# Cast the listed numeric columns to integers.
# NOTE(review): astype(int) truncates any fractional part; if 'Avg_rating'
# holds non-integer ratings this discards information - confirm it is intended.
int_columns = [
    'Age',
    'Time_on_platform',
    'Devices_connected',
    'Num_streaming_services',
    'Avg_rating',
]
Churn_df[int_columns] = Churn_df[int_columns].astype(int)

In [None]:
# Encoder instance, fitted later in the notebook on 'Subscription_type'.
Le = LabelEncoder()

In [None]:
# Turn 'Subscription_type' into integer codes with the label encoder, then
# remove the 'User_id' column from the frame.
subscription_codes = Le.fit_transform(Churn_df['Subscription_type'])
Churn_df['Subscription_type'] = subscription_codes
Churn_df = Churn_df.drop(columns=['User_id'])

In [None]:
# One-hot encode 'Gender': the column is replaced by one indicator column per category.
categorical_columns = ['Gender']
Churn_df = pd.get_dummies(
    Churn_df,
    columns=categorical_columns,
)

In [None]:
# Heatmap of the pairwise correlations between all columns of the encoded frame.
correlation_matrix = Churn_df.corr()
sns.heatmap(correlation_matrix)

In [None]:
# Min-max scale every column of the frame (target included) and wrap the
# result back into a DataFrame with the original column names.
# The new frame gets a fresh 0..n-1 index rather than Churn_df's index.
Scaler = MinMaxScaler()
Churn_df_scaled = pd.DataFrame(
    Scaler.fit_transform(Churn_df),
    columns=Churn_df.columns,
)
Churn_df_scaled.head()

In [None]:
# Correlation of every scaled column with the 'Churned' column.
Churn_df_scaled.corr().loc[:, 'Churned']

In [None]:
# Separate the target ('Churned') from the feature columns and preview the features.
target_column = 'Churned'
y = Churn_df[target_column]
X = Churn_df.drop(columns=[target_column])
X.head()

In [None]:
# Stratified 80/20 train/test split followed by min-max scaling of the features.
# The original cell passed `X_scaled`, a name that is never defined in the
# notebook, so it raised NameError on a fresh kernel. The split is now done on
# `X`, and the scaler is fitted on the training rows only so that no
# information from the test rows leaks into the scaling parameters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
feature_scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(
    feature_scaler.transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test), columns=X.columns, index=X_test.index
)

In [None]:
# Baseline logistic regression with default settings, fitted on the training split.
# fit() returns the estimator itself, so creation and fitting are chained;
# the trailing expression displays the fitted estimator as the cell output.
Lr = LogisticRegression().fit(X_train, y_train)
Lr

In [None]:
# Logistic-regression predictions for the held-out test split.
y_pred_test_Lr = Lr.predict(X_test)

In [None]:
# Logistic-regression predictions for the training split (used to compare train vs. test scores).
y_pred_train_Lr = Lr.predict(X_train)

In [None]:
# Confusion matrix of the logistic regression on the test split.
Mc = confusion_matrix(y_true=y_test, y_pred=y_pred_test_Lr)

In [None]:
# Draw the confusion matrix currently stored in Mc.
ConfusionMatrixDisplay(confusion_matrix=Mc).plot()

In [None]:
# Confusion matrix of the logistic regression on the training split, then plot it.
Mc = confusion_matrix(y_true=y_train, y_pred=y_pred_train_Lr)
ConfusionMatrixDisplay(confusion_matrix=Mc).plot()

In [None]:
# Print accuracy, precision, recall and F1 of the baseline logistic regression,
# each on the test split first and the training split second.
# Precision, recall and F1 are computed for the positive class (label 1).
for metric_label, metric_fn, metric_kwargs in (
    ('Accuracy', accuracy_score, {}),
    ('Precision', precision_score, {'pos_label': 1}),
    ('Recall', recall_score, {'pos_label': 1}),
    ('F1', f1_score, {'pos_label': 1}),
):
    print(f'{metric_label}_test: ', metric_fn(y_test, y_pred_test_Lr, **metric_kwargs))
    print(f'{metric_label}_train: ', metric_fn(y_train, y_pred_train_Lr, **metric_kwargs))

In [None]:
# Baseline random forest: fit, predict on both splits, plot the test confusion
# matrix and print the metrics.
# A fixed random_state is set because a random forest is stochastic; without
# it every run of the notebook reports different scores. The value matches the
# seed already used for the train/test split.
Rf = RandomForestClassifier(random_state=42)
Rf.fit(X_train, y_train)
y_pred_test_Rf = Rf.predict(X_test)
y_pred_train_Rf = Rf.predict(X_train)

Mc = confusion_matrix(y_test, y_pred_test_Rf)
ConfusionMatrixDisplay(Mc).plot()

# Test score first, training score second, for each metric.
# Precision, recall and F1 are computed for the positive class (label 1).
for metric_label, metric_fn, metric_kwargs in (
    ('Accuracy', accuracy_score, {}),
    ('Precision', precision_score, {'pos_label': 1}),
    ('Recall', recall_score, {'pos_label': 1}),
    ('F1', f1_score, {'pos_label': 1}),
):
    print(f'{metric_label}_test: ', metric_fn(y_test, y_pred_test_Rf, **metric_kwargs))
    print(f'{metric_label}_train: ', metric_fn(y_train, y_pred_train_Rf, **metric_kwargs))

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over random-forest hyper-parameters,
# scored by accuracy and run on all available cores. Rf is used as the
# estimator to tune.
params_Rf = dict(
    n_estimators=[100, 600, 1000],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
Gs = GridSearchCV(
    estimator=Rf,
    param_grid=params_Rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
Gs.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score in the random-forest search.
Gs.best_params_

In [None]:
# Random forest configured with the winning grid-search parameters.
# The parameters are read from Gs.best_params_ instead of being typed in by
# hand, so the tuned model cannot drift from what the search actually found
# when the notebook is re-run. random_state makes the fit reproducible.
Rf_tuned = RandomForestClassifier(**Gs.best_params_, random_state=42)

In [None]:
# Fit the tuned random forest and evaluate it on both splits.
# The predictions must come from Rf_tuned: the original cell called
# Rf.predict, so every number reported here described the untuned baseline
# model rather than the tuned one.
Rf_tuned.fit(X_train, y_train)
y_pred_test_Rf_t = Rf_tuned.predict(X_test)
y_pred_train_Rf_t = Rf_tuned.predict(X_train)

Mc = confusion_matrix(y_test, y_pred_test_Rf_t)
ConfusionMatrixDisplay(Mc).plot()

# Test score first, training score second, for each metric.
# Precision, recall and F1 are computed for the positive class (label 1).
for metric_label, metric_fn, metric_kwargs in (
    ('Accuracy', accuracy_score, {}),
    ('Precision', precision_score, {'pos_label': 1}),
    ('Recall', recall_score, {'pos_label': 1}),
    ('F1', f1_score, {'pos_label': 1}),
):
    print(f'{metric_label}_test: ', metric_fn(y_test, y_pred_test_Rf_t, **metric_kwargs))
    print(f'{metric_label}_train: ', metric_fn(y_train, y_pred_train_Rf_t, **metric_kwargs))

In [None]:
# 5-fold cross-validated search over logistic-regression penalty/solver pairs,
# scored by accuracy.
# The grid is written as a list of sub-grids so that only combinations the
# solvers actually support are tried. The original full cross-product included
# pairs such as 'l1' with 'lbfgs' or 'elasticnet' with 'liblinear', whose fits
# always fail and are scored as NaN with a flood of warnings.
params_Lr = [
    # These solvers support only the l2 penalty or no penalty.
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2', None]},
    # liblinear supports l1 and l2, but not an unpenalised fit.
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'solver': ['saga'], 'penalty': ['l1', 'l2', None]},
    # elasticnet is only available with saga and requires l1_ratio; without
    # it the fit raises, so the original grid never evaluated this option.
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
]
Gs_Lr = GridSearchCV(
    estimator=Lr,
    param_grid=params_Lr,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
Gs_Lr.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score in the logistic-regression search.
Gs_Lr.best_params_

In [None]:
# Logistic regression configured with the winning grid-search parameters.
# The parameters are read from Gs_Lr.best_params_ instead of being typed in by
# hand, so the tuned model always matches what the search found on this run.
Lr_tuned = LogisticRegression(**Gs_Lr.best_params_)

In [None]:
# Fit the tuned logistic regression, predict on both splits, plot the test
# confusion matrix and print the metrics.
Lr_tuned.fit(X_train, y_train)
y_pred_test_Lr_t = Lr_tuned.predict(X_test)
y_pred_train_Lr_t = Lr_tuned.predict(X_train)

Mc = confusion_matrix(y_test, y_pred_test_Lr_t)
ConfusionMatrixDisplay(Mc).plot()

# Test score first, training score second, for each metric.
# Precision, recall and F1 are computed for the positive class (label 1).
for metric_label, metric_fn, metric_kwargs in (
    ('Accuracy', accuracy_score, {}),
    ('precision', precision_score, {'pos_label': 1}),
    ('recall', recall_score, {'pos_label': 1}),
    ('f1', f1_score, {'pos_label': 1}),
):
    print(f'{metric_label}_test: ', metric_fn(y_test, y_pred_test_Lr_t, **metric_kwargs))
    print(f'{metric_label}_train: ', metric_fn(y_train, y_pred_train_Lr_t, **metric_kwargs))