### Task

- Load the data
- Drop the missing values
- Drop the duplicates
- Apply Feature selection
- Apply Label Encoder
- Divide the data into train test split
- Develop the DT algorithm with out and with hyper parameter tuning
- Save the model using Joblib
- Create a Flask end point and test on the post man
- Develop the stream lit application

**Import packages**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib 
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve

**Read the Dataset**

In [8]:
df=pd.read_csv(r'data_regression - data_regression.csv')
df

Unnamed: 0,year,customer_id,phone_no,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,2015,100198,409-8743,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0.0
1,2015,100643,340-5930,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0.0
2,2015,100756,372-3750,Female,65,126,no,no,87.30,11.9,9.89,91,1,4.0,5,1.0
3,2015,101595,331-4902,Female,24,131,no,yes,321.30,9.5,36.41,102,4,3.0,3,0.0
4,2015,101653,351-8398,Female,40,191,no,no,243.00,10.9,27.54,83,7,3.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2015,997132,385-7387,Female,54,75,no,yes,182.25,11.3,20.66,97,5,4.0,2,
1996,2015,998086,383-9255,Male,45,127,no,no,273.45,9.3,30.99,116,3,3.0,1,0.0
1997,2015,998474,353-2080,,53,94,no,no,128.85,15.6,14.60,110,16,5.0,0,0.0
1998,2015,998934,359-7788,Male,40,94,no,no,178.05,10.4,20.18,100,6,,3,0.0


**Cheak null and empty values**

In [11]:
df.isnull().sum()

year                       0
customer_id                0
phone_no                   0
gender                    24
age                        0
no_of_days_subscribed      0
multi_screen               0
mail_subscribed            0
weekly_mins_watched        0
minimum_daily_mins         0
maximum_daily_mins         0
weekly_max_night_mins      0
videos_watched             0
maximum_days_inactive     28
customer_support_calls     0
churn                     35
dtype: int64

**Drop values**

In [14]:
df.dropna(inplace=True)
df.reset_index(inplace=True)
df

Unnamed: 0,index,year,customer_id,phone_no,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,2015,100198,409-8743,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0.0
1,1,2015,100643,340-5930,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0.0
2,2,2015,100756,372-3750,Female,65,126,no,no,87.30,11.9,9.89,91,1,4.0,5,1.0
3,3,2015,101595,331-4902,Female,24,131,no,yes,321.30,9.5,36.41,102,4,3.0,3,0.0
4,4,2015,101653,351-8398,Female,40,191,no,no,243.00,10.9,27.54,83,7,3.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1990,2015,993714,364-1969,Male,32,61,no,no,67.50,9.8,7.65,94,6,3.0,2,0.0
1914,1991,2015,993815,387-5891,Male,49,50,yes,yes,460.65,8.0,52.21,109,3,3.0,0,0.0
1915,1992,2015,994954,329-3222,Female,42,119,no,yes,176.70,7.6,20.03,76,3,3.0,3,0.0
1916,1996,2015,998086,383-9255,Male,45,127,no,no,273.45,9.3,30.99,116,3,3.0,1,0.0


In [16]:
df.isnull().sum()

index                     0
year                      0
customer_id               0
phone_no                  0
gender                    0
age                       0
no_of_days_subscribed     0
multi_screen              0
mail_subscribed           0
weekly_mins_watched       0
minimum_daily_mins        0
maximum_daily_mins        0
weekly_max_night_mins     0
videos_watched            0
maximum_days_inactive     0
customer_support_calls    0
churn                     0
dtype: int64

In [18]:
# change to int 
df['churn']=df['churn'].astype(int)
df

Unnamed: 0,index,year,customer_id,phone_no,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,2015,100198,409-8743,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0
1,1,2015,100643,340-5930,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0
2,2,2015,100756,372-3750,Female,65,126,no,no,87.30,11.9,9.89,91,1,4.0,5,1
3,3,2015,101595,331-4902,Female,24,131,no,yes,321.30,9.5,36.41,102,4,3.0,3,0
4,4,2015,101653,351-8398,Female,40,191,no,no,243.00,10.9,27.54,83,7,3.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1990,2015,993714,364-1969,Male,32,61,no,no,67.50,9.8,7.65,94,6,3.0,2,0
1914,1991,2015,993815,387-5891,Male,49,50,yes,yes,460.65,8.0,52.21,109,3,3.0,0,0
1915,1992,2015,994954,329-3222,Female,42,119,no,yes,176.70,7.6,20.03,76,3,3.0,3,0
1916,1996,2015,998086,383-9255,Male,45,127,no,no,273.45,9.3,30.99,116,3,3.0,1,0


In [21]:
# drop coloumns 
df.columns

Index(['index', 'year', 'customer_id', 'phone_no', 'gender', 'age',
       'no_of_days_subscribed', 'multi_screen', 'mail_subscribed',
       'weekly_mins_watched', 'minimum_daily_mins', 'maximum_daily_mins',
       'weekly_max_night_mins', 'videos_watched', 'maximum_days_inactive',
       'customer_support_calls', 'churn'],
      dtype='object')

In [29]:
cols=['year','customer_id','phone_no']
df.drop(cols,inplace=True,axis=1)
df

Unnamed: 0,index,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,Female,36,62,no,no,148.35,12.2,16.81,82,1,4.0,1,0
1,1,Female,39,149,no,no,294.45,7.7,33.37,87,3,3.0,2,0
2,2,Female,65,126,no,no,87.30,11.9,9.89,91,1,4.0,5,1
3,3,Female,24,131,no,yes,321.30,9.5,36.41,102,4,3.0,3,0
4,4,Female,40,191,no,no,243.00,10.9,27.54,83,7,3.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1990,Male,32,61,no,no,67.50,9.8,7.65,94,6,3.0,2,0
1914,1991,Male,49,50,yes,yes,460.65,8.0,52.21,109,3,3.0,0,0
1915,1992,Female,42,119,no,yes,176.70,7.6,20.03,76,3,3.0,3,0
1916,1996,Male,45,127,no,no,273.45,9.3,30.99,116,3,3.0,1,0


**Label Encoder**

In [32]:
le=LabelEncoder()
for i in df.select_dtypes(include='object'):
    df[i]=le.fit_transform(df[i])
df

Unnamed: 0,index,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,churn
0,0,0,36,62,0,0,148.35,12.2,16.81,82,1,4.0,1,0
1,1,0,39,149,0,0,294.45,7.7,33.37,87,3,3.0,2,0
2,2,0,65,126,0,0,87.30,11.9,9.89,91,1,4.0,5,1
3,3,0,24,131,0,1,321.30,9.5,36.41,102,4,3.0,3,0
4,4,0,40,191,0,0,243.00,10.9,27.54,83,7,3.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1990,1,32,61,0,0,67.50,9.8,7.65,94,6,3.0,2,0
1914,1991,1,49,50,1,1,460.65,8.0,52.21,109,3,3.0,0,0
1915,1992,0,42,119,0,1,176.70,7.6,20.03,76,3,3.0,3,0
1916,1996,1,45,127,0,0,273.45,9.3,30.99,116,3,3.0,1,0


**Divide the data into X and y**

In [36]:
X=df.drop('churn',axis=1)
y=df['churn']

In [38]:
X

Unnamed: 0,index,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls
0,0,0,36,62,0,0,148.35,12.2,16.81,82,1,4.0,1
1,1,0,39,149,0,0,294.45,7.7,33.37,87,3,3.0,2
2,2,0,65,126,0,0,87.30,11.9,9.89,91,1,4.0,5
3,3,0,24,131,0,1,321.30,9.5,36.41,102,4,3.0,3
4,4,0,40,191,0,0,243.00,10.9,27.54,83,7,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1990,1,32,61,0,0,67.50,9.8,7.65,94,6,3.0,2
1914,1991,1,49,50,1,1,460.65,8.0,52.21,109,3,3.0,0
1915,1992,0,42,119,0,1,176.70,7.6,20.03,76,3,3.0,3
1916,1996,1,45,127,0,0,273.45,9.3,30.99,116,3,3.0,1


In [40]:
y

0       0
1       0
2       1
3       0
4       0
       ..
1913    0
1914    0
1915    0
1916    0
1917    1
Name: churn, Length: 1918, dtype: int64

**Feature Selection**

- VarianceThreshold

In [44]:
vt=VarianceThreshold(threshold=0)
vt.fit(X)
vt.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [48]:
vt.variances_

array([1.99900000e+03, 2.48955072e-01, 6.40000000e+01, 2.42000000e+02,
       9.09163068e-02, 2.03857914e-01, 5.26200000e+02, 7.67553942e+00,
       5.96400000e+01, 1.33000000e+02, 6.13611133e+00, 6.49040265e-01,
       1.69055711e+00])

In [52]:
cols=vt.get_feature_names_out()
X[cols]

Unnamed: 0,index,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls
0,0,0,36,62,0,0,148.35,12.2,16.81,82,1,4.0,1
1,1,0,39,149,0,0,294.45,7.7,33.37,87,3,3.0,2
2,2,0,65,126,0,0,87.30,11.9,9.89,91,1,4.0,5
3,3,0,24,131,0,1,321.30,9.5,36.41,102,4,3.0,3
4,4,0,40,191,0,0,243.00,10.9,27.54,83,7,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,1990,1,32,61,0,0,67.50,9.8,7.65,94,6,3.0,2
1914,1991,1,49,50,1,1,460.65,8.0,52.21,109,3,3.0,0
1915,1992,0,42,119,0,1,176.70,7.6,20.03,76,3,3.0,3
1916,1996,1,45,127,0,0,273.45,9.3,30.99,116,3,3.0,1


**Split the data into train and test**

In [57]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1534, 13)
(384, 13)
(1534,)
(384,)


**Implement the model**

In [62]:
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)

**Make predictions**

In [65]:
y_pred=dt.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,

**New Dataframe**

In [68]:
df1=pd.DataFrame(X_test)
df1['y']=y
df1['y_prediction']=y_pred
df1

Unnamed: 0,index,gender,age,no_of_days_subscribed,multi_screen,mail_subscribed,weekly_mins_watched,minimum_daily_mins,maximum_daily_mins,weekly_max_night_mins,videos_watched,maximum_days_inactive,customer_support_calls,y,y_prediction
361,407,0,36,127,0,0,399.90,5.9,45.32,119,2,2.0,1,1,0
1550,1620,1,37,107,0,1,171.45,7.5,19.43,74,8,3.0,1,0,0
1246,1311,0,22,140,0,0,180.45,7.7,20.45,74,3,3.0,4,1,1
1799,1875,1,30,66,0,0,131.40,9.2,14.89,125,5,3.0,1,0,0
427,476,0,30,134,0,1,214.35,10.8,24.29,96,6,3.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,234,0,34,75,0,0,221.25,16.4,25.08,68,3,5.0,2,0,0
918,983,0,42,12,0,0,374.40,11.8,42.43,90,3,4.0,1,1,0
219,253,1,30,139,0,0,207.15,8.3,23.48,63,2,3.0,1,0,0
516,570,1,26,62,1,0,239.55,13.9,27.15,105,6,4.0,0,1,1


**model evalution**

In [71]:
acc_dt=round(accuracy_score(y_test,y_pred),2)
pr_dt=round(precision_score(y_test,y_pred),2)
rc_dt=round(recall_score(y_test,y_pred),2)
f1_dt=round(f1_score(y_test,y_pred),2)

In [73]:
print('accuracy score',acc_dt)
print('precision score',pr_dt)
print('recall score',rc_dt)
print('f1 score',f1_dt)

accuracy score 0.9
precision score 0.52
recall score 0.67
f1 score 0.58


**Save the model**

In [78]:
joblib.dump(dt,open('Decision_tree_model.pkl','wb'))