In [None]:
from PIL import Image

# Render the twenty numbered JPEG files (/1.jpg ... /20.jpg) inline, in order.
for page_no in range(1, 21):
    display(Image.open(f'/{page_no}.jpg', mode='r'))

# Code

In [None]:
import pandas as pd 
pd.set_option('display.max_columns', None) 
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go
!pip install pySankey
from pySankey.sankey import sankey
from scipy.stats.contingency import association
from statsmodels.stats import proportion 
from scipy.stats import chi2_contingency
import shap
from category_encoders. count import CountEncoder
from sklearn.feature_selection import mutual_info_classif,SelectKBest
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler, PowerTransformer, KBinsDiscretizer, StandardScaler
from category_encoders. count import CountEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

In [None]:
# Read the Telco churn export into the working frame used by the EDA cells.
df = pd.read_csv('/kaggle/input/telco-customer-churn-11-1-3/telco.csv')

# Bin tenure into 12-month bands labelled 1..6 (pd.cut intervals are right-closed,
# so months 1-12 map to 1, 13-24 to 2, and so on).
year_edges = range(0,73,12)
df['Tenure in Years'] = pd.cut(x=df['Tenure in Months'], bins=year_edges, labels=range(1,7,))

# Replace missing values with explicit placeholder categories.
placeholder_by_columns = (
    (['Churn Category', 'Churn Reason'], 'not churned'),
    (['Offer','Internet Type'], 'None'),
)
for column_group, placeholder in placeholder_by_columns:
    df[column_group] = df[column_group].fillna(placeholder)

print(df.shape)
df.head()

In [None]:
# Target column and the geographic columns that are kept out of the feature list.
target = 'Churn Label'
zip_lat_long = ['Zip Code','Latitude','Longitude']

# Every remaining column is a feature; split them by dtype.
excluded_columns = {target, *zip_lat_long}
features = [col for col in df.columns if col not in excluded_columns]
feature_frame = df[features]
numeric_features = feature_frame.select_dtypes(include=np.number).columns.tolist()
categoric_features = feature_frame.select_dtypes(exclude=np.number).columns.tolist()

print('target variable :', f'"{target}"')
# The reported total counts the geographic columns as well.
print('number of features: ', len(features) + len(zip_lat_long))

## Churn Proportion

In [None]:
# Count customers per churn label and attach each label's share of the total.
label_counts = df['Churn Label'].value_counts()
bar_churn = label_counts.to_frame().reset_index().set_axis(['is Churn','Count'], axis=1)
n_customers = bar_churn['Count'].sum()
bar_churn['Percentage'] = [str(round((n/n_customers)*100,2))+' %' for n in bar_churn['Count']]

# Palette notes: dc7949, 85999e, 3A4C6A
# Each bar is annotated with its count and, on a second line, its percentage.
bar_labels = [
    '{} <br>({})'.format(count, share)
    for count, share in zip(bar_churn['Count'], bar_churn['Percentage'])
]
fig = px.bar(
    bar_churn,
    x='is Churn',
    y='Count',
    color='is Churn',
    color_discrete_map={'Yes':'#dc7949','No':'#3A4C6A'},
    text=bar_labels,
    text_auto=False,
)
fig.update_layout(
    autosize=False,
    width=350,
    height=500,
    margin={'l':0,'r':0,'b':0,'t':50,'pad':0},
    plot_bgcolor='white',
    showlegend=False,
)
fig.update_traces(textfont_size=13, textangle=0, cliponaxis=False, textposition="outside")
fig.show()

## Churn Reason

In [None]:
# Customers who churned; this frame is reused by later cells.
df_churn = df[df['Churn Label'] == 'Yes'].reset_index(drop=True)

# Count churners per (reason, satisfaction score), ignoring the "Don't know" reason.
known_reasons = df_churn[~df_churn['Churn Reason'].isin(["Don't know"])]
churn_reason = (
    known_reasons
    .groupby(['Churn Reason','Satisfaction Score'], as_index=False)
    .count()
    .iloc[:,:3]  # the two group keys plus the first remaining column (renamed to 'Count' below)
    .rename(columns={'Customer ID':'Count'})
)

# Append a zero-count row for score 3 of 'Competitor had better devices'
# (presumably so that this reason/score pair is present in the chart data - TODO confirm).
placeholder_row = pd.DataFrame(
    {'Churn Reason':'Competitor had better devices','Satisfaction Score':3,'Count':0,'total churn reason':0},
    index=[0],
)
churn_reason = pd.concat([churn_reason, placeholder_row])

# Attach the overall number of churners per reason and order by it (largest first).
reason_totals = df_churn['Churn Reason'].value_counts().to_dict()
churn_reason['total churn reason'] = churn_reason['Churn Reason'].map(reason_totals)
churn_reason = churn_reason.sort_values(by=['total churn reason','Satisfaction Score'], ascending=[False,True])
churn_reason['Satisfaction Score'] = churn_reason['Satisfaction Score'].astype(str)
churn_reason = churn_reason.reset_index(drop=True)

# Keep the reasons whose total is among the six largest distinct totals.
top_totals = pd.Series(churn_reason['total churn reason'].unique()).head(6).to_list()
churn_reason = churn_reason[churn_reason['total churn reason'].isin(top_totals)]

fig = px.bar(
    churn_reason,
    y="Churn Reason",
    x="Count",
    color="Satisfaction Score",
    color_discrete_sequence=['#DC7949','#E8A686','#F3D2C2'],
)
fig.update_layout(
    autosize=False,
    width=900,
    height=500,
    margin={'l':0,'r':0,'b':0,'t':0,'pad':0},
    plot_bgcolor='white',
    showlegend=True,
    yaxis={'categoryorder':'total ascending'},
    legend={'yanchor':"top", 'y':0.35, 'xanchor':"left", 'x':0.5},
)
fig.show()


## Churn Area

In [None]:
# One row per zip code with its location, plus churned / retained customer counts.
zip_info = df[['Zip Code','City','Population','Longitude','Latitude']].drop_duplicates()
churned_per_zip = (
    df_churn.groupby(by='Zip Code', as_index=False)['Churn Label'].count()
    .rename(columns={'Churn Label':'Churn'})
    .drop_duplicates()
)
retained_per_zip = (
    df[df['Churn Label']=='No'].groupby(by='Zip Code', as_index=False)['Churn Label'].count()
    .rename(columns={'Churn Label':'Not Churn'})
)
# Outer joins keep zip codes that appear on only one side; missing counts become 0.
df_map = (
    zip_info
    .merge(churned_per_zip, on='Zip Code', how='outer')
    .merge(retained_per_zip, on='Zip Code', how='outer')
    .fillna(0)
)

# Ten cities with the most churned customers.
# NOTE: the column is named 'Churn Rate' but holds summed churn counts, as the table header says.
df_map_table = (
    df_map.groupby(by='City', as_index=False)['Churn'].sum()
    .sort_values(by='Churn', ascending=False, ignore_index=True)
    .rename(columns={'Churn':'Churn Rate'})
    .head(10)
)

table_header = dict(
    values=['City', 'Churn Count'],
    fill_color='#3A4C6A',
    align='left',
    font=dict(color='white', size=15),
    height=40,
    line_color='#3A4C6A',
)
table_cells = dict(
    values=[df_map_table[col] for col in df_map_table.columns],
    fill_color='#f8f9fa',
    align='left',
    font=dict(color='#3A4C6A', size=15),
    height=40,
    line_color='#3A4C6A',
)
fig = go.Figure(data=[go.Table(columnwidth=[60,40], header=table_header, cells=table_cells)])

fig.update_layout(
    autosize=False,
    width=300,
    height=440,
    margin={'l':0,'r':0,'b':0,'t':0,'pad':0},
)
fig.show()

## Map

In [None]:
# Density heatmap of churn counts per zip-code location.
# Alternative basemaps tried: 'white-bg', 'carto-darkmatter', 'carto-positron'.
heatmap_options = dict(
    lat='Latitude',
    lon='Longitude',
    z='Churn',
    radius=8,
    zoom=4.4,
    color_continuous_scale='rainbow',
    mapbox_style='open-street-map',
)
fig = px.density_mapbox(df_map, **heatmap_options)
fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    margin={'l':0,'r':0,'b':0,'t':0,'pad':0},
    plot_bgcolor='white',
)
fig.show()

## Telco Main Services
- Phone Service: Indicates if the customer subscribes to home phone service with the company: Yes, No
- Multiple Lines: Indicates if the customer subscribes to multiple telephone lines with the company: Yes, No
- Internet Service: Indicates if the customer subscribes to Internet service with the company: No, DSL, Fiber Optic, Cable

In [None]:
# Preview the service and churn columns discussed in this section.
service_overview_cols = [
    'Offer','Phone Service','Multiple Lines','Internet Service',
    'Internet Type','Churn Category','Churn Reason','Churn Label',
]
df[service_overview_cols].head()

## Cramer's V

In [None]:
# Helper that builds the contingency table and the table of expected frequencies.
def contigency_and_expected_value_table(series_1, series_2):
    """Cross-tabulate two categorical series and derive the chi-square inputs.

    Parameters
    ----------
    series_1 : pandas.Series
        Variable placed on the rows of the contingency table.
    series_2 : pandas.Series
        Variable placed on the columns of the contingency table.

    Returns
    -------
    tuple
        ``(contingency, expected, dof)`` where ``contingency`` is the observed
        table including the 'All' margins, ``expected`` holds
        ``row_total * column_total / grand_total`` for every observed cell, and
        ``dof`` is ``(rows - 1) * (columns - 1)``.
    """
    df_i = pd.crosstab(series_1, series_2, margins=True)

    # Read the margins positionally: the last row / column is the 'All' margin.
    row_totals = df_i.iloc[:-1, -1].to_numpy()
    col_totals = df_i.iloc[-1, :-1].to_numpy()
    grand_total = df_i.iloc[-1, -1]

    # The shape comes from the crosstab itself rather than from nunique(): a
    # category that crosstab leaves out (e.g. because the two series do not
    # share all index labels) would otherwise make the reshape fail.
    expected_val = pd.DataFrame(
        (row_totals[:, None] * col_totals[None, :]) / grand_total,
        index=df_i.index[:-1],
        columns=df_i.columns[:-1],
    )
    degree_of_freedom = (len(row_totals) - 1) * (len(col_totals) - 1)
    return df_i, expected_val, degree_of_freedom

def chi_visualize(series_x, series_y=None,width=300,height=450):
    """Show, per category of ``series_x``, the share of churned vs. retained customers.

    Parameters
    ----------
    series_x : pandas.Series
        Categorical variable to break down; its name is used as the axis group label.
    series_y : pandas.Series, optional
        Churn indicator holding 'Yes' / 'No'. When omitted, the current global
        ``df['Churn Label']`` is used.
    width, height : int
        Figure size in pixels.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    if series_y is None:
        # Resolved at call time: a default written as `df['Churn Label']` in the
        # signature would be frozen when the function is defined and would go
        # stale as soon as `df` is reloaded.
        series_y = df['Churn Label']

    # Row-normalised crosstab: each row holds the 'Yes' / 'No' shares of one category.
    con_table = pd.crosstab(series_x, series_y, margins=False, normalize='index')
    categories = con_table.index.to_list()[::-1]
    # Two-level x axis: the variable name on the outer level, its categories on the inner one.
    x = [[con_table.index.name] * len(categories), categories]

    fig = go.Figure()
    for label, bar_color, trace_name in (
        ('Yes', '#DC7949', 'Churn'),
        ('No', '#3A4C6A', 'Not Churn'),
    ):
        shares = con_table[label].to_list()[::-1]
        fig.add_bar(
            x=x,
            y=shares,
            marker_color=bar_color,
            text=[f'{np.round(share*100,2)}%' for share in shares],
            name=trace_name,
        )
    fig.update_layout(
        barmode="relative",
        autosize=False,
        width=width,
        height=height,
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        plot_bgcolor='white',
        showlegend=False,
        yaxis=dict(visible=False),
        font=dict(size=17),
    )
    fig.update_traces(textfont_size=17, textangle=0, cliponaxis=False, textposition="inside", textfont_color='white')
    fig.show()

In [None]:
# Churn share per category for each main service, one chart per service.
main_services = ('Phone Service','Multiple Lines','Internet Service')
for service in main_services:
    chi_visualize(df[service])

In [None]:
# Cramer's V between each main service and the churn label.
association_var =['Phone Service','Multiple Lines','Internet Service']
dict_frame = {}
for service in association_var:
    service_vs_churn = pd.crosstab(df[service], df['Churn Label'])
    cramers_v = association(service_vs_churn, method="cramer")
    print(f'{service} :',cramers_v)

### Internet type vs churn

In [None]:
# Customers with an internet subscription (Internet Type other than the 'None' placeholder).
df_internet_type = df.loc[df['Internet Type'] !='None'].reset_index(drop=True)
contigency_table = contigency_and_expected_value_table(
    df_internet_type['Internet Type'],
    df_internet_type['Churn Label'],
)
# Show the observed table (with margins) and the expected frequencies.
for table in contigency_table[:2]:
    display(table)

# Chi-square test on the observed counts only, with the 'All' margins removed.
observed_counts = contigency_table[0].drop(columns=['All'], index=['All'])
result = chi2_contingency(observed_counts)
print(result.pvalue)
print('Fail to reject H0' if result.pvalue > 0.05 else 'Reject H0')

In [None]:
# Churn share per internet type, internet subscribers only.
chi_visualize(
    series_x=df_internet_type['Internet Type'],
    series_y=df_internet_type['Churn Label'],
)

## Internet Services Rating 

In [None]:
# --- data: customer counts per (internet type, satisfaction score) ---
internet_type = (
    df.groupby(by=['Internet Type', 'Satisfaction Score'], as_index=False)
    .count()
    .iloc[:,:3]  # the two group keys plus the first remaining column (renamed to 'Count' below)
    .rename(columns={'Customer ID':'Count'})
)
internet_type['Internet Type'] = internet_type['Internet Type'].replace({'None':'Not Subscriber'})
internet_type_list = internet_type['Internet Type'].unique()


def _rating_share(score, i_type):
    """Percentage (one decimal) of the customers of `i_type` who gave `score`."""
    same_type = internet_type[internet_type['Internet Type']==i_type]
    count_var = same_type[same_type['Satisfaction Score']==score]['Count'].values[0]
    return np.round((count_var / same_type['Count'].sum())*100,1)


# score -> list of percentages, ordered like internet_type_list
percent_dict = {
    num: [_rating_share(num, i_type) for i_type in internet_type_list]
    for num in range(1,6)
}

# --- viz: one bar trace per satisfaction score ---
color_rating = ['#3A4C6A','#617088','#8994A6', '#B0B7C3','#D8DBE1']


def internet_viz(score):
    """Return the 'Count' values of every row with the given satisfaction score, in table order."""
    has_score = internet_type['Satisfaction Score']==score
    return internet_type.loc[has_score, 'Count'].to_list()


rating_traces = [
    go.Bar(
        name=f'{rating}',
        x=internet_type_list,
        y=internet_viz(rating),
        text=[f'{share} %' for share in percent_dict[rating]],
        marker_color=shade,
    )
    for rating, shade in zip(range(1,6), color_rating)
]
fig = go.Figure(data=rating_traces)
fig.update_layout(
    barmode='group',
    autosize=False,
    width=1200,
    height=500,
    margin={'l':0,'r':0,'b':0,'t':10,'pad':0},
    plot_bgcolor='white',
    showlegend=True,
    yaxis={'categoryorder':'total ascending'},
    legend={'title':'Customer Rating: ', 'yanchor':"top", 'y':0.98, 'xanchor':"left", 'x':0.03},
)

fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False, textposition="outside", textfont_color='black')
fig.show()

## Phone Services Rating

In [None]:
# Derive a three-level phone plan label:
#   Phone Service == 'Yes' and Multiple Lines == 'No'  -> 'Single Line'
#   Phone Service == 'Yes' and Multiple Lines == 'Yes' -> 'Multi Lines'
#   anything else                                      -> 'Not Subscriber'
has_phone = df['Phone Service'] == 'Yes'
df['Phone Service 2'] = 'Not Subscriber'
df.loc[has_phone & (df['Multiple Lines'] == 'No'), 'Phone Service 2'] = 'Single Line'
df.loc[has_phone & (df['Multiple Lines'] == 'Yes'), 'Phone Service 2'] = 'Multi Lines'

# Customer counts per (phone plan, rating); the first non-key column supplies the count.
ps = (
    df.groupby(by=['Phone Service 2', 'Satisfaction Score'], as_index=False)
    .count()
    .iloc[:,:3]
    .rename(columns={'Phone Service 2':'Phone Service','Customer ID':'Count','Satisfaction Score':'Customer Rating'})
)
# Fix the plan order used on the x axis.
plan_order = ['Single Line','Multi Lines','Not Subscriber']
ps = pd.concat([ps[ps['Phone Service']==plan] for plan in plan_order], ignore_index=True)

ps_list = ps['Phone Service'].unique()
color_rating = ['#3A4C6A','#617088','#8994A6', '#B0B7C3','#D8DBE1']

# rating -> percentage of each plan's customers who gave that rating (ordered like ps_list)
percent_dict = {}
for num in range(1,6):
    percent_list = []
    for i_type in ps_list:
        same_plan = ps[ps['Phone Service']==i_type]
        count_var = same_plan[same_plan['Customer Rating']==num]['Count'].values[0]
        percent_list.append(np.round((count_var / same_plan['Count'].sum())*100,1))
    percent_dict[num] = percent_list


def ps_viz(score):
    """Return the 'Count' values of every row with the given customer rating, in table order."""
    return ps[ps['Customer Rating']==score]['Count'].to_list()


fig = go.Figure(data=[
    go.Bar(
        name=f'{rating}',
        x=ps_list,
        y=ps_viz(rating),
        # a distinct loop name avoids shadowing the rating inside the label comprehension
        text=[f'{share} %' for share in percent_dict[rating]],
        marker_color=color,
    )
    for rating, color in zip(range(1,6), color_rating)
])
# Grouped bars: one group per phone plan, one bar per rating.
fig.update_layout(barmode='group')
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    margin=dict(l=0,r=0,b=0,t=10,pad=0),
    plot_bgcolor='white',
    showlegend=True,
    yaxis={'categoryorder':'total ascending'},
    legend=dict(title='Customer Rating: ',yanchor="top",y=0.98,xanchor="left",x=0.85),
)

fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False, textposition="outside", textfont_color='black')
fig.show()

## Internet Service Options Analysis
- Online Security: Indicates if the customer subscribes to an additional online security service provided by the company: Yes, No
- Online Backup: Indicates if the customer subscribes to an additional online backup service provided by the company: Yes, No
- Device Protection Plan: Indicates if the customer subscribes to an additional device protection plan for their Internet equipment provided by the company: Yes, No
- Premium Tech Support: Indicates if the customer subscribes to an additional technical support plan from the company with reduced wait times: Yes, No
- Unlimited Data: Indicates if the customer has paid an additional monthly fee to have unlimited data downloads/uploads: Yes, No

In [None]:
# Add-on services vs. churn as a Sankey diagram.
# The columns are taken from the already-loaded `df` instead of re-reading the CSV.
option_cols = ['Online Security','Online Backup','Device Protection Plan','Premium Tech Support','Unlimited Data']
additional = df[option_cols + ['Churn Label']].copy()

# Replace 'Yes' with the service name so each subscription becomes a labelled flow.
for option in option_cols:
    additional[option] = additional[option].replace(to_replace='Yes', value=option)

# One (Label, Churn Label) pair per customer and service, skipping rows marked 'No'.
flows = [
    additional.loc[additional[option] != 'No', [option, 'Churn Label']].rename(columns={option: 'Label'})
    for option in option_cols
]

sankey_data = pd.concat(flows, axis=0, ignore_index=True)
sankey_data['Churn Label'] = sankey_data['Churn Label'].map({'No':'Not Churn','Yes':'Churn'})

colorDict = {
    'Online Security':'#9ca9b0',
    'Online Backup':'#e4e4e3',
    'Device Protection Plan':'#df7a48',
    'Premium Tech Support':'#284256',
    'Unlimited Data':'#e4cec1',
    'Churn':'#df3b57',
    'Not Churn':'#12562a'
}
sankey(sankey_data['Label'], sankey_data['Churn Label'] ,colorDict=colorDict,fontsize=10, aspect=100)

In [None]:
# Pairwise Cramer's V between the add-on services, satisfaction score and churn label.
association_var =['Unlimited Data','Premium Tech Support','Device Protection Plan', 'Online Backup','Online Security','Satisfaction Score','Churn Label']


def _pair_cramers_v(first, second):
    """Cramer's V between two columns of `df`, rounded to two decimals."""
    return np.round(association(pd.crosstab(df[first], df[second]), method="cramer"), 2)


# One column per variable; rows follow the same variable order.
dict_frame = pd.DataFrame({
    col: pd.Series([_pair_cramers_v(col, other) for other in association_var], index=association_var)
    for col in association_var
})
dict_frame

In [None]:
# Heatmap of the association matrix with the values printed in each cell.
heatmap_scale = ['#284256','white','#df7a48']  # alternative considered: 'Cividis'
fig = px.imshow(dict_frame, text_auto=True, aspect="equal", color_continuous_scale=heatmap_scale)
fig.update_layout(
    autosize=False,
    width=700,
    height=600,
    margin={'l':0,'r':0,'b':0,'t':0,'pad':0},
    plot_bgcolor='white',
)
fig.update_traces(textfont_size=17)
fig.show()

In [None]:
def _score_breakdown(option):
    """Count the subscribers of `option` per satisfaction score and add a formatted share column."""
    group = df[df[option]=='Yes'].groupby(by=['Satisfaction Score'])[[option]].count()
    subscribers = group[option].sum()
    group[f'{option}_Percent'] = [f'{np.round((k/subscribers)*100,1)} %' for k in group[option]]
    return group


# Side-by-side breakdowns for every add-on service, one row per satisfaction score.
service_options = ['Unlimited Data', 'Device Protection Plan', 'Online Backup', 'Online Security', 'Premium Tech Support']
serv_ops = pd.concat([_score_breakdown(option) for option in service_options], axis=1).reset_index()
serv_ops

In [None]:
# Grouped bars: subscribers of each add-on service split by customer rating.
# `service_cols` replaces the former `vars` name, which shadowed the Python builtin.
service_cols = ['Unlimited Data', 'Device Protection Plan', 'Online Backup', 'Online Security', 'Premium Tech Support']
percent_cols = [f'{col}_Percent' for col in service_cols]
color_rating = ['#3A4C6A','#617088','#8994A6', '#B0B7C3','#D8DBE1']

# One trace per row of serv_ops (one row per satisfaction score); the trace name is the
# score stored in that row, so labels stay correct even if a score is absent from the data.
fig = go.Figure(data=[
    go.Bar(
        name=str(score),
        x=service_cols,
        y=serv_ops[service_cols].iloc[row].tolist(),
        marker_color=shade,
        text=serv_ops[percent_cols].iloc[row].tolist(),
    )
    for row, (score, shade) in enumerate(zip(serv_ops['Satisfaction Score'], color_rating))
])

# Change the bar mode
fig.update_layout(barmode='group')
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    margin=dict(l=0, r=0, b=0, t=10, pad=0),
    plot_bgcolor='white',
    showlegend=True,
    yaxis={'categoryorder':'total ascending'},
    legend=dict(
        title='Customer Rating: ',
        yanchor="top",
        y=0.88,
        xanchor="left",
        x=0.83
    ),
)
fig.update_traces(textfont_size=12, textangle=0, cliponaxis=False, textposition="outside", textfont_color='black')
fig.show()

In [None]:
# For each add-on service: churn share per category, then the value counts among
# rows whose 'Internet Service' is 'Yes'.
internet_addons = ('Unlimited Data', 'Device Protection Plan', 'Online Backup', 'Online Security', 'Premium Tech Support')
for addon in internet_addons:
    chi_visualize(df[addon],df['Churn Label'])
    has_internet = df['Internet Service']=='Yes'
    display(df.loc[has_internet, addon].value_counts())

## Contract

In [None]:
# Churn share per contract type (internet subscribers only), drawn 450 px wide.
chi_visualize(
    series_x=df_internet_type['Contract'],
    series_y=df_internet_type['Churn Label'],
    width=450,
)

In [None]:
# Sankey of contract type -> churn outcome.
colorDict = {
    'Two Year': '#9ca9b0',
    'One Year': '#df7a48',
    'Month-to-Month': '#284256',
    'Churn': '#df3b57',
    'Not Churn': '#12562a',
}
churn_outcome = df['Churn Label'].replace({'Yes':'Churn','No':'Not Churn'})
sankey(df['Contract'], churn_outcome, colorDict=colorDict, fontsize=10, aspect=100)

## Offering

In [None]:

# Churn share per marketing offer (internet subscribers only), drawn 450 x 450 px.
chi_visualize(
    series_x=df_internet_type['Offer'],
    series_y=df_internet_type['Churn Label'],
    width=450,
    height=450,
)

In [None]:
# Sankey of accepted offer -> churn outcome, leaving out customers without an offer.
colorDict = {
    'Offer E': '#9ca9b0',
    'Offer D': '#e4e4e3',
    'Offer C': '#df7a48',
    'Offer B': '#284256',
    'Offer A': '#e4cec1',
    'None': 'black',
    'Churn': '#df3b57',
    'Not Churn': '#12562a',
}

offer = df.loc[df['Offer']!='None'].reset_index(drop=True)
offer_outcome = offer['Churn Label'].replace({'Yes':'Churn','No':'Not Churn'})
sankey(offer['Offer'], offer_outcome, colorDict=colorDict, fontsize=10, aspect=100)

## Total Revenue Loss

In [None]:
# Total revenue per churn outcome and its share of the overall revenue.
revenue = (
    df.groupby(by='Churn Label', as_index=False)['Total Revenue'].sum()
    .replace({'No':'Not Churn','Yes':'Churn'})
)
overall_revenue = revenue['Total Revenue'].sum()
revenue['Percent'] = [np.round(amount / overall_revenue*100,2) for amount in revenue['Total Revenue']]

fig = px.pie(
    revenue,
    values='Total Revenue',
    names='Churn Label',
    color_discrete_sequence=['#3A4C6A','#df7a48'],
)
fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    margin={'l':5,'r':5,'b':5,'t':5,'pad':0},
    plot_bgcolor='white',
    showlegend=False,
)
# Slice labels combine the percentage with the rounded dollar amount.
dollar_labels = [f'{int(np.round(amount)):,} $' for amount in revenue['Total Revenue']]
fig.update_traces(textfont_size=20, text=dollar_labels, textinfo='percent+text')
fig.show()

# Modeling

In [None]:
# Reload a fresh copy of the data for modeling.
# The same dataset path as the EDA load is used; the previous machine-specific
# '/Users/...' path made this cell fail wherever the EDA cells run.
df = pd.read_csv('/kaggle/input/telco-customer-churn-11-1-3/telco.csv')

# 12-month tenure bands labelled 1..6, stored as plain integers for the numeric pipeline.
df['Tenure in Years'] = pd.cut(x=df['Tenure in Months'], bins=range(0,73,12), labels=range(1,7,))

df[['Churn Category', 'Churn Reason']] = df[['Churn Category', 'Churn Reason']].fillna('not churned')
df['Zip Code'] = df['Zip Code'].astype(str)
df['Tenure in Years'] = df['Tenure in Years'].astype(int)

# Replace the high-cardinality location columns by their occurrence counts.
# NOTE(review): the encoder is fitted on the full frame, i.e. before any train/test split.
count_encoder = CountEncoder()
df[['City','Zip Code']] = count_encoder.fit_transform(df[['City','Zip Code']])
print(df.shape)
df.head()

In [None]:
# Target column and the columns left out of the model inputs.
target = 'Churn Label'
var_not_used = ['Customer ID','Country','State','Latitude','Longitude','Quarter','Customer Status','Churn Score','CLTV','Churn Category','Churn Reason']

# Every other column is a feature; split them by dtype.
dropped_columns = set(var_not_used) | {target}
features = [col for col in df.columns if col not in dropped_columns]
feature_frame = df[features]
numeric_features = feature_frame.select_dtypes(include=np.number).columns.tolist()
categoric_features = feature_frame.select_dtypes(exclude=np.number).columns.tolist()

# Yes/No columns (ordinal-encoded downstream) and multi-category columns (one-hot encoded downstream).
ordinal = [
    'Referred a Friend','Phone Service','Multiple Lines','Internet Service',
    'Online Security','Online Backup','Device Protection Plan','Premium Tech Support',
    'Streaming TV','Streaming Movies','Streaming Music','Unlimited Data',
]
nominal = [
    'Gender', 'Under 30', 'Senior Citizen','Married','Dependents',
    'Offer', 'Internet Type','Contract','Paperless Billing','Payment Method',
]

## Preprocessing

In [None]:
# Yes/No columns: encode 'No' -> 0 and 'Yes' -> 1, then min-max scale.
yes_no_order = [['No','Yes']] * len(ordinal)
ordinal_pipe = make_pipeline(OrdinalEncoder(categories=yes_no_order), MinMaxScaler())

# Multi-category columns: one-hot encode, dropping the first level of each column.
nominal_pipe = make_pipeline(OneHotEncoder(drop='first'))

# Numeric columns: Yeo-Johnson power transform, then min-max scale.
numeric_pipe = make_pipeline(PowerTransformer(method='yeo-johnson'), MinMaxScaler())

# The transformer names become the prefixes of the output feature names.
column_steps = [
    ('ordinal_pipe', ordinal_pipe, ordinal),
    ('nominal_pipe', nominal_pipe, nominal),
    ('numeric_pipe', numeric_pipe, numeric_features),
]
preprocessing = ColumnTransformer(transformers=column_steps)

## Cross Validation

In [None]:
# Dataset splitting: stratified 85 / 15 hold-out with a fixed seed.
# `map` encodes the target as integers directly; `replace` only produced integers through
# pandas' silent downcasting, which is deprecated.
y = df[target].map({'Yes':1,'No':0})
X_train, X_test, y_train, y_test = train_test_split(df[features], y, test_size=0.15, shuffle=True,random_state=2024, stratify=y)
print(X_train.shape, X_test.shape)

In [None]:
# Candidate classifiers compared by 5-fold cross-validated ROC AUC.
models = {
    'RF': RandomForestClassifier(random_state=2024),
    'XGB': XGBClassifier(),
    'KNN': KNeighborsClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=2024),
}


def _mean_cv_scores(name, model):
    """Cross-validate `model` behind the shared preprocessing and return the mean of each metric as a one-column frame."""
    final_pipeline = make_pipeline(preprocessing, model)
    cv = cross_validate(final_pipeline, X_train, y_train, cv=5, return_train_score=True, scoring='roc_auc')
    return pd.DataFrame(cv).mean().to_frame().set_axis([name],axis=1)


# One single-column frame per model, in dictionary order.
result = [_mean_cv_scores(name, model) for name, model in models.items()]

In [None]:
# Combine the per-model columns and skip the first two rows (the timing entries
# reported by cross_validate), leaving the test and train scores.
all_cv_means = pd.concat(result, axis=1)
score_result = all_cv_means.iloc[2:]
score_result

## Model Evaluation

In [None]:
# Fit the CatBoost pipeline on the training split and score the hold-out set.
cat = make_pipeline(preprocessing, CatBoostClassifier(verbose=0, random_seed=2024))
cat.fit(X_train,y_train)
result = pd.DataFrame({
    'actual': y_test,
    'pred': cat.predict(X_test),
    'pred_proba': cat.predict_proba(X_test)[:,1],  # probability of the positive class
})

# Confusion matrix with readable class names on both axes.
class_names = ['Not Churn','Churn']
cf = pd.DataFrame(
    confusion_matrix(y_true=result['actual'], y_pred=result['pred']),
    index=class_names,
    columns=class_names,
)
fig = px.imshow(cf, text_auto=True, aspect="equal", color_continuous_scale=['#284256','#df7a48'])
tick_style = dict(tickfont=dict(size=20))
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    margin={'l':0,'r':0,'b':0,'t':10,'pad':0},
    plot_bgcolor='white',
    yaxis=tick_style,
    xaxis=tick_style,
)

fig.update_traces(textfont_size=60)
fig.show()

In [None]:
# Report helpers for the classifier.
def get_classification_report(y_test, y_pred, y_pred_proba_pos):
    """Build Plotly summaries for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels (0 / 1).
    y_pred : array-like
        Predicted labels.
    y_pred_proba_pos : array-like
        Predicted probability of the positive class (label 1).

    Returns
    -------
    list
        ``[confusion_matrix_viz, classification_report_viz, roc_auc_viz, best_threshold]``
        where ``best_threshold`` is the ROC threshold that maximises ``tpr - fpr``.
    """

    ## Classification Report
    df_report = (
        pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
        .transpose()
        .reset_index()
        .set_axis(['Attribute','Precision','Recall','F1-Score','Support'],axis=1)
        .round(3)
        .reset_index(drop=True)
    )
    # Row 2 is the accuracy entry; its value is read by column label
    # (integer keys on a label-indexed Series are deprecated positional access).
    accuracy_pos = 2
    accuracy_value = df_report.loc[accuracy_pos, 'F1-Score']
    # Support is shown as whole numbers, except on the accuracy row which is overwritten below.
    df_report['Support'] = [
        str(value) if pos == accuracy_pos else str(int(value))
        for pos, value in enumerate(df_report['Support'])
    ]
    df_report['Attribute'] = ['0','1','Accuracy','Macro Avg','Weighted Avg']
    # Cast to object first so the blank strings below can sit in the numeric columns
    # without relying on implicit dtype upcasting.
    df_report = df_report.astype(object)
    # Accuracy row: blank precision / recall, accuracy under F1-Score, and the
    # support of the weighted-average row.
    df_report.loc[accuracy_pos] = ['Accuracy','','',accuracy_value,df_report['Support'].loc[4]]

    classification_report_viz = go.Figure(data=[go.Table(
        header=dict(values=list(df_report.columns),line_color='black',fill_color='#284256',align='center',font=dict(color='white', size=14),height=30),
        cells=dict(values=[df_report['Attribute'],df_report['Precision'], df_report['Recall'], df_report['F1-Score'], df_report['Support']],line_color='black',fill=dict(color=['#df7a48','#F4F4F2','#F4F4F2','#F4F4F2']),align='center',font_size=14,height=30))
    ]).update_layout(
        width=500,
        height=240,
        margin=dict(l=10,r=10,b=10,t=10,pad=10),
        )

    # ROC AUC Plot
    fpr, tpr, threshold = roc_curve(y_test, y_pred_proba_pos, pos_label=1)
    # Pick the threshold with the largest gap between true and false positive rate.
    best_threshold = threshold[np.argmax(tpr - fpr)]
    df_roc_auc = pd.DataFrame({'True Positive Rate':tpr,'False Positive Rate':fpr,'threshold':threshold})
    df_roc_auc['threshold'] = [round(value,4) for value in df_roc_auc['threshold']]
    auc_score = roc_auc_score(y_test, y_pred_proba_pos)
    roc_auc_viz = go.Figure()
    # Dashed diagonal = chance level.
    roc_auc_viz.add_shape(type='line', line=dict(dash='dash', color='black',width=3),x0=0, x1=1, y0=0, y1=1)
    roc_auc_viz.add_trace(go.Scatter(
        x=df_roc_auc['False Positive Rate'],
        y=df_roc_auc['True Positive Rate'],
        mode='lines',
        text=df_roc_auc['threshold'],
        hovertemplate=
            "<b>Threshold: %{text}</b><br>" +
            "<b>False Positive Rate: %{x:.4f}</b><br>" +
            "<b>True Positive Rate: %{y:.4f}</b><br>" +
            "<extra></extra>",
        marker=dict(color='#df7a48', size=5),
        line=dict(width=3),
    ))
    roc_auc_viz.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=750,
        height=550,
        plot_bgcolor='white',
        margin=dict(l=60,r=40,b=10,t=60,pad=10),
        title={'text': f"ROC-CURVE (AUC={round(auc_score,4)})",'y':0.96,'x':0.5,'xanchor': 'center','yanchor': 'top'},
    )
    roc_auc_viz.update_xaxes(mirror=True,ticks='outside',showline=True,linecolor='black',gridcolor='lightgrey')
    roc_auc_viz.update_yaxes(mirror=True,ticks='outside',showline=True,linecolor='black',gridcolor='lightgrey')

    # Confusion Matrix
    cm = confusion_matrix(y_test,y_pred,labels=[0,1])
    df_confusion_matrix = pd.DataFrame(cm).set_axis(['Not Churn','Churn'],axis=0).set_axis(['Not Churn','Churn'],axis=1)
    df_confusion_matrix.index.name = 'True Label'
    df_confusion_matrix.columns.name = 'Predicted Label'
    confusion_matrix_viz = px.imshow(img=df_confusion_matrix,text_auto=True,color_continuous_scale='Spectral').update_traces(
        hovertemplate = None,hoverinfo = "skip").update_layout(
            height=450,width=750,margin=dict(l=10,r=10,b=10,t=50,pad=0), title={'text': 'Confusion Matrix','y':0.96,'x':0.5,'xanchor': 'center','yanchor': 'top'},font=dict( size=15))
    return [confusion_matrix_viz,classification_report_viz,roc_auc_viz ,best_threshold]

In [None]:
# Element 1 of the returned list is the classification-report table.
get_classification_report(
    y_test=result['actual'],
    y_pred=result['pred'],
    y_pred_proba_pos=result['pred_proba'],
)[1]

In [None]:
# Element 2 of the returned list is the ROC curve figure.
get_classification_report(
    y_test=result['actual'],
    y_pred=result['pred'],
    y_pred_proba_pos=result['pred_proba'],
)[2]

## SHAP Summary

In [None]:
# SHAP values for a CatBoost model trained on the preprocessed training split.
colors = ['#3A4C6A','#df7a48']
cmap1 = LinearSegmentedColormap.from_list("mycmap", colors)

shap.initjs()

# Fit the preprocessing first, then read its output names, so this cell does not
# depend on the transformer having been fitted by an earlier cell.
X_prepared = preprocessing.fit_transform(X_train)
# Drop the '<transformer name>__' prefix from each output name by splitting on the
# separator rather than slicing a fixed number of characters.
prep_fnames = [name.split('__', 1)[-1] for name in preprocessing.get_feature_names_out()]
X = pd.DataFrame(X_prepared, columns=prep_fnames)

# NOTE: `cat` is rebound here from the evaluation pipeline to a bare classifier.
cat = CatBoostClassifier(verbose=0)
cat.fit(X,y_train)
explainer = shap.TreeExplainer(cat)
shap_values = explainer(X)

# Top-10 summary plot; the colour bar is drawn manually so the custom colormap is used.
shap.summary_plot(shap_values=shap_values,show=False,color_bar=False, cmap=cmap1,max_display=10)
plt.colorbar()
plt.show()

## Business Simulation

In [None]:
# Revenue of the hold-out customers per churn outcome, before and after the simulated intervention.
revenue = pd.concat([X_test, y_test], axis=1).groupby(by='Churn Label', as_index=False)['Total Revenue'].sum()
# The target is encoded as 0 / 1 at this point, so relabel from those values
# (a 'No' / 'Yes' mapping alone never matches); the string keys are kept for safety.
revenue['Churn Label'] = revenue['Churn Label'].map({0:'Not Churn', 1:'Churn', 'No':'Not Churn', 'Yes':'Churn'})
revenue['Percent'] = [f"{np.round(i / revenue['Total Revenue'].sum()*100,2)}%" for i in revenue['Total Revenue']]
# NOTE(review): hard-coded "after" revenue figures (Not Churn, Churn); their derivation is
# not shown in this notebook - recompute or document the source before relying on them.
revenue['Total Revenue after'] = [ np.round(i,2) for i in [ 3133915.6319999998,70099.97799999994]]
revenue['Percent after'] = [f"{np.round((i/revenue['Total Revenue after'].sum())*100,2)}%" for i in revenue['Total Revenue after']]

# Compare the revenue attached to churned customers before vs. after.
stages = ['Before', 'After']
churn_row = revenue.loc[revenue['Churn Label'] == 'Churn', ['Total Revenue','Total Revenue after']].iloc[0]
y1 = churn_row.to_list()
t1 = [ f"${round(i):,}" for i in y1]
fig = go.Figure([go.Bar(x=stages, y=y1, text= t1, marker_color=['#df7a48','#df7a48'])])
fig.update_layout(
    autosize=False,
    width=330,
    height=400,
    margin=dict(l=0,r=0,b=0,t=40,pad=0),
    plot_bgcolor='white',
    showlegend=False,
    yaxis={'categoryorder':'total ascending'},
    legend=dict(title='Total Revenue: ',yanchor="top",y=0.98,xanchor="left",x=0.85)
)
fig.update_traces(textfont_size=20, textangle=0, cliponaxis=False, textposition="outside", textfont_color='black')
fig.show()