In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter
from lifelines import WeibullAFTFitter
import numpy as np


# Data Formulation Part 


df=pd.read_csv('tele_churn.csv')

## Convert TotalCharges to numeric
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')

## Replace yes and No in the Churn column to 1 and 0. 1 for the event and 0 for the censured data.
df['Churn']=df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0 )

## Impute the null value with the median value

df.TotalCharges.fillna(value=df['TotalCharges'].median(),inplace=True)


## Create a list of Categorical Columns
cat_cols= [i  for i in df.columns if df[i].dtype==object]
cat_cols.remove('customerID')  ## customerID has been removed because it is unique for all the rows.

# Ensure 'tenure' has no non-positive values
df['tenure'] = df['tenure'].apply(lambda x: x if x > 0 else 0.01)

durations = df['tenure'] ## Time to event data of censored and event data
event_observed = df['Churn']  ## It has the churned (1) and censored is (0)

df_r= df.loc[:,['tenure','Churn','gender','Partner','Dependents','PhoneService','MonthlyCharges','SeniorCitizen','StreamingTV']]

## Create dummy variables
df_dummy = pd.get_dummies(df_r, drop_first=True, dtype=int)
df_dummy.head(100)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.TotalCharges.fillna(value=df['TotalCharges'].median(),inplace=True)


Unnamed: 0,tenure,Churn,MonthlyCharges,SeniorCitizen,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,StreamingTV_No internet service,StreamingTV_Yes
0,1.0,0,29.85,0,0,1,0,0,0,0
1,34.0,0,56.95,0,1,0,0,1,0,0
2,2.0,1,53.85,0,1,0,0,1,0,0
3,45.0,0,42.30,0,1,0,0,0,0,0
4,2.0,1,70.70,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
95,12.0,1,78.95,0,0,0,0,1,0,0
96,71.0,0,66.85,0,1,1,1,1,0,0
97,5.0,1,21.05,0,1,0,0,1,1,0
98,52.0,0,21.00,0,1,0,0,1,1,0


In [20]:
# Fitting in WEibull AFT Model

aft=WeibullAFTFitter()
aft.fit(df_dummy, 'tenure', event_col='Churn')

#Fitting in Cox Proportional Hazard Model

cph = CoxPHFitter()
cph.fit(df_dummy, 'tenure', event_col='Churn')


aft.print_summary()
cph.print_summary()

0,1
model,lifelines.WeibullAFTFitter
duration col,'tenure'
event col,'Churn'
number of observations,7043
number of events observed,1869
log-likelihood,-10101.24
time fit was run,2025-02-06 08:11:21 UTC

Unnamed: 0,Unnamed: 1,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
lambda_,Dependents_Yes,0.51,1.67,0.1,0.32,0.7,1.38,2.02,0.0,5.24,<0.005,22.59
lambda_,MonthlyCharges,0.02,1.02,0.0,0.01,0.02,1.01,1.02,0.0,6.49,<0.005,33.43
lambda_,Partner_Yes,1.14,3.13,0.08,0.99,1.29,2.69,3.64,0.0,14.76,<0.005,161.32
lambda_,PhoneService_Yes,-1.02,0.36,0.15,-1.31,-0.73,0.27,0.48,0.0,-6.88,<0.005,37.31
lambda_,SeniorCitizen,-0.55,0.57,0.08,-0.71,-0.4,0.49,0.67,0.0,-6.99,<0.005,38.4
lambda_,StreamingTV_No internet service,3.02,20.42,0.19,2.64,3.39,13.99,29.8,0.0,15.64,<0.005,180.75
lambda_,StreamingTV_Yes,0.26,1.3,0.09,0.09,0.44,1.1,1.55,0.0,3.04,<0.005,8.71
lambda_,gender_Male,0.01,1.01,0.07,-0.12,0.14,0.89,1.15,0.0,0.18,0.86,0.22
lambda_,Intercept,3.94,51.17,0.15,3.65,4.23,38.29,68.39,0.0,26.6,<0.005,515.34
rho_,Intercept,-0.35,0.7,0.02,-0.39,-0.31,0.68,0.73,0.0,-17.36,<0.005,221.89

0,1
Concordance,0.71
AIC,20222.48
log-likelihood ratio test,950.13 on 8 df
-log2(p) of ll-ratio test,661.27


0,1
model,lifelines.CoxPHFitter
duration col,'tenure'
event col,'Churn'
baseline estimation,breslow
number of observations,7043
number of events observed,1869
partial log-likelihood,-15182.39
time fit was run,2025-02-06 08:11:22 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
MonthlyCharges,-0.01,0.99,0.0,-0.01,-0.01,0.99,0.99,0.0,-6.13,<0.005,30.06
SeniorCitizen,0.4,1.49,0.06,0.29,0.5,1.33,1.66,0.0,7.16,<0.005,40.15
gender_Male,-0.01,0.99,0.05,-0.1,0.08,0.9,1.08,0.0,-0.23,0.82,0.29
Partner_Yes,-0.81,0.45,0.05,-0.92,-0.7,0.4,0.5,0.0,-14.93,<0.005,164.98
Dependents_Yes,-0.36,0.7,0.07,-0.49,-0.22,0.61,0.8,0.0,-5.21,<0.005,22.37
PhoneService_Yes,0.69,2.0,0.1,0.49,0.9,1.63,2.45,0.0,6.65,<0.005,34.96
StreamingTV_No internet service,-2.1,0.12,0.13,-2.36,-1.84,0.09,0.16,0.0,-15.79,<0.005,184.09
StreamingTV_Yes,-0.19,0.83,0.06,-0.31,-0.07,0.73,0.93,0.0,-3.1,<0.005,9.03

0,1
Concordance,0.71
Partial AIC,30380.78
log-likelihood ratio test,941.30 on 8 df
-log2(p) of ll-ratio test,654.95


In [5]:
#Data for analysis - 
data_1= {
    'tenure': [1],
    'MonthlyCharges': [29.85],                       #Non-Churn Data
    'SeniorCitizen': [0],
    'gender_Male': [0],
    'Partner_Yes': [1],
    'Dependents_Yes': [0],
    'PhoneService_Yes': [0],
    'StreamingTV_No internet service': [0],
    'StreamingTV_Yes': [0],
}

original_df=pd.DataFrame(data_1)

In [8]:
after_aft = aft.predict_expectation(original_df)
print(after_aft)

0    327.676856
dtype: float64


In [9]:
# Predictions and analysis functions
def predict_cumulative_hazard(X, times):
    return cph.predict_cumulative_hazard(X, times=times)

def predict_expectation(X):
    return cph.predict_expectation(X)

def predict_log_partial_hazard(X):
    return cph.predict_log_partial_hazard(X)

def predict_partial_hazard(X):
    return cph.predict_partial_hazard(X)

def predict_median(X):
    return cph.predict_median(X)

def predict_percentile(X, p):
    return cph.predict_percentile(X, p)

def predict_survival_function(X, times):
    return cph.predict_survival_function(X, times=times)

def score_model(df):
    return cph.score(df)

# Example usage with reference data and specific times
times = [10, 20, 30, 40]

print("Cumulative Hazard:\n", predict_cumulative_hazard(original_df, times))
print("Expected Time:\n", predict_expectation(original_df))
print("Log Partial Hazard:\n", predict_log_partial_hazard(original_df))
print("Partial Hazard:\n", predict_partial_hazard(original_df))
print("Median Time:\n", predict_median(original_df))
print("25th Percentile Time:\n", predict_percentile(original_df, 0.25))
print("Survival Function:\n", predict_survival_function(original_df, times))
#print("Model Score:\n", score_model(df_dummy))

Cumulative Hazard:
              0
10.0  0.109873
20.0  0.157748
30.0  0.195713
40.0  0.235789
Expected Time:
 0    57.767926
dtype: float64
Log Partial Hazard:
 0   -0.086866
dtype: float64
Partial Hazard:
 0    0.9168
dtype: float64
Median Time:
 inf
25th Percentile Time:
 inf
Survival Function:
              0
10.0  0.895948
20.0  0.854065
30.0  0.822249
40.0  0.789948


In [19]:
churn_prob= (1 - predict_survival_function(original_df, times))*100




In [11]:
print("Churn Probability: ", churn_prob)

Churn Probability:  10.0    10.405219
20.0    14.593469
30.0    17.775150
40.0    21.005248
Name: 0, dtype: float64


In [None]:
#Baseline metrices for comparison - 


dict1={'avg_monthly_charges': df_dummy['MonthlyCharges'].mean(),
    'churn_rate': (df_dummy['Churn'] == 'Yes').mean() * 100,
    'avg_tenure': df_dummy['tenure'].mean(),
    'service_adoption': {
        'internet': (df_dummy['InternetService'] != 'No').mean() * 100,
        'phone': (df_dummy['PhoneService'] == 'Yes').mean() * 100,
        'security': (df_dummy['OnlineSecurity'] == 'Yes').mean() * 100,
        'backup': (df_dummy['OnlineBackup'] == 'Yes').mean() * 100,
        'protection': (df_dummy['DeviceProtection'] == 'Yes').mean() * 100,
        'support': (df_dummy['TechSupport'] == 'Yes').mean() * 100}}

In [101]:
churn_prob = (1 - cph.predict_survival_function(original_df, times=[3, 6, 12])) * 100  # Predict churn probability at 3, 6, and 12 months

print("\n --- Customer Risk Assessment ---")
print(f"  - Customer Profile: ABX121") # Changed f-string here
print("  - Predicted Churn Probability:")
print(f"    - Within 3 Months: {churn_prob.iloc[0][0]:.2f}%")  # Probability at 3 months
print(f"    - Within 6 Months: {churn_prob.iloc[1][0]:.2f}%") # Probability at 6 months
print(f"    - Within 12 Months: {churn_prob.iloc[2][0]:.2f}%")# Probability at 12 months


partial_function1=cph.predict_partial_hazard(original_df)
# print(partial_function1)
partial_function=float(partial_function1.iloc[0])
#print(partial_function,type(partial_function))

# Get the whole number and decimal parts
whole_number = int(partial_function)
decimal_part = partial_function - whole_number
#print(decimal_part)

if partial_function < 1:
    print(" ")


elif whole_number == 1:
    # For partial_functions between 1 and 2, take only decimal part
    
    percentage = decimal_part * 100
    cd = f"{percentage:.1f}%"
    print("The customer has ",cd," higher instantaneous risk of churning at any given time compared to the average customer")
else:
    # For partial_functions > 2, subtract 1 from whole number and add decimal
    percentage = ((whole_number - 1) * 100) + (decimal_part * 100)
    
    cd = f"{percentage:.1f}%"

    print("The customer has ",cd," higher instantaneous risk of churning at any given time compared to the average customer")


# AFT Model Averag time survival - 


estimate_Survival=aft.predict_expectation(original_df)
estimate_Survival=aft.predict_median(original_df)
print("Custoer is likely to stay for ",int(estimate_Survival.iloc[0])," months")



print(cph.predict_percentile(original_df, 0.75))


 --- Customer Risk Assessment ---
  - Customer Profile: ABX121
  - Predicted Churn Probability:
    - Within 3 Months: 5.88%
    - Within 6 Months: 8.06%
    - Within 12 Months: 11.35%
 
Custoer is likely to stay for  154  months
53.0


In [54]:
risk_threshold = 40

# Predict survival function for all customers at a specific time (e.g., 12 months)
survival_probs2 = cph.predict_survival_function(df_dummy, times=[12])
print(survival_probs2)
# Calculate churn probabilities by subtracting survival probabilities from 1
churn_probs = (1 - survival_probs2) * 100 

#print(churn_probs)
# Identify customers at high risk of churning
high_risk_customers = churn_probs.columns[churn_probs.loc[12.0] > risk_threshold]

print('\nNumber of Customers exceeding set Churn Risk :',len(high_risk_customers))
print("\nHigh-Risk Customers (Churn probability > 50% within 12 months):")
print(high_risk_customers)



          0         1         2     ...      7040     7041     7042
12.0  0.886471  0.671521  0.662421  ...  0.918831  0.80411  0.82377

[1 rows x 7043 columns]

Number of Customers exceeding set Churn Risk : 90

High-Risk Customers (Churn probability > 50% within 12 months):
Index([  34,  244,  327,  340,  356,  392,  451,  571,  649,  687,  950, 1069,
       1325, 1417, 1563, 1572, 1639, 1731, 1782, 1834, 1856, 1891, 1966, 1995,
       2037, 2042, 2043, 2133, 2278, 2296, 2366, 2422, 2517, 2529, 2548, 2607,
       2641, 2829, 2835, 2970, 2979, 3301, 3341, 3369, 3374, 3526, 3595, 3598,
       3633, 3647, 3689, 3766, 3893, 3917, 3943, 3999, 4053, 4161, 4195, 4299,
       4341, 4342, 4382, 4529, 4541, 4680, 4762, 4842, 4851, 4879, 4997, 5165,
       5251, 5368, 5437, 5573, 5782, 5822, 5972, 6096, 6134, 6191, 6273, 6301,
       6345, 6397, 6673, 6726, 6845, 6956],
      dtype='int64')


In [56]:
revenue_at_risk = df_dummy.loc[high_risk_customers, 'MonthlyCharges'].sum() * 12
print(f"\nPotential Annual Revenue at Risk from High-Risk Customers: ${revenue_at_risk:.2f}")


Potential Annual Revenue at Risk from High-Risk Customers: $64608.60


In [None]:
# Calculate mean MonthlyCharges for churned vs retained customers
mean_monthly_charges_churned = df_dummy[df_dummy['Churn'] == 1]['MonthlyCharges'].mean()
mean_monthly_charges_retained = df_dummy[df_dummy['Churn'] == 0]['MonthlyCharges'].mean()

print(f"Average Monthly Charges (Churned): ${mean_monthly_charges_churned:.2f}")
print(f"Average Monthly Charges (Retained): ${mean_monthly_charges_retained:.2f}")

# Tenure distribution for churned vs retained
print("\nTenure Distribution:")
print(df_dummy.groupby('Churn')['tenure'].describe())

# Impact of streaming services
streaming_impact = df_dummy.groupby(['PhoneService_Yes', 'Churn']).size().unstack()
print(streaming_impact)
streaming_impact['Churn_Rate'] = streaming_impact[1] / (streaming_impact[1] + streaming_impact[0])

print("\nChurn Rate Based on Streaming Services:")
print(streaming_impact)


Average Monthly Charges (Churned): $74.44
Average Monthly Charges (Retained): $61.27

Tenure Distribution:
        count       mean        std   min   25%   50%   75%   max
Churn                                                            
0      5174.0  37.569986  24.113744  0.01  15.0  38.0  61.0  72.0
1      1869.0  17.979133  19.531123  1.00   2.0  10.0  29.0  72.0
Churn                0     1
PhoneService_Yes            
0                  512   170
1                 4662  1699

Churn Rate Based on Streaming Services:
Churn                0     1  Churn_Rate
PhoneService_Yes                        
0                  512   170    0.249267
1                 4662  1699    0.267096


: 

In [None]:
cph.predict

In [102]:
cph.predict_survival_function(original_df)

Unnamed: 0,0
0.01,1.000000
1.00,0.964352
2.00,0.951490
3.00,0.941180
4.00,0.931750
...,...
68.00,0.672566
69.00,0.664076
70.00,0.650702
71.00,0.641729
