In [2]:
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency, ttest_ind


In [3]:
# Path of the input CSV, relative to the notebook's working directory.
csv_path = "./milestone2_dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows of the loaded frame.
df.head(n=5)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(7021, 20)

In [5]:
# Count rows that are exact duplicates of an earlier row (0 means none).
df.duplicated().sum()

np.int64(0)

# Advanced Data Analysis

Chi-Square Test for categorical variables

In [6]:

# Columns stored with object dtype are treated as the categorical features.
object_columns = df.select_dtypes(include="object").columns

# Run a chi-square test of independence between Churn and every categorical
# feature; the target column itself is filtered out up front.
for col in (name for name in object_columns if name != "Churn"):
    try:
        contingency = pd.crosstab(df['Churn'], df[col])
        chi2, p, dof, ex = chi2_contingency(contingency)
    except Exception as e:
        # Best-effort: report the failing column and move on to the next one.
        print(f"Error with column {col}: {e}")
    else:
        print(f"{col}: p-value = {p}")


gender: p-value = 0.4794615390898278
Partner: p-value = 1.0999615978939857e-35
Dependents: p-value = 1.5995676694071902e-42
PhoneService: p-value = 0.3664940064978196
MultipleLines: p-value = 0.002068559953130916
InternetService: p-value = 1.0857790034735196e-159
OnlineSecurity: p-value = 1.3292265237878154e-184
OnlineBackup: p-value = 8.614898409283196e-131
DeviceProtection: p-value = 1.9555513236078665e-121
TechSupport: p-value = 7.14685771582873e-180
StreamingTV: p-value = 5.469454144000475e-82
StreamingMovies: p-value = 2.750289622934001e-82
Contract: p-value = 6.442284610020534e-257
PaperlessBilling: p-value = 2.1518977990247168e-57
PaymentMethod: p-value = 3.0267752570875056e-139


After performing the Chi-Square test of independence between each categorical variable and Churn, we observed that gender (p-value = 0.479) and PhoneService (p-value = 0.366) have p-values well above the 0.05 significance level, so we cannot reject the hypothesis that they are independent of customer churn. Every other categorical variable is significant at that level. Therefore, we decided to drop these two columns from the dataset, as they are unlikely to contribute to model performance.

In [7]:
# gender and PhoneService showed no significant association with Churn in the
# chi-square screen, so they are removed from the feature set.
# Rebinding df (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it once the columns are already gone is a
# no-op instead of a KeyError.
df = df.drop(columns=['gender', 'PhoneService'], errors="ignore")
print(df.columns.tolist())


['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [8]:
# Total charges divided by (tenure + 1), i.e. not exactly charges per tenure unit.
# NOTE(review): the +1 presumably guards against division by zero when tenure is 0 — confirm.
df['charges_per_month'] = df['TotalCharges'].div(df['tenure'].add(1))

# Binary flag: 1 when tenure is strictly greater than 12, otherwise 0.
df['is_long_term'] = df['tenure'].gt(12).astype(int)

In [9]:
# Chi-square test of independence between the is_long_term flag and Churn.
# pandas and chi2_contingency are already imported in the notebook's import
# cell, so they are not re-imported here.
alpha = 0.05  # significance level used for the verdict printed below

contingency_table = pd.crosstab(df['is_long_term'], df['Churn'])

chi2, p, dof, expected = chi2_contingency(contingency_table)
print("Chi-Square p-value for is_long_term:", p)

if p < alpha:
    print("is_long_term is significantly associated with Churn ✅")
else:
    print("is_long_term is NOT significantly associated with Churn ❌")


Chi-Square p-value for is_long_term: 1.1165310005866383e-154
is_long_term is significantly associated with Churn ✅


In [10]:
# Two-sample t-test comparing charges_per_month between the rows with
# Churn == 'Yes' and the rows with Churn == 'No'.
# ttest_ind is imported in the notebook's top import cell.
# NOTE(review): ttest_ind is called with its defaults, i.e. the equal-variance
# form of the test — consider equal_var=False if the group variances differ.
alpha = 0.05  # significance level used for the verdict printed below

# .loc selects the rows and the column in one step instead of chained indexing.
group_yes = df.loc[df['Churn'] == 'Yes', 'charges_per_month']
group_no = df.loc[df['Churn'] == 'No', 'charges_per_month']

t_stat, p = ttest_ind(group_yes, group_no)
print("t-test p-value for charges_per_month:", p)

if p < alpha:
    print("charges_per_month is significantly different between churn groups ✅")
else:
    print("charges_per_month is NOT significantly different ❌")


t-test p-value for charges_per_month: 3.433856625251188e-10
charges_per_month is significantly different between churn groups ✅


In [11]:
# Show the column names of df at this point in the notebook.
print(df.columns.tolist())

['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn', 'charges_per_month', 'is_long_term']


In [12]:
# Preview the first rows of df.
df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,charges_per_month,is_long_term
0,0,Yes,No,1,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,14.925,0
1,0,No,No,34,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,53.985714,1
2,0,No,No,2,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,36.05,0
3,0,No,No,45,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,40.016304,1
4,0,No,No,2,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,50.55,0


In [13]:
# Write the frame to disk without the index column, then confirm the target file.
dashboard_csv = 'strelmint_dashboard.csv'
df.to_csv(dashboard_csv, index=False)

print(f"Data saved successfully to '{dashboard_csv}'")

Data saved successfully to 'strelmint_dashboard.csv'


In [14]:
# Write the frame to milestone3.csv without the index column.
# The confirmation message is built from the same path that is written, so it
# always names the actual output file (it previously reported
# 'strelmint_dashboard.csv' although this cell writes 'milestone3.csv').
milestone3_csv = 'milestone3.csv'
df.to_csv(milestone3_csv, index=False)

print(f"Data saved successfully to '{milestone3_csv}'")

Data saved successfully to 'strelmint_dashboard.csv'
