In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import joblib
import warnings 
warnings.filterwarnings('ignore')


In [71]:
df = pd.read_csv("customer_churn_data.csv")

In [72]:
print(df.shape)
print(df['Churn'].value_counts())


(1000, 10)
Churn
Yes    883
No     117
Name: count, dtype: int64


In [73]:
df = df.drop(columns=['CustomerID', 'TotalCharges'], errors='ignore')


In [74]:
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,stratify=y,random_state=42)


In [75]:
print(X_train.shape, X_test.shape)
print(y_train.value_counts())
print(y_test.value_counts())


(800, 7) (200, 7)
Churn
Yes    706
No      94
Name: count, dtype: int64
Churn
Yes    177
No      23
Name: count, dtype: int64


In [76]:
cat_cols = X_train.select_dtypes(include='object').columns

X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_test  = pd.get_dummies(X_test,  columns=cat_cols, drop_first=True)

X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)


In [77]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def modelperformance(predictions):
    print("Accuracy Score :", accuracy_score(y_test, predictions))
    print("Precision Score:", precision_score(y_test, predictions, pos_label='Yes'))
    print("Recall Score   :", recall_score(y_test, predictions, pos_label='Yes'))
    print("F1 Score       :", f1_score(y_test, predictions, pos_label='Yes'))


In [78]:
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=3,
    min_samples_leaf=20,
    random_state=42
)

rf.fit(X_train, y_train)


In [79]:
rf_pred=rf.predict(X_test)

In [80]:
modelperformance(rf_pred)

Accuracy Score : 0.885
Precision Score: 0.885
Recall Score   : 1.0
F1 Score       : 0.9389920424403183


In [81]:
log_model=LogisticRegression()

In [82]:
log_model.fit(X_train, y_train)


In [83]:
log_pred = log_model.predict(X_test)


In [84]:
modelperformance(log_pred)

Accuracy Score : 0.94
Precision Score: 0.9365079365079365
Recall Score   : 1.0
F1 Score       : 0.9672131147540983


In [85]:
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=42
)

In [86]:
dt_model.fit(X_train, y_train)

In [87]:
dt_pred = dt_model.predict(X_test)

In [88]:
modelperformance(dt_pred)

Accuracy Score : 1.0
Precision Score: 1.0
Recall Score   : 1.0
F1 Score       : 1.0


In [89]:
joblib.dump(log_model, "model.pkl")
joblib.dump(X_train.columns, "features.pkl")


['features.pkl']