In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [51]:
df = pd.read_csv(r'C:\Users\afnan\Desktop\jupyetr notebook\customor_churn_cleaned.csv')
df.head()

Unnamed: 0,customer_id,name,age,gender,tenure,monthly_charges,total_charges,internet_service,payment_method,contract_type,churn
0,C001,Arun K,25,male,5,450.5,2252.5,fiber optic,UPI,month-to-month,Yes
1,C002,Neha S,33,female,12,590.0,7080.0,fiber optic,Credit Card,month-to-month,No
2,C003,Rahul Menon,34,male,24,670.0,16080.0,dsl,UPI,one year,Yes
3,C004,Meera T,29,female,24,550.75,13218.0,fiber optic,Debit Card,month-to-month,No
4,C005,Aswin P,46,male,60,799.0,47940.0,fiber optic,Cash,two year,Yes


In [52]:
df_original= df.copy()

In [53]:
df=df.drop(["name","customer_id"],axis=1)

In [54]:
df['churn'] = df['churn'].map({'Yes':1, 'No':0})  # numeric
categorical_cols = ['gender', 'payment_method', 'internet_service', 'contract_type']  # example
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_encoded.head()


Unnamed: 0,age,tenure,monthly_charges,total_charges,churn,gender_female,gender_male,payment_method_Credit Card,payment_method_Debit Card,payment_method_Netbanking,payment_method_UPI,internet_service_fiber optic,contract_type_one year,contract_type_two year
0,25,5,450.5,2252.5,1,False,True,False,False,False,True,True,False,False
1,33,12,590.0,7080.0,0,True,False,True,False,False,False,True,False,False
2,34,24,670.0,16080.0,1,False,True,False,False,False,True,False,True,False
3,29,24,550.75,13218.0,0,True,False,False,True,False,False,True,False,False
4,46,60,799.0,47940.0,1,False,True,False,False,False,False,True,False,True


In [55]:
X = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [56]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.8571428571428571
[[5 1]
 [1 7]]
              precision    recall  f1-score   support

           0       0.83      0.83      0.83         6
           1       0.88      0.88      0.88         8

    accuracy                           0.86        14
   macro avg       0.85      0.85      0.85        14
weighted avg       0.86      0.86      0.86        14



In [58]:
## random foret

In [59]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


In [60]:
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 1.0
[[6 0]
 [0 8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         8

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14



In [61]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Logistic Regression Accuracy: 0.8571428571428571
Random Forest Accuracy: 1.0


In [62]:
feature_importance = pd.Series(
    rf_model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

feature_importance.head(10)


gender_female                0.319753
gender_male                  0.250593
monthly_charges              0.134994
total_charges                0.125381
age                          0.089139
tenure                       0.055289
payment_method_Debit Card    0.009391
payment_method_UPI           0.005018
contract_type_two year       0.004916
payment_method_Netbanking    0.002315
dtype: float64

In [64]:
df_original['churn_prediction'] = rf_model.predict(X)

df_original[['customer_id', 'name', 'churn_prediction']].head(10)


Unnamed: 0,customer_id,name,churn_prediction
0,C001,Arun K,1
1,C002,Neha S,0
2,C003,Rahul Menon,1
3,C004,Meera T,0
4,C005,Aswin P,1
5,C006,Athira S,0
6,C007,Harish K,1
7,C008,Divya M,0
8,C009,Suraj P,1
9,C011,Abhijith K,1


In [65]:

df_original['churn_probability'] = rf_model.predict_proba(X)[:, 1]

df_original['churn_percentage'] = (df_original['churn_probability'] * 100).round(2)


In [66]:
df_original[['customer_id', 'name', 'churn_percentage']].head()


Unnamed: 0,customer_id,name,churn_percentage
0,C001,Arun K,73.0
1,C002,Neha S,3.0
2,C003,Rahul Menon,100.0
3,C004,Meera T,1.0
4,C005,Aswin P,100.0


In [67]:
df_original['churn_prediction_label'] = df_original['churn_prediction'].map({
    1: 'Yes',
    0: 'No'
})


In [68]:
df_original['churn_percentage'] = (
    rf_model.predict_proba(X)[:, 1] * 100
).round(2)


In [69]:
df_original[
    ['customer_id', 'name', 'churn_prediction_label', 'churn_percentage']
].head()


Unnamed: 0,customer_id,name,churn_prediction_label,churn_percentage
0,C001,Arun K,Yes,73.0
1,C002,Neha S,No,3.0
2,C003,Rahul Menon,Yes,100.0
3,C004,Meera T,No,1.0
4,C005,Aswin P,Yes,100.0


In [70]:
df_original.to_csv("customer_churn_prediction_result.csv", index=False)


In [71]:
import joblib
joblib.dump(rf_model, "random_forest_churn_model.pkl")


['random_forest_churn_model.pkl']

In [72]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
[[6 0]
 [0 8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         8

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14

