In [24]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
import os

# Location of the cleaned churn CSV. The original absolute path remains the
# default; set the CHURN_DATA_PATH environment variable to load the file from
# somewhere else without editing the notebook.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    r"D:\customer churn project\cleaned_csv_file Churn_Modelling.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the loaded frame (the rendered table elides the middle rows).
df

Unnamed: 0,creditscore,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,churn,geography_Germany,geography_Spain
0,619,0,42,2,0.00,1,1,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,0,0,1
2,502,0,42,8,159660.80,3,1,0,1,0,0
3,699,0,39,1,0.00,2,0,0,0,0,0
4,850,0,43,2,125510.82,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,0,0,0
9996,516,1,35,10,57369.61,1,1,1,0,0,0
9997,709,0,36,7,0.00,1,0,1,1,0,0
9998,772,1,42,3,75075.31,2,1,0,1,1,0


In [4]:
# Summarise each column's dtype and non-null count, plus total memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   creditscore        10000 non-null  int64  
 1   gender             10000 non-null  int64  
 2   age                10000 non-null  int64  
 3   tenure             10000 non-null  int64  
 4   balance            10000 non-null  float64
 5   numofproducts      10000 non-null  int64  
 6   hascrcard          10000 non-null  int64  
 7   isactivemember     10000 non-null  int64  
 8   churn              10000 non-null  int64  
 9   geography_Germany  10000 non-null  int64  
 10  geography_Spain    10000 non-null  int64  
dtypes: float64(1), int64(10)
memory usage: 859.5 KB


In [5]:
# Count missing values per column; every column is expected to report 0.
df.isna().sum()

creditscore          0
gender               0
age                  0
tenure               0
balance              0
numofproducts        0
hascrcard            0
isactivemember       0
churn                0
geography_Germany    0
geography_Spain      0
dtype: int64

In [6]:
# Separate the label column from the predictor columns.
target_col = 'churn'
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
# Preview the feature matrix: every column of df except 'churn'.
X

Unnamed: 0,creditscore,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,geography_Germany,geography_Spain
0,619,0,42,2,0.00,1,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,0,1
2,502,0,42,8,159660.80,3,1,0,0,0
3,699,0,39,1,0.00,2,0,0,0,0
4,850,0,43,2,125510.82,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,0,0
9996,516,1,35,10,57369.61,1,1,1,0,0
9997,709,0,36,7,0.00,1,0,1,0,0
9998,772,1,42,3,75075.31,2,1,0,1,0


In [8]:
# Preview the target series (the 'churn' column of df).
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: churn, Length: 10000, dtype: int64

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y so both
# partitions keep the same churn ratio; confirm before changing, since it
# alters every downstream metric.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits, so
# nothing from the test rows leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Display the fitted StandardScaler object.
scaler

In [16]:
# Baseline: logistic regression with default settings, trained on the scaled
# (not yet rebalanced) training split.
model = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = model.predict(X_test)

In [17]:
# Report accuracy, confusion matrix and the per-class report for the baseline
# predictions, each under its own heading.
for heading, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))


Accuracy: 0.812
Confusion Matrix:
 [[1544   63]
 [ 313   80]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.56      0.20      0.30       393

    accuracy                           0.81      2000
   macro avg       0.70      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000



In [None]:
# The classes are imbalanced (far fewer churners than non-churners), so the 0.81 accuracy is misleading: recall on the churn class is only 0.20.

In [19]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn

   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ------------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class in the training split only; the test split is
# left untouched so evaluation reflects the real class ratio.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Show the class counts before and after resampling.
for heading, labels in (("Before SMOTE:", y_train), ("After SMOTE:", y_train_res)):
    print(heading, labels.value_counts())


Before SMOTE: churn
0    6356
1    1644
Name: count, dtype: int64
After SMOTE: churn
0    6356
1    6356
Name: count, dtype: int64


In [21]:
# Refit logistic regression on the SMOTE-balanced training data. Fitting on
# X_train / y_train here would only repeat the baseline model and leave the
# resampled set unused.
model = LogisticRegression()
model.fit(X_train_res, y_train_res)

# Predictions are still made on the untouched (non-resampled) test split.
y_pred = model.predict(X_test)



In [25]:
# Train the three candidate classifiers on the SMOTE-balanced training data.
# Seeds are fixed for the tree ensembles so the scores are reproducible.
model = LogisticRegression().fit(X_train_res, y_train_res)
rf = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)
gb = GradientBoostingClassifier(random_state=42).fit(X_train_res, y_train_res)

# Accuracy of each fitted model on the held-out test split.
for tag, fitted in (("LR:", model), ("RF:", rf), ("GB:", gb)):
    print(tag, fitted.score(X_test, y_test))


LR: 0.7265
RF: 0.84
GB: 0.842


In [26]:

# Per-model confusion matrix and classification report on the held-out split.
# The loop variable is deliberately not called `model`: reusing that name would
# rebind the global logistic-regression estimator to the last item (gb), so
# re-running this cell would report gradient boosting under the
# "Logistic Regression" label.
for name, clf in [("Logistic Regression", model), ("Random Forest", rf), ("Gradient Boosting", gb)]:
    print(f"\n{name}")
    y_pred = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))



Logistic Regression
[[1170  437]
 [ 110  283]]
              precision    recall  f1-score   support

           0       0.91      0.73      0.81      1607
           1       0.39      0.72      0.51       393

    accuracy                           0.73      2000
   macro avg       0.65      0.72      0.66      2000
weighted avg       0.81      0.73      0.75      2000


Random Forest
[[1446  161]
 [ 159  234]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1607
           1       0.59      0.60      0.59       393

    accuracy                           0.84      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.84      0.84      0.84      2000


Gradient Boosting
[[1409  198]
 [ 118  275]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      1607
           1       0.58      0.70      0.64       393

    accuracy                           0.84   

In [None]:
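# Gradient boosting gives the best accuracy of the three models (0.842 vs 0.84 for random forest and 0.7265 for logistic regression) and the highest churn-class F1 (0.64).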

In [31]:
import joblib
# Persist the fitted gradient-boosting model and the fitted scaler to files in
# the current working directory. Both are saved because the models above were
# trained on scaler-transformed features.
joblib.dump(gb,'churn_gb_model.pkl')
# Last expression of the cell: its return value (the list of written
# filenames) is what the notebook displays as output.
joblib.dump(scaler,'scaler.pkl')



['scaler.pkl']