In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report , confusion_matrix , accuracy_score
import joblib


In [198]:
# Load the raw churn dataset from the working directory and preview the first rows.
df = pd.read_csv('Churn_Modelling.csv')
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [200]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [202]:
# Count missing values per column.
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [204]:
# Display any rows that are exact duplicates of an earlier row.
df[df.duplicated()]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


# Feature Engineering


In [207]:
# Engineered features. Bin edges and options mirror replicate_feature_engineering()
# so that training and inference build identical features.

# Binary flag: 1 when the customer holds no balance.
df['BalanceZero'] = (df['Balance'] == 0).astype(int)

# Age groups. include_lowest=True keeps customers aged exactly 18 in the first
# bin; without it pd.cut leaves the lowest edge open and returns NaN for them.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[18, 25, 35, 45, 55, 65, 75, 85, 95],
    labels=['18-25', '25-35', '35-45', '45-55', '55-65', '65-75', '75-85', '85-95'],
    right=True,
    include_lowest=True,
)

# Balance-to-salary ratio. A zero salary is swapped for a tiny positive value
# so the ratio stays finite (same guard as the inference pipeline).
df['BalanceToSalaryRatio'] = df['Balance'] / df['EstimatedSalary'].replace(0, 1e-6)

# Interaction feature between NumOfProducts and IsActiveMember.
df['ProductUsage'] = df['NumOfProducts'] * df['IsActiveMember']

# Tenure groups. include_lowest=True keeps Tenure == 0 in the '0-2' bin
# instead of turning it into NaN.
df['TenureGroup'] = pd.cut(
    df['Tenure'],
    bins=[0, 2, 4, 6, 8, 10],
    labels=['0-2', '3-4', '5-6', '7-8', '9-10'],
    right=True,
    include_lowest=True,
)

# Encoding data

In [210]:
# One-hot encode the multi-level categoricals; drop_first removes the first
# level of each so it acts as the baseline.
one_hot_columns = ["Geography", 'AgeGroup', 'TenureGroup']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

# Label-encode Gender with a fixed two-value vocabulary, then persist the
# fitted encoder so the inference pipeline can reuse the same mapping.
label_encoder = LabelEncoder().fit(['Female', 'Male'])
df['Gender'] = label_encoder.transform(df['Gender'])
joblib.dump(label_encoder, 'fitted_label_encoder.pkl')

['fitted_label_encoder.pkl']

In [212]:
# Preview the frame after encoding.
df.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,...,AgeGroup_35-45,AgeGroup_45-55,AgeGroup_55-65,AgeGroup_65-75,AgeGroup_75-85,AgeGroup_85-95,TenureGroup_3-4,TenureGroup_5-6,TenureGroup_7-8,TenureGroup_9-10
0,1,15634602,Hargrave,619,0,42,2,0.0,1,1,...,True,False,False,False,False,False,False,False,False,False
1,2,15647311,Hill,608,0,41,1,83807.86,1,0,...,True,False,False,False,False,False,False,False,False,False


In [214]:
# List the columns available after feature engineering and encoding.
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited', 'BalanceZero', 'BalanceToSalaryRatio',
       'ProductUsage', 'Geography_Germany', 'Geography_Spain',
       'AgeGroup_25-35', 'AgeGroup_35-45', 'AgeGroup_45-55', 'AgeGroup_55-65',
       'AgeGroup_65-75', 'AgeGroup_75-85', 'AgeGroup_85-95', 'TenureGroup_3-4',
       'TenureGroup_5-6', 'TenureGroup_7-8', 'TenureGroup_9-10'],
      dtype='object')

In [216]:
# Model inputs: raw and engineered numeric columns followed by the one-hot
# dummies. RowNumber, CustomerId, Surname and the target are left out.
features = (
    [
        'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
        'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
        'BalanceZero', 'BalanceToSalaryRatio', 'ProductUsage',
    ]
    + ['Geography_Germany', 'Geography_Spain']
    + ['AgeGroup_' + band
       for band in ('25-35', '35-45', '45-55', '55-65', '65-75', '75-85', '85-95')]
    + ['TenureGroup_' + band for band in ('3-4', '5-6', '7-8', '9-10')]
)

# Feature matrix and target vector.
x = df[features]
y = df['Exited']

# Train-test splitting

In [219]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified on y; with the class imbalance
# visible in the test-set support counts, consider stratify=y.
x_train ,x_test , y_train ,y_test = train_test_split(x,y, test_size=0.2 ,random_state=42)

# Feature scaling

In [222]:
# Standardise the continuous columns. The scaler is fitted on the training rows
# only and then applied unchanged to the test rows.
cols_to_scale = ['CreditScore','Age','Tenure','Balance','EstimatedSalary','BalanceToSalaryRatio']

# Rebuild both splits from the unscaled `x` before scaling. This keeps the cell
# idempotent: re-running it would otherwise fit on already-scaled data and
# overwrite the saved scaler with a near-identity transform, silently breaking
# inference. The explicit copies also avoid assigning into a slice of `x`.
x_train = x.loc[x_train.index].copy()
x_test = x.loc[x_test.index].copy()

scaler = StandardScaler()
x_train[cols_to_scale] = scaler.fit_transform(x_train[cols_to_scale])
x_test[cols_to_scale] = scaler.transform(x_test[cols_to_scale])

# Persist the fitted scaler so the inference pipeline applies the same transform.
joblib.dump(scaler, 'fitted_standard_scaler.pkl')

['fitted_standard_scaler.pkl']

In [224]:
# Preview the training features after scaling.
x_train.head(3)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,BalanceZero,...,AgeGroup_35-45,AgeGroup_45-55,AgeGroup_55-65,AgeGroup_65-75,AgeGroup_75-85,AgeGroup_85-95,TenureGroup_3-4,TenureGroup_5-6,TenureGroup_7-8,TenureGroup_9-10
9254,0.3565,1,-0.655786,0.34568,-1.218471,2,1,1,1.36767,1,...,False,False,False,False,False,False,False,True,False,False
1561,-0.203898,1,0.294938,-0.348369,0.696838,2,1,1,1.661254,0,...,True,False,False,False,False,False,True,False,False,False
1670,-0.961472,1,-1.416365,-0.695393,0.618629,1,1,0,-0.252807,0,...,False,False,False,False,False,False,True,False,False,False


# Random Forest Model

In [228]:
# Build and train the Random Forest model on the scaled training split.
# random_state=42 fixes the seed; class_weight='balanced' asks scikit-learn to
# weight classes inversely to their frequency in y_train.
RandomForest_model = RandomForestClassifier(n_estimators=100 , random_state=42,class_weight='balanced')
RandomForest_model.fit(x_train , y_train)

In [229]:
# Predict class labels for the held-out test split.
y_pred = RandomForest_model.predict(x_test)
# Persist the trained model; the inference pipeline loads this file via MODEL_PATH.
joblib.dump(RandomForest_model, 'simple_rf_model.pkl')

['simple_rf_model.pkl']

In [232]:
# Evaluate the model on the test split: confusion matrix, per-class
# precision/recall/F1 report, and overall accuracy.
conf_matrix = confusion_matrix(y_test , y_pred)
class_report = classification_report(y_test , y_pred)
accuracy = accuracy_score(y_test , y_pred)

In [234]:
# Confusion matrix: rows are true labels, columns are predicted labels.
print(conf_matrix)

[[1552   55]
 [ 218  175]]


In [236]:
# Per-class precision, recall and F1 on the test split.
print(class_report)

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.76      0.45      0.56       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [238]:
# Overall accuracy on the test split.
print(accuracy)

0.8635


# Pipeline to predict on new incoming data

In [241]:
# --- Inference configuration ---

# Artifact files written by the training cells.
MODEL_PATH = 'simple_rf_model.pkl'
SCALER_PATH = 'fitted_standard_scaler.pkl'
LABEL_ENCODER_PATH = 'fitted_label_encoder.pkl'

# Churn probability at or above which a customer is labelled as churning.
PREDICTION_THRESHOLD = 0.5

# Continuous columns that are passed through the fitted scaler.
COLS_TO_SCALE = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'EstimatedSalary',
    'BalanceToSalaryRatio',
]

# Columns the model expects, in training order. New data is aligned to this
# list after encoding and feature engineering.
FINAL_FEATURES = [
    # raw and engineered numeric inputs
    'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'BalanceZero', 'BalanceToSalaryRatio', 'ProductUsage',
    # one-hot geography (France is the dropped baseline)
    'Geography_Germany', 'Geography_Spain',
    # one-hot age bands ('18-25' is the dropped baseline)
    *[f'AgeGroup_{band}'
      for band in ('25-35', '35-45', '45-55', '55-65', '65-75', '75-85', '85-95')],
    # one-hot tenure bands ('0-2' is the dropped baseline)
    *[f'TenureGroup_{band}' for band in ('3-4', '5-6', '7-8', '9-10')],
]

In [243]:
def load_artifacts():
    """Load the persisted model, scaler and label encoder from disk.

    Reads the files named by MODEL_PATH, SCALER_PATH and LABEL_ENCODER_PATH.

    Returns:
        tuple: ``(model, scaler, label_encoder)`` on success, or
        ``(None, None, None)`` when any of the files is missing.
    """
    # NOTE: joblib.load unpickles the files, which can execute arbitrary code.
    # Only load artifacts that come from a trusted source.
    try:
        model = joblib.load(MODEL_PATH)
        scaler = joblib.load(SCALER_PATH)
        label_encoder = joblib.load(LABEL_ENCODER_PATH)
    except FileNotFoundError as exc:
        # Name the file that actually failed so the user knows what to fix.
        missing = exc.filename or "one of the artifact files"
        print(f" Error: Required file not found: {missing}. "
              f"Expected {MODEL_PATH}, {SCALER_PATH} and {LABEL_ENCODER_PATH}. Check your paths.")
        return None, None, None

    print("Artifacts loaded successfully (Model, Scaler, LabelEncoder).")
    return model, scaler, label_encoder

In [253]:
def replicate_feature_engineering(df, label_encoder):
    """Rebuild the training-time engineered features on new raw data.

    Args:
        df: DataFrame with at least the columns Gender, Balance, Age,
            EstimatedSalary, NumOfProducts, IsActiveMember and Tenure.
        label_encoder: Fitted encoder whose ``transform`` maps the Gender
            values to integers.

    Returns:
        A copy of ``df`` with Gender encoded and the columns BalanceZero,
        AgeGroup, BalanceToSalaryRatio, ProductUsage and TenureGroup added.
        The input frame is not modified.
    """
    age_edges = [18, 25, 35, 45, 55, 65, 75, 85, 95]
    age_names = ['18-25', '25-35', '35-45', '45-55', '55-65', '65-75', '75-85', '85-95']
    tenure_edges = [0, 2, 4, 6, 8, 10]
    tenure_names = ['0-2', '3-4', '5-6', '7-8', '9-10']

    # A zero salary is replaced by a tiny positive value to keep the ratio finite.
    salary_nonzero = df['EstimatedSalary'].replace(0, 1e-6)

    # Keyword order below fixes the order in which the new columns are appended.
    X_new = df.assign(
        Gender=label_encoder.transform(df['Gender']),
        BalanceZero=(df['Balance'] == 0).astype(int),
        AgeGroup=pd.cut(df['Age'], bins=age_edges, labels=age_names,
                        right=True, include_lowest=True),
        BalanceToSalaryRatio=df['Balance'] / salary_nonzero,
        ProductUsage=df['NumOfProducts'] * df['IsActiveMember'],
        TenureGroup=pd.cut(df['Tenure'], bins=tenure_edges, labels=tenure_names,
                           right=True, include_lowest=True),
    )

    print('feature engineering done')
    return X_new

In [249]:
def preprocess_and_align(X_engineered, scaler, final_features=None, cols_to_scale=None):
    """One-hot encode, scale and align engineered data to the model's columns.

    Args:
        X_engineered: DataFrame containing the Geography, AgeGroup and
            TenureGroup columns plus every column to be scaled.
        scaler: Fitted scaler; only ``transform`` is called (never ``fit``).
        final_features: Columns, in order, that the model expects. Defaults to
            the module-level FINAL_FEATURES.
        cols_to_scale: Continuous columns to pass through ``scaler``. Defaults
            to the module-level COLS_TO_SCALE.

    Returns:
        A DataFrame with exactly ``final_features`` as columns, in that order.
    """
    expected_cols = list(FINAL_FEATURES if final_features is None else final_features)
    scaled_cols = list(COLS_TO_SCALE if cols_to_scale is None else cols_to_scale)

    # Encode EVERY level present in the batch. drop_first=True must not be used
    # here: it drops whichever level sorts first in *this* batch, so a batch
    # without any 'France' row (for example a single 'Germany' customer) would
    # lose Geography_Germany and be scored as the France baseline.
    X_processed = pd.get_dummies(
        X_engineered,
        columns=["Geography", 'AgeGroup', 'TenureGroup'],
        drop_first=False,
    )

    X_processed[scaled_cols] = scaler.transform(X_processed[scaled_cols])

    # Align to the training layout: baseline dummies (and any other column the
    # model does not know) are dropped, dummies absent from this batch are
    # added as 0, and the column order matches training.
    # NOTE(review): a category never seen in training is therefore encoded the
    # same as the baseline level; validate inputs upstream if that matters.
    X_final = X_processed.reindex(columns=expected_cols, fill_value=0)

    print(" Data successfully preprocessed and aligned.")
    return X_final

In [251]:
def predict_churn(model, X_final_df, threshold=None):
    """Score preprocessed rows and turn churn probabilities into labels.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_final_df: Preprocessed feature frame; its index is kept on the result.
        threshold: Probability at or above which a row is labelled as churn.
            Defaults to the module-level PREDICTION_THRESHOLD.

    Returns:
        DataFrame indexed like ``X_final_df`` with columns Churn_Probability
        and Churn_Prediction (1 = will churn, 0 = will not churn).
    """
    cutoff = PREDICTION_THRESHOLD if threshold is None else threshold

    # Column 1 of predict_proba is taken as the churn probability.
    # NOTE(review): this assumes the positive class is the model's second class.
    churn_probabilities = model.predict_proba(X_final_df)[:, 1]
    final_predictions = (churn_probabilities >= cutoff).astype(int)

    results = pd.DataFrame(
        {'Churn_Probability': churn_probabilities,
         'Churn_Prediction': final_predictions},
        index=X_final_df.index,
    )
    print(f" Predictions generated using threshold: {cutoff}")
    return results

In [255]:
def run_pipeline(new_raw_data_df):
    """Run the full churn-prediction pipeline on raw customer data.

    Steps: load the saved artifacts, replicate the training feature
    engineering, preprocess and align the columns, then generate predictions.

    Args:
        new_raw_data_df: Raw customer DataFrame in the pre-engineering layout.

    Returns:
        The DataFrame produced by ``predict_churn``, or ``None`` when any
        artifact could not be loaded.
    """
    print("--- Starting Churn Prediction Pipeline ---")

    # 1. Load artifacts. All three are needed: the label encoder is used in the
    #    very next step, so it is checked together with the model and scaler.
    model, scaler, label_encoder = load_artifacts()
    if any(artifact is None for artifact in (model, scaler, label_encoder)):
        return None

    # 2. Replicate feature engineering
    X_engineered = replicate_feature_engineering(new_raw_data_df, label_encoder)

    # 3. Preprocess and align data
    X_final = preprocess_and_align(X_engineered, scaler)

    # 4. Generate predictions
    results = predict_churn(model, X_final)

    print("--- Pipeline Complete ---")
    return results

In [257]:
# Three hand-written customers in the raw (pre-engineering) layout, one per row.
sample_customers = [
    {'CreditScore': 650, 'Gender': 'Male', 'Age': 40, 'Tenure': 3,
     'Balance': 120000.0, 'NumOfProducts': 1, 'HasCrCard': 1, 'IsActiveMember': 0,
     'EstimatedSalary': 60000.0, 'Geography': 'Germany'},
    {'CreditScore': 800, 'Gender': 'Female', 'Age': 60, 'Tenure': 8,
     'Balance': 0.0, 'NumOfProducts': 2, 'HasCrCard': 1, 'IsActiveMember': 1,
     'EstimatedSalary': 150000.0, 'Geography': 'France'},
    {'CreditScore': 500, 'Gender': 'Male', 'Age': 25, 'Tenure': 1,
     'Balance': 5000.0, 'NumOfProducts': 1, 'HasCrCard': 0, 'IsActiveMember': 0,
     'EstimatedSalary': 90000.0, 'Geography': 'Spain'},
]
new_raw_data = pd.DataFrame(sample_customers)

# Score the sample; run_pipeline returns None when the artifacts are missing.
prediction_results = run_pipeline(new_raw_data)

if prediction_results is not None:
    print("\nPrediction Results:")
    print(prediction_results)

--- Starting Churn Prediction Pipeline ---
Artifacts loaded successfully (Model, Scaler, LabelEncoder).
feature engineering done
 Data successfully preprocessed and aligned.
 Predictions generated using threshold: 0.5
--- Pipeline Complete ---

Prediction Results:
   Churn_Probability  Churn_Prediction
0               0.62                 1
1               0.10                 0
2               0.10                 0
