In [1]:
import pandas as pd

# Load the dataset
file_path = '/Users/akankshanakati/Desktop/project/Churn_Modelling.csv'
data = pd.read_csv(file_path)

# Check for missing values
missing_values = data.isnull().sum()
missing_values


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [2]:
# Display the first few rows of the dataset
data_head = data.head()

# Generate summary statistics for the dataset
summary_statistics = data.describe()

data_head, summary_statistics


(   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
 0          1    15634602  Hargrave          619    France  Female   42   
 1          2    15647311      Hill          608     Spain  Female   41   
 2          3    15619304      Onio          502    France  Female   42   
 3          4    15701354      Boni          699    France  Female   39   
 4          5    15737888  Mitchell          850     Spain  Female   43   
 
    Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
 0       2       0.00              1          1               1   
 1       1   83807.86              1          0               1   
 2       8  159660.80              3          1               0   
 3       1       0.00              2          0               0   
 4       2  125510.82              1          1               1   
 
    EstimatedSalary  Exited  
 0        101348.88       1  
 1        112542.58       0  
 2        113931.57       1  
 3         93826.63     

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encode 'Geography'
geography_ohe = pd.get_dummies(data['Geography'], prefix='Geo')

# Label encode 'Gender'
label_encoder = LabelEncoder()
gender_encoded = label_encoder.fit_transform(data['Gender'])
data['Gender'] = gender_encoded

# Concatenate the new one-hot encoded columns with the original dataframe
data_encoded = pd.concat([data, geography_ohe], axis=1).drop('Geography', axis=1)

# Show the first few rows of the updated dataframe to verify changes
data_encoded.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_France,Geo_Germany,Geo_Spain
0,1,15634602,Hargrave,619,0,42,2,0.0,1,1,1,101348.88,1,True,False,False
1,2,15647311,Hill,608,0,41,1,83807.86,1,0,1,112542.58,0,False,False,True
2,3,15619304,Onio,502,0,42,8,159660.8,3,1,0,113931.57,1,True,False,False
3,4,15701354,Boni,699,0,39,1,0.0,2,0,0,93826.63,0,True,False,False
4,5,15737888,Mitchell,850,0,43,2,125510.82,1,1,1,79084.1,0,False,False,True


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Features for scaling
features_to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Applying StandardScaler
scaler = StandardScaler()
data_encoded[features_to_scale] = scaler.fit_transform(data_encoded[features_to_scale])

# Split the data into features and target
X = data_encoded.drop(['HasCrCard'], axis=1)  # All other columns as features
y = data_encoded['HasCrCard']  # Target variable

# Splitting the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Verifying the shape of the train and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((8000, 15), (2000, 15), (8000,), (2000,))

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
# Drop non-numeric and irrelevant columns for modeling
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
X_train_dropped = X_train.drop(columns=columns_to_drop, axis=1)
X_test_dropped = X_test.drop(columns=columns_to_drop, axis=1)

# Retrain the Logistic Regression model with the adjusted dataset
logistic_model.fit(X_train_dropped, y_train)

# Predict on the test set with the adjusted dataset
y_pred_dropped = logistic_model.predict(X_test_dropped)

# Calculate evaluation metrics for the adjusted model
accuracy_dropped = accuracy_score(y_test, y_pred_dropped)
precision_dropped = precision_score(y_test, y_pred_dropped)
recall_dropped = recall_score(y_test, y_pred_dropped)
roc_auc_dropped = roc_auc_score(y_test, logistic_model.predict_proba(X_test_dropped)[:, 1])

accuracy_dropped, precision_dropped, recall_dropped, roc_auc_dropped


(0.7135, 0.7135, 1.0, 0.5105121742118773)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting model
gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
gbm_model.fit(X_train_dropped, y_train)

# Predict on the test set
y_pred_gbm = gbm_model.predict(X_test_dropped)

# Calculate evaluation metrics
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
precision_gbm = precision_score(y_test, y_pred_gbm)
recall_gbm = recall_score(y_test, y_pred_gbm)
roc_auc_gbm = roc_auc_score(y_test, gbm_model.predict_proba(X_test_dropped)[:, 1])

accuracy_gbm, precision_gbm, recall_gbm, roc_auc_gbm


(0.7105, 0.7132796780684104, 0.9936930623686054, 0.5064695947392044)

In [8]:
from joblib import dump

# Assuming 'gbm_model' is your trained Gradient Boosting Machine model
dump(gbm_model, 'gbm_model.joblib')

print("Model saved successfully.")


Model saved successfully.
