In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
# Load the dataset
data = pd.read_csv('/Users/azkayounus/Desktop/CODSOFT/Project3/Churn_Modelling.csv')


In [7]:


# Drop irrelevant columns (e.g., RowNumber, CustomerId, Surname)
data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

# Drop duplicate rows
data.drop_duplicates(inplace=True)

# Drop rows with null values
data.dropna(inplace=True)

# Split features and target variable


In [8]:

# Split features and target variable
X = data.drop('Exited', axis=1)
y = data['Exited']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [9]:


# Define numerical and categorical features
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']



In [10]:


# Preprocessing pipeline for numerical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])



In [11]:


# Logistic Regression model
lr_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression())])

# Random Forest model
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier())])

# Gradient Boosting model
gb_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', GradientBoostingClassifier())])




In [12]:


# Train and evaluate models
models = {'Logistic Regression': lr_model,
          'Random Forest': rf_model,
          'Gradient Boosting': gb_model}



In [13]:


for name, model in models.items():
    print(f'Training {name}...')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f'{name} Model Accuracy: {accuracy_score(y_test, y_pred)}\n')
    print(f'Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}\n')
    print(f'Classification Report for {name}:\n{classification_report(y_test, y_pred)}\n')


Training Logistic Regression...
Logistic Regression Model Accuracy: 0.811

Confusion Matrix for Logistic Regression:
[[1543   64]
 [ 314   79]]

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000


Training Random Forest...
Random Forest Model Accuracy: 0.8685

Confusion Matrix for Random Forest:
[[1548   59]
 [ 204  189]]

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.48      0.59       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.76      2000
weighted avg       0.86      0.87      