##### Problem Statement : Predict whether a customer will churn based on historical data.

In [1]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import os

In [2]:
%pwd

'c:\\Users\\HP\\Desktop\\Sharedfolder\\New folder\\research'

In [3]:
os.chdir("../")
%pwd

'c:\\Users\\HP\\Desktop\\Sharedfolder\\New folder'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir : Path
    source_url : str
    data_path : Path
    unzip_dir : Path


In [5]:
import os
from box.exceptions import BoxValueError
import yaml
from Customer_Churn_Prediction import logger
import json
import joblib
from ensure import ensure_annotations
from box import Box
from pathlib import Path

In [6]:

@ensure_annotations
def read_yaml(path_to_yaml:Path)-> Box:
    """reads yaml file and return config box type"""
    try:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)

            logger.info(f"yaml_file: {path_to_yaml} loaded successfully")
            return Box(content)

    except BoxValueError:
        raise ValueError("yaml file is empty")

    except Exception as e:
        raise e


@ensure_annotations
def create_directories(path_to_directories:list,verbose=True):
    """Create list of directories"""
    for path in path_to_directories:
        os.makedirs(path,exist_ok=True)
        if verbose:
            logger.info(f"created directory at : {path}")

@ensure_annotations
def save_json(path:Path,data:dict):
    """ save json data """
    with open(path, "w") as f:
        json.dump(data, f, indent=4)
    logger.info(f"json file saved at : {path}")


In [7]:
from Customer_Churn_Prediction.constants import *
from Customer_Churn_Prediction.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])
     
    def get_data_ingestion_config(self)->DataIngestionConfig:
        config = self.config.Data_Ingestion
        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_url = config.source_url,
            data_path = config.data_path,
            unzip_dir = config.unzip_dir
        )

        return data_ingestion_config

        

In [9]:
import os
import urllib.request as request
import zipfile
from Customer_Churn_Prediction import logger
from Customer_Churn_Prediction.utils.common import get_size

In [10]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig):
        self.config = config

    def Download_file(self):
        if not os.path.exists(self.config.data_path):
            filename, headers = request.urlretrieve(
                url=self.config.source_url,
                filename = self.config.data_path            
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            logger.info(f"File already exists of size : {get_size(Path(self.config.data_path))}")

    def Extract_file(self):
        """
        zip_file_path: str
        Extracts the zip file into the data directory
        Function returns None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path,exist_ok=True)
        with zipfile.ZipFile(self.config.data_path,"r") as zip_ref:
            zip_ref.extractall(unzip_path)

In [11]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.Download_file()
    data_ingestion.Extract_file()
except Exception as e:
    raise e

[2025-03-08 20:22:07,724: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-08 20:22:07,829: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-08 20:22:07,881: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-08 20:22:07,926: INFO: common: created directory at: artifacts]
[2025-03-08 20:22:07,929: INFO: common: created directory at: artifacts/data_ingestion]
[2025-03-08 20:22:07,954: INFO: 1691150340: File already exists of size : ~ 302 KB]


# Data Validation

In [12]:
df = pd.read_csv('artifacts\data_ingestion\Customer-Churn-Records.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RowNumber           10000 non-null  int64  
 1   CustomerId          10000 non-null  int64  
 2   Surname             10000 non-null  object 
 3   CreditScore         10000 non-null  int64  
 4   Geography           10000 non-null  object 
 5   Gender              10000 non-null  object 
 6   Age                 10000 non-null  int64  
 7   Tenure              10000 non-null  int64  
 8   Balance             10000 non-null  float64
 9   NumOfProducts       10000 non-null  int64  
 10  HasCrCard           10000 non-null  int64  
 11  IsActiveMember      10000 non-null  int64  
 12  EstimatedSalary     10000 non-null  float64
 13  Exited              10000 non-null  int64  
 14  Complain            10000 non-null  int64  
 15  Satisfaction Score  10000 non-null  int64  
 16  Card 

In [14]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir : Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict


In [15]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.Data_Validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            unzip_data_dir = config.unzip_data_dir,
            all_schema=schema,
        )

        return data_validation_config

In [16]:
class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config


    def validate_all_columns(self)-> bool:
        try:
            validation_status = None

            data = pd.read_csv(self.config.unzip_data_dir)
            all_cols = list(data.columns)

            all_schema = self.config.all_schema.keys()

            
            for col in all_cols:
                if col not in all_schema:
                    validation_status = False
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")
                else:
                    validation_status = True
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")

            return validation_status
        
        except Exception as e:
            raise e

In [17]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    raise e

[2025-03-08 20:22:11,474: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-08 20:22:11,479: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-08 20:22:11,490: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-08 20:22:11,496: INFO: common: created directory at: artifacts]
[2025-03-08 20:22:11,502: INFO: common: created directory at: artifacts/data_validation]


# Data Transformation

In [18]:
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [19]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.Data_Transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config

In [20]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def preprocessing_data(self):
        data = pd.read_csv(self.config.data_path)
        # drop columns
        data.drop(['RowNumber','CustomerId','Surname'],axis = 1,inplace=True)

        # seperate catgory and numrical columns
        df_cat = data.select_dtypes(include='object')
        df_num = data.select_dtypes(exclude='object')

        # One hot encoding
        df_cat = pd.get_dummies(df_cat, columns=['Geography', 'Gender', 'Card Type'], drop_first=True,dtype='int')

        # final dataframe
        dfp = pd.concat([df_cat,df_num],axis=1)

        # Handling imbalnce in the data
        X = dfp.drop(columns=["Exited"])
        y = dfp["Exited"]
        smote = SMOTE(sampling_strategy=0.5, random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        resampled_data = pd.concat([X_resampled,y_resampled],axis=1)

        # Train Test split
        train, test = train_test_split(resampled_data)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [21]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.preprocessing_data()
except Exception as e:
    raise e

[2025-03-08 20:23:44,352: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-08 20:23:44,359: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-08 20:23:44,368: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-08 20:23:44,371: INFO: common: created directory at: artifacts]
[2025-03-08 20:23:44,376: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-08 20:23:49,614: INFO: 3687575203: Splited data into training and test sets]
[2025-03-08 20:23:49,614: INFO: 3687575203: (8957, 18)]
[2025-03-08 20:23:49,630: INFO: 3687575203: (2986, 18)]
(8957, 18)
(2986, 18)


# Model Training

In [22]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    criterion: str
    max_depth: int
    target_column: str

In [23]:

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
            config = self.config.Model_Trainer
            params = self.params.DecisionTreeClassifier
            schema =  self.schema.TARGET_COLUMN

            create_directories([config.root_dir])

            model_trainer_config = ModelTrainerConfig(
                root_dir=config.root_dir,
                train_data_path = config.train_data_path,
                test_data_path = config.test_data_path,
                model_name = config.model_name,
                criterion = params.criterion,
                max_depth = params.max_depth,
                target_column = schema.name
                
            )

            return model_trainer_config

In [24]:
import joblib
from sklearn.tree import DecisionTreeClassifier

In [25]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    
    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)


        train_x = train_data.drop([self.config.target_column], axis=1)
        test_x = test_data.drop([self.config.target_column], axis=1)
        train_y = train_data[[self.config.target_column]]
        test_y = test_data[[self.config.target_column]]


        dtree = DecisionTreeClassifier(criterion = self.config.criterion, max_depth=self.config.max_depth, random_state=42)
        dtree.fit(train_x, train_y)

        joblib.dump(dtree, os.path.join(self.config.root_dir, self.config.model_name))

In [26]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2025-03-08 20:23:50,240: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-08 20:23:50,244: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-08 20:23:50,251: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-08 20:23:50,253: INFO: common: created directory at: artifacts]
[2025-03-08 20:23:50,260: INFO: common: created directory at: model]


# Model Evaluation

In [27]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    test_data_path: Path
    model_path: Path
    all_params: dict
    mlflow_uri: str
    metric_file_name: Path
    target_column: str

In [55]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.Model_Evaluation
        params = self.params.DecisionTreeClassifier
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            test_data_path=config.test_data_path,
            model_path = config.model_path,
            all_params=params,
            metric_file_name = config.metric_file_name,
            target_column = schema.name,
           # mlflow_uri = "https://dagshub.com/Akhilpm156/Customer_Churn_end_to_end_ml_project.mlflow"
            mlflow_uri = ""
           
        )

        return model_evaluation_config

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import mlflow
from urllib.parse import urlparse

class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    def load_model(self, path: Path):
        self.model = joblib.load(path)
        return self.model



    def eval_metrics(self):
        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop([self.config.target_column], axis=1)
        test_y = test_data[[self.config.target_column]]

        self.model=self.load_model(self.config.model_path)
        pred = self.model.predict(test_x)
        clsreport = classification_report(test_y, pred)
        cm = confusion_matrix(test_y, pred)
        accuracy = accuracy_score(test_y, pred)

        self.accuracy = accuracy
        self.clsreport = clsreport
        self.cm = cm

        return clsreport, cm, accuracy
        

    def save_results(self):
        # Saving metrics as local
        if not hasattr(self, 'accuracy'):
            raise ValueError("Call eval_metrics() before saving results.")

        scores = {"accuracy_score": self.accuracy}
        save_json(path=Path(self.config.metric_file_name), data=scores)



    def log_into_mlflow(self):

        if not hasattr(self, 'accuracy'):
            raise ValueError("Call eval_metrics() before logging to MLflow.")

        mlflow.set_tracking_uri(self.config.mlflow_uri)
        mlflow.set_registry_uri(self.config.mlflow_uri)
        mlflow.set_experiment("Customer_Churn_Experiment")

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics(
                {"accuracy": self.accuracy}
            )

            with open(self.config.artifacts_root+"/classification_report.txt", "w") as f:
                f.write(self.clsreport)
            with open(self.config.artifacts_root+"/confusion_matrix.txt", "w") as f:
                f.write(str(self.cm))

            mlflow.log_artifact("classification_report.txt")
            mlflow.log_artifact("confusion_matrix.txt")

            if tracking_url_type_store != "file":

                # Register the model
                # There are other ways to use the Model Registry, which depends on the use case,
                # please refer to the doc for more information:
                # https://mlflow.org/docs/latest/model-registry.html#api-workflow

                mlflow.sklearn.log_model(self.model, "model", registered_model_name="DecisionTreeClassifier")
            else:
                mlflow.sklearn.log_model(self.model, "model")

In [58]:
try:
    # Load configuration
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    
    # Initialize ModelEvaluation
    model_evaluation = ModelEvaluation(config=model_evaluation_config)
    
    # Evaluate model
    cls_report, conf_matrix, accuracy = model_evaluation.eval_metrics()
    
    print("Classification Report:\n", cls_report)
    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy Score:", accuracy)

    # Save results
    model_evaluation.save_results()

    # Log results into MLflow
    model_evaluation.log_into_mlflow()

except Exception as e:
    print(f"An error occurred: {e}")


[2025-03-08 22:45:03,276: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-08 22:45:04,029: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-08 22:45:04,153: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-08 22:45:04,422: INFO: common: created directory at: artifacts]
[2025-03-08 22:45:04,515: INFO: common: created directory at: artifacts/model_evaluation]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2012
           1       0.99      1.00      1.00       974

    accuracy                           1.00      2986
   macro avg       1.00      1.00      1.00      2986
weighted avg       1.00      1.00      1.00      2986

Confusion Matrix:
 [[2006    6]
 [   3  971]]
Accuracy Score: 0.9969859343603483
[2025-03-08 22:45:14,577: INFO: 2880182927: json file saved at : artifacts\model_evaluation\metrics.json]


Registered model 'DecisionTreeClassifier' already exists. Creating a new version of this model...
Created version '2' of model 'DecisionTreeClassifier'.
