# DataIngestion

In [None]:
from dataclasses import dataclass
from datetime import datetime
import os
import numpy as np 
import os
import sys
from sklearn.model_selection import train_test_split
import pandas as pd


# MongoDB database and collection names.
DATABASE_NAME = 'NETWORK_SECURITY'
COLLECTION_NAME = 'NETWORK_DATA' 
# NOTE(review): this is the literal string 'MONGODB_URL', not a connection
# string — presumably the name of an environment variable; confirm against
# the MongoDB client, which imports a differently spelled constant.
MONGODB_URL = 'MONGODB_URL'

# Directory names used as defaults by TrainingConfiguration.
ARTIFACTS_DIR = 'artifacts'
PIPELINE_DIR = 'security'

# Path components and settings consumed by the data-ingestion config below.
DATA_INGESTION_DIR_NAME : str = 'data_ingestion'
DATA_INGESTION_COLLECTION_NAME: str = 'NETWORK_DATA'
DATA_INGESTION_FEATURE_STORED_NAME:str = 'feature'
DATA_INGESTION_INGESTED_NAME:str = 'ingested'
# Default for the ingestion config's split_ratio field.
DATA_INGESTION_SPLIT_RATIO:float = 0.2 

# File names for the raw dump and for the two splits.
RAW_DATA = 'security.csv'
TRAIN_DATA = 'train.csv'
TEST_DATA = 'test.csv'

# Evaluated once, when this module/cell is executed.
TIMESTAMP = datetime.now().strftime('%m_%d_%Y_%H_%M_%S')

@dataclass
class TrainingConfiguration:
    """Settings shared by the pipeline stages.

    Attributes:
        artifact_dir: Defaults to ``ARTIFACTS_DIR``.
        piprline_dir: Defaults to ``PIPELINE_DIR``. The spelling is part of
            the public field name and is therefore left unchanged.
        timestamp: Defaults to the module-level ``TIMESTAMP``.
    """

    artifact_dir: str = ARTIFACTS_DIR
    piprline_dir: str = PIPELINE_DIR
    timestamp: str = TIMESTAMP


# Default instance; the ingestion configuration derives its paths from it.
trainingconfig: TrainingConfiguration = TrainingConfiguration()

@dataclass 
class Data_ingestion_configeration:
    data_ingestion_dir:str = os.path.join(trainingconfig.artifact_dir,DATA_INGESTION_DIR_NAME)
    data_ingestion_collection:str = DATA_INGESTION_COLLECTION_NAME 
    data_ingestion_feature:str = os.path.join(data_ingestion_dir,DATA_INGESTION_FEATURE_STORED_NAME,RAW_DATA)
    train_data_path:str = os.path.join(data_ingestion_dir,DATA_INGESTION_INGESTED_NAME,TRAIN_DATA)
    test_data_path:str = os.path.join(data_ingestion_dir,DATA_INGESTION_INGESTED_NAME,TEST_DATA)
    split_ratio:float = DATA_INGESTION_SPLIT_RATIO



# mongodb
from Network_Security.constant import MONGOBD_URL, DATA_BASE_NAME
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.logging.logger import logging
from dotenv import load_dotenv 
import certifi 
import pymongo
import sys
import os 


# Load variables from a .env file, if one is found, into the environment.
load_dotenv()
# MONGOBD_URL = os.getenv("MONGOBD_URL")
# Path to certifi's CA bundle; handed to MongoClient as tlsCAFile below.
ca = certifi.where()  

class MongoDBClient:
    """Handle on a MongoDB database backed by one shared ``MongoClient``.

    The underlying ``pymongo.MongoClient`` is created the first time the
    class is instantiated and stored on the class; later instances reuse it
    instead of opening a new client each time.

    Attributes:
        client: The shared ``pymongo.MongoClient``.
        database: ``client[database]`` for the requested database name.
        database_name: The database name this instance was created with.
    """

    # Shared client; None until the first successful instantiation.
    client = None

    def __init__(self, database=DATA_BASE_NAME):
        """Connect (once) and select ``database``.

        The connection string is read from the environment variable whose
        name is stored in ``MONGOBD_URL``.

        Raises:
            NetworkSecurityException: Wraps any failure, including a missing
                connection string.
        """
        try:
            if MongoDBClient.client is None:
                mongo_url = os.getenv(MONGOBD_URL)
                if mongo_url is None:
                    logging.info("MongoDB URL not found in environment variables")
                    raise ValueError("MongoDB URL is missing")

                MongoDBClient.client = pymongo.MongoClient(mongo_url, tlsCAFile=ca)

            self.client = MongoDBClient.client
            self.database = self.client[database]
            self.database_name = database

        except Exception as e:
            raise NetworkSecurityException(e, sys)
 

#networkdata_acces
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.logging.logger import logging
from Network_Security.configeration.mongodb import MongoDBClient 
from typing import Optional
import pandas as pd
import numpy as np 
import sys


class NetworkData:
    """Read MongoDB collections into pandas DataFrames."""

    def __init__(self):
        """Create the MongoDB client wrapper used for all reads."""
        try:
            self.mongo_client = MongoDBClient()
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def get_dataframe(self, collection_name: str, database_name: Optional[str] = None)->pd.DataFrame:
        """Return every document of a collection as a DataFrame.

        Args:
            collection_name: Collection to read.
            database_name: Database to read from; when falsy, the client's
                default database is used.

        Returns:
            The documents as rows, without the ``_id`` column, and with the
            literal string ``"na"`` replaced by NaN.

        Raises:
            NetworkSecurityException: Wraps any error raised while reading.
        """
        try:
            if not database_name:
                source = self.mongo_client.database[collection_name]
            else:
                source = self.mongo_client.client[database_name][collection_name]

            records = list(source.find())
            frame = pd.DataFrame(records)
            if "_id" in frame.columns:
                frame = frame.drop(columns=["_id"])
            frame = frame.replace("na", np.nan)

            logging.info("DataFrame Extract Successful")
            return frame

        except Exception as e:
            raise NetworkSecurityException(e, sys)


# @dataclass 
# class TrainingPipelineConfig:
#     TRAIN_DATA_PATH:str 
#     TEST_DATA:str 

# data_ingestion
import os
import sys
from sklearn.model_selection import train_test_split
import pandas as pd
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.logging.logger import logging
from Network_Security.entity.config import Data_ingestion_Config
from Network_Security.entity.artifact import Data_Ingestion_Artifact
from Network_Security.configeration.mongodb import MongoDBClient  
from Network_Security.data_acess.networkdata_acess import NetworkData 

class Data_Ingestion:
    """Pull the raw collection from MongoDB and write raw/train/test CSVs."""

    def __init__(self, ingestion_config: Data_ingestion_Config):
        """Store the ingestion configuration."""
        try:
            self.ingestion_config = ingestion_config
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def _write_csv(frame, destination):
        """Create the parent directory of ``destination`` and write ``frame`` there."""
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        frame.to_csv(destination, index=False, header=True)

    def get_feature_extract_data(self):
        """Read the configured collection and store it as the feature CSV.

        Returns:
            The DataFrame read from MongoDB.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            logging.info("Extracting data from MongoDB...")
            source = NetworkData()

            frame = source.get_dataframe(
                collection_name=self.ingestion_config.data_ingestion_collection_path
            )
            # start feature_store
            store_path = self.ingestion_config.data_ingestion_feature_path
            self._write_csv(frame, store_path)
            logging.info(f"Data stored at {store_path}")
            return frame
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def split_data(self, dataframe: pd.DataFrame):
        """Split ``dataframe`` into train/test parts and write both as CSV.

        The test fraction is ``ingestion_config.split_ratio``; no random
        seed is passed to ``train_test_split``.

        Returns:
            ``(train_data, test_data)``.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            config = self.ingestion_config
            train_part, test_part = train_test_split(
                dataframe, test_size=config.split_ratio
            )

            self._write_csv(train_part, config.train_data_path)
            self._write_csv(test_part, config.test_data_path)

            logging.info("Train & Test datasets saved successfully.")
            return train_part, test_part
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def init_data_ingestion(self):
        """Run extraction and splitting; return the ingestion artifact.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            frame = self.get_feature_extract_data()
            print(frame.head())
            self.split_data(frame)

            config = self.ingestion_config
            artifact = Data_Ingestion_Artifact(
                train_file_path=config.train_data_path,
                test_file_path=config.test_data_path,
            )
            logging.info("Data Ingestion completed successfully.")
            return artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)
        

from Network_Security.components.data_ingestion import Data_Ingestion
from Network_Security.components.data_validation import Data_validation
from Network_Security.entity.config import (Data_ingestion_Config,
                                            Data_validation_config) 
from Network_Security.entity.artifact import (Data_Ingestion_Artifact,
                                              Data_validation_Artifact)



class Training_Pipeline:
    """Pipeline driver that currently runs only the data-ingestion stage."""

    def __init__(self):
        self.data_ingestion_config = Data_ingestion_Config()

    def start_data_ingestion(self)->Data_Ingestion_Artifact:
        """Run data ingestion and return its artifact."""
        stage = Data_Ingestion(ingestion_config=self.data_ingestion_config)
        return stage.init_data_ingestion()

    def run_pipeline(self)->None:
        """Run every stage in order; the ingestion artifact is not used further."""
        self.start_data_ingestion()

from Network_Security.pipeline.train_pipeline import Training_Pipeline
from Network_Security.logging.logger import logging
from Network_Security.exception.exception import NetworkSecurityException
import sys 

if __name__ == '__main__':
    # Script entry point: build the pipeline and run the ingestion stage,
    # logging before and after it.
    logging.info('Starting Training Pipeline...')
    pipeline = Training_Pipeline()

    # Data Ingestion
    logging.info('>>> Starting Data Ingestion')
    data_ingestion_artifact = pipeline.start_data_ingestion()
    logging.info(f'>>> Data Ingestion Completed: {data_ingestion_artifact}')



# Data_validation

In [None]:
# constant.__init__.py
# Directory and file names combined by Data_validation_config into the
# drift-report path.
DATA_VALIDATION_DIR:str = 'data_validation'
DATA_VALIDATION_REPORT_DIR:str = 'drift_report'
DATA_VALIDATION_REPORT_YAML:str = 'report.yaml'
# Relative path of the schema YAML read by Data_validation. The constant is
# imported under this exact ("SEHEMA") spelling elsewhere in this file.
SEHEMA_FILE_PATH = os.path.join('data_schema','column.yaml')

# config.py
from dataclasses import dataclass 
from datetime import datetime
from Network_Security.constant import *
# Evaluated once, when this module/cell is executed.
TIMESTAMP = datetime.now().strftime('%m_%d_%Y_%H_%M_%S')


@dataclass
class NS_Train_Configeration:
    """Run-level settings.

    ``artifact_dir`` defaults to ``ARTIFACTS`` joined with the module-level
    ``TIMESTAMP``; both defaults are evaluated while the class body runs.
    """

    artifact_dir: str = os.path.join(ARTIFACTS, TIMESTAMP)
    pipeline_dir: str = PIPELINE_DIR
    TIMESTAMP: str = TIMESTAMP


# Default instance from which the stage configurations derive their paths.
train_config = NS_Train_Configeration()
class Data_validation_config:
    """Paths for the data-validation stage, held as plain class attributes.

    Both values are computed once, while the class body runs, from
    ``train_config.artifact_dir``.
    """

    # Root directory of the validation stage's outputs.
    data_validation_dir = os.path.join(train_config.artifact_dir, DATA_VALIDATION_DIR)
    # Full path of the YAML drift report.
    data_validation_report = os.path.join(
        data_validation_dir, DATA_VALIDATION_REPORT_DIR, DATA_VALIDATION_REPORT_YAML
    )

# artifact.py
@dataclass
class Data_validation_Artifact:
    """Result of the data-validation stage.

    Attributes:
        validation_status: True when no schema check failed.
        message_error: List of message strings collected during validation
            (``Data_validation.init_data_validation`` passes a list here,
            so the field is annotated as a list rather than ``str``).
        drift_report_file_path: Path of the YAML drift report.
    """

    validation_status: bool
    message_error: list
    drift_report_file_path: str

from Network_Security.logging.logger import logging
from Network_Security.constant import SEHEMA_FILE_PATH
from Network_Security.utils import read_yaml_file, write_yaml_file
from Network_Security.entity.artifact import Data_Ingestion_Artifact, Data_validation_Artifact
from Network_Security.entity.config import Data_validation_config
from Network_Security.exception.exception import NetworkSecurityException
from evidently import Report
from evidently.presets import DataDriftPreset
import pandas as pd
import json
import sys

# data_validation.py
class Data_validation:
    """Check the ingested train/test CSVs against the schema and for drift."""

    def __init__(self, data_ingestion_artifact: Data_Ingestion_Artifact,
                data_validation_config: Data_validation_config):
        """Store the inputs and load the schema YAML.

        Raises:
            NetworkSecurityException: Wraps any failure, including a schema
                file that loads as None.
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            self._schema_yaml = read_yaml_file(file_path=SEHEMA_FILE_PATH)
            if self._schema_yaml is None:
                raise ValueError(f"Schema file not loaded or is empty: {SEHEMA_FILE_PATH}")
        except Exception as e:
            raise NetworkSecurityException(e,sys)

    def valid_no_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True when the frame has as many columns as the schema's ``columns`` entry.

        Only the counts are compared, not the column names.
        """
        try:
            expected_columns = self._schema_yaml['columns']
            return len(dataframe.columns) == len(expected_columns)
        except Exception as e:
            raise NetworkSecurityException(e,sys)

    def is_column_exists(self, dataframe: pd.DataFrame) -> bool:
        """Return True when every schema numeric and categorical column is present.

        Missing column names are logged per group.
        """
        try:
            present = dataframe.columns
            missing_num_columns = [col for col in self._schema_yaml['numeric_columns'] if col not in present]
            missing_cat_columns = [col for col in self._schema_yaml['categorical_columns'] if col not in present]

            if missing_num_columns:
                logging.info(f'Missing numeric columns: {missing_num_columns}')
            if missing_cat_columns:
                logging.info(f'Missing categorical columns: {missing_cat_columns}')

            return not (missing_num_columns or missing_cat_columns)
        except Exception as e:
            raise NetworkSecurityException(e,sys)

    def detect_dataset_drift(self, reference_df: pd.DataFrame, current_df: pd.DataFrame) -> bool:
        """Run a data-drift report and return True when any column drifted.

        Side effects: writes ``data_drift_report.html`` to the current
        working directory and the report dict to the configured YAML path.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            # include_tests is a boolean flag; the string "True" only worked
            # because any non-empty string is truthy.
            report = Report([DataDriftPreset()], include_tests=True)
            snapshot = report.run(reference_data=reference_df, current_data=current_df)
            snapshot.save_html("data_drift_report.html")
            report_dict = json.loads(snapshot.json())
            write_yaml_file(
                file_path=self.data_validation_config.data_validation_report,
                content=report_dict)

            metrics = report_dict["metrics"]
            n_features = sum(1 for m in metrics if "ValueDrift" in m["metric_id"])
            drift_metric = next(m for m in metrics if "DriftedColumnsCount" in m["metric_id"])
            n_drifted_features = drift_metric["value"]["count"]
            # Dataset drift status
            drift_status = n_drifted_features > 0
            print(n_features, n_drifted_features, drift_status)
            logging.info(f"{n_drifted_features}/{n_features} features show drift.")
            return drift_status
        except Exception as e:
            logging.info(f"Error in dataset drift detection: {e}")
            raise NetworkSecurityException (e,sys)

    # Static method to read CSV
    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        """Read the CSV at ``file_path`` into a DataFrame."""
        return pd.read_csv(file_path)

    def init_data_validation(self) -> Data_validation_Artifact:
        """Validate both splits and, if they pass, run drift detection.

        ``validation_status`` reflects the schema checks only; the drift
        outcome is appended to the message list but does not change it.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            valid_message_error = []
            # Read train and test data
            train_data = self.read_data(self.data_ingestion_artifact.train_file_path)
            test_data = self.read_data(self.data_ingestion_artifact.test_file_path)

            # Same two checks for each split, train first.
            for label, frame in (('train', train_data), ('test', test_data)):
                if not self.valid_no_columns(frame):
                    valid_message_error.append(f'Error: Column Mismatch in {label} data')
                if not self.is_column_exists(frame):
                    valid_message_error.append(f'Error: Missing columns in {label} data')

            # Drift detection
            validation_status = len(valid_message_error) == 0
            if validation_status:
                drift_status = self.detect_dataset_drift(train_data, test_data)
                if drift_status:
                    valid_message_error.append('Drift detected')
                else:
                    valid_message_error.append('Drift not detected')
            else:
                logging.info(f'Validation errors: {valid_message_error}')

            #Create artifact
            data_validation_artifact = Data_validation_Artifact(
                validation_status=validation_status,
                message_error=valid_message_error,
                drift_report_file_path=self.data_validation_config.data_validation_report
            )
            return data_validation_artifact

        except Exception as e:
            raise NetworkSecurityException(e, sys)

# train_pipeline.py 
from Network_Security.components.data_ingestion import Data_Ingestion
from Network_Security.components.data_validation import Data_validation
from Network_Security.entity.config import (Data_ingestion_Config,
                                            Data_validation_config) 
from Network_Security.entity.artifact import (Data_Ingestion_Artifact,
                                              Data_validation_Artifact)
class Training_Pipeline:
    """Pipeline driver covering data ingestion and data validation."""

    def __init__(self):
        self.data_ingestion_config = Data_ingestion_Config()
        self.data_validation_config = Data_validation_config()

    def start_data_ingestion(self)->Data_Ingestion_Artifact:
        """Run data ingestion and return its artifact."""
        stage = Data_Ingestion(ingestion_config=self.data_ingestion_config)
        return stage.init_data_ingestion()

    def start_data_validation(self, data_ingestion_artifact: Data_Ingestion_Artifact) -> Data_validation_Artifact:
        """Run data validation on the ingestion output and return its artifact."""
        stage = Data_validation(
            data_ingestion_artifact=data_ingestion_artifact,
            data_validation_config=self.data_validation_config,
        )
        return stage.init_data_validation()

    def run_pipeline(self)->None:
        """Run ingestion, then validation; the validation artifact is not used further."""
        ingestion_result = self.start_data_ingestion()
        self.start_data_validation(ingestion_result)

# app.py 
from Network_Security.pipeline.train_pipeline import Training_Pipeline
from Network_Security.logging.logger import logging
from Network_Security.exception.exception import NetworkSecurityException
import sys 

if __name__ == '__main__':
    # Script entry point: run ingestion, then validation, logging around
    # each stage. Any exception is re-raised as NetworkSecurityException.
    try:
        logging.info('Starting Training Pipeline...')
        pipeline = Training_Pipeline()

        # Data Ingestion
        logging.info('>>> Starting Data Ingestion')
        data_ingestion_artifact = pipeline.start_data_ingestion()
        logging.info(f'>>> Data Ingestion Completed: {data_ingestion_artifact}')

        # Data Validation (consumes the ingestion artifact)
        logging.info('>>> Starting Data Validation')
        data_validation_artifact = pipeline.start_data_validation(data_ingestion_artifact)
        logging.info(f'>>> Data Validation Completed: {data_validation_artifact}')

        logging.info('Pipeline finished successfully')
        
    except Exception as e:
        raise NetworkSecurityException(e, sys)




# Data_transformation

In [None]:
from dataclasses import dataclass 
from datetime import datetime
from Network_Security.constant import *
# Evaluated once, when this module/cell is executed.
TIMESTAMP = datetime.now().strftime('%m_%d_%Y_%H_%M_%S')


@dataclass
class NS_Train_Configeration:
    """Run-level settings.

    ``artifact_dir`` defaults to ``ARTIFACTS`` joined with the module-level
    ``TIMESTAMP``; both defaults are evaluated while the class body runs.
    """

    artifact_dir: str = os.path.join(ARTIFACTS, TIMESTAMP)
    pipeline_dir: str = PIPELINE_DIR
    TIMESTAMP: str = TIMESTAMP


# Default instance from which the stage configurations derive their paths.
train_config = NS_Train_Configeration()

@dataclass
class Data_Transformation_Config:
    """Output paths for the data-transformation stage.

    The attributes carry no annotations, so ``@dataclass`` registers no
    fields; they are plain class attributes computed once from
    ``train_config.artifact_dir``.

    NOTE(review): ``DATA_TRANSFORMATION_TRANSFORM_0BJECT_FILE`` is spelled
    with a zero and ``PREPROCESSING_FILE`` differs from the
    ``PREPROCESSOR_FILE`` constant defined later in this file — confirm both
    names exist in ``Network_Security.constant``.
    """

    data_transformation_dir = os.path.join(
        train_config.artifact_dir, DATA_TRANSFORMATION_DIR
    )
    # 'csv' in the split file names is swapped for 'npy'.
    data_transformation_train_file = os.path.join(
        data_transformation_dir,
        DATA_TRANSFORMATION_TRANSFORM_FILE,
        TRAIN_DATA.replace('csv', 'npy'),
    )
    data_transformation_test_file = os.path.join(
        data_transformation_dir,
        DATA_TRANSFORMATION_TRANSFORM_FILE,
        TEST_DATA.replace('csv', 'npy'),
    )
    data_transformation_object_pkl = os.path.join(
        data_transformation_dir,
        DATA_TRANSFORMATION_TRANSFORM_0BJECT_FILE,
        PREPROCESSING_FILE,
    )

@dataclass
class Data_Transformation_Artifact:
    """File paths produced by the data-transformation stage.

    Attributes:
        transform_object: Path of the saved preprocessing object.
        transform_train_file: Path of the saved train array.
        transform_test_file: Path of the saved test array.
    """

    transform_object: str
    transform_train_file: str
    transform_test_file: str


from Network_Security.constant import * 
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.utils import read_yaml_file, save_object, save_numpy_array
from Network_Security.entity.artifact import (
    Data_Ingestion_Artifact,
    Data_validation_Artifact,
    Data_Transformation_Artifact
)
from Network_Security.entity.config import Data_Transformation_Config

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
import pandas as pd 
import numpy as np
import sys

class DataTransformation:
    """Impute the ingested splits and persist the arrays and the preprocessor."""

    def __init__(self,
                 data_ingestion_artifact: Data_Ingestion_Artifact,
                 data_validation_artifact: Data_validation_Artifact,
                 data_transformation_config: Data_Transformation_Config):
        """Store the stage inputs and load the schema YAML.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_artifact = data_validation_artifact
            self.data_transformation_config = data_transformation_config
            self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def get_data_transformation(self) -> Pipeline:
        """Return an unfitted pipeline holding one KNN imputer step.

        The imputer is configured from ``DATA_TRANSFORMATION_IMPUTER_PARAMS``.
        """
        try:
            steps = [('imputer', KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS))]
            return Pipeline(steps)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        """Read the CSV at ``file_path``; failures are wrapped in NetworkSecurityException."""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def init_data_transformation(self):
        """Fit the preprocessor on train, transform both splits and save everything.

        The target column has -1 replaced by 0 and is appended as the last
        column of each saved array.

        Returns:
            A ``Data_Transformation_Artifact`` with the three output paths.

        Raises:
            NetworkSecurityException: Wraps any failure.
        """
        try:
            ingested = self.data_ingestion_artifact
            config = self.data_transformation_config

            train_df = DataTransformation.read_data(ingested.train_file_path)
            test_df = DataTransformation.read_data(ingested.test_file_path)

            def split_xy(frame):
                # Features are everything except the target; -1 labels become 0.
                features = frame.drop(columns=[TARGET_COLUMN], axis=1)
                target = frame[TARGET_COLUMN].replace(-1, 0)
                return features, target

            train_x, train_y = split_xy(train_df)
            test_x, test_y = split_xy(test_df)

            # Fit on train only, then apply the fitted pipeline to test.
            preprocessor = self.get_data_transformation()
            train_x_arr = preprocessor.fit_transform(train_x)
            test_x_arr = preprocessor.transform(test_x)

            # Target goes in as the last column.
            train_arr = np.c_[train_x_arr, np.array(train_y)]
            test_arr = np.c_[test_x_arr, np.array(test_y)]

            save_object(config.data_transformation_object_pkl, obj=preprocessor)
            save_numpy_array(config.data_transformation_train_file, array=train_arr)
            save_numpy_array(config.data_transformation_test_file, array=test_arr)

            return Data_Transformation_Artifact(
                transform_object=config.data_transformation_object_pkl,
                transform_train_file=config.data_transformation_train_file,
                transform_test_file=config.data_transformation_test_file,
            )

        except Exception as e:
            raise NetworkSecurityException(e, sys)


# Practice Preprocessing

In [None]:
import os
import yaml
import pickle

def read_yaml_file(file_path):
    """Load a YAML file and return its parsed content.

    Args:
        file_path: Path of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file (None for an
        empty document).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    with open(file_path, 'rb') as file:
        # The parsed document was previously discarded; return it.
        return yaml.safe_load(file)

def read_data(df: str) -> pd.DataFrame:
    """Read a CSV into a DataFrame.

    Args:
        df: Despite its name, this is the CSV location (a path, or anything
            else ``pd.read_csv`` accepts), not a DataFrame. The parameter
            name is kept for backward compatibility.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(df)

def drop_col(df:pd.DataFrame,col):
    """Drop one or more columns from ``df`` in place and return ``df``.

    Args:
        df: Frame to modify (mutated in place, as before).
        col: A single column label, or a list of labels.

    Returns:
        The same ``df`` object with the columns removed. Previously the
        function returned None, because an ``inplace=True`` drop returns
        None, so callers that assigned the result lost their frame.
    """
    # A list is used as-is; wrapping it again would create a nested list.
    columns = col if isinstance(col, list) else [col]
    df.drop(columns=columns, inplace=True)
    return df

class TargetValueMapping:
    """Label-to-integer mapping for the target column (male=0, female=1)."""

    def __init__(self):
        self.male = 0
        self.female = 1

    def _asdict(self):
        """Return the instance's attribute dict (label -> code)."""
        return vars(self)

    def reverse(self):
        """Return the inverse mapping (code -> label)."""
        return {code: label for label, code in self._asdict().items()}
    
def save_numpy_array(file_path: str, array: np.ndarray):
    """Save ``array`` to ``file_path`` with ``np.save``.

    Args:
        file_path: Destination file; its parent directory is created when
            missing.
        array: Array to store.
    """
    dir_path = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(file_path, 'wb') as file_obj:
        np.save(file_obj, array)

def save_object(file_path: str,obj):
    """Pickle ``obj`` to ``file_path``.

    Args:
        file_path: Destination file; its parent directory is created when
            missing.
        obj: Any picklable object.
    """
    dir_path = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(file_path, 'wb') as file_obj:
        # pickle.dump takes the object first and the file second; the
        # arguments were previously swapped.
        pickle.dump(obj, file_obj)
    


In [None]:
from dataclasses import dataclass 
import os 
from datetime import datetime 
import pandas as pd 
import numpy as np 


@dataclass
class Data_Ingestion_Artifact:
    """Paths of the CSV files written by the ingestion stage.

    Attributes:
        train_file_path: Path of the train split.
        test_file_path: Path of the test split.
    """

    train_file_path: str
    test_file_path: str

@dataclass
class Data_validation_Artifact:
    """Result of the data-validation stage.

    Attributes:
        validation_status: Outcome flag; ``Data_Transformation`` only runs
            when it is truthy.
        message: Validation message.
        drift_report_file_path: Path of the drift report.
    """

    validation_status: bool
    message: str
    drift_report_file_path: str

@dataclass
class DataTransformationArtifact:
    """File paths produced by the data-transformation stage.

    Attributes:
        transformed_object_file_path: Path of the saved preprocessor.
        transformed_train_file_path: Path of the saved train array.
        transformed_test_file_path: Path of the saved test array.
    """

    transformed_object_file_path: str
    transformed_train_file_path: str
    transformed_test_file_path: str

# Artifacts
# Default directory names for the run-level configuration below.
ARTIFACTS = 'artifacts'
PIPELINE_DIR = 'network'


@dataclass
class NS_Train_Configeration:
    """Run-level settings.

    ``TIMESTAMP`` carries no annotation, so it is a plain class attribute
    (evaluated once when the class body runs), not a dataclass field.
    """

    artifact_dir: str = ARTIFACTS
    pipeline_dir: str = PIPELINE_DIR
    TIMESTAMP = datetime.now().strftime('%m_%d_%Y_%H_%M_%S')


# Default instance from which the stage configurations derive their paths.
train_config = NS_Train_Configeration()

# Directory names combined by Data_Transformation_config into output paths.
DATA_TRANSFORMATION_DIR:str = 'data_transformation'
DATA_TRANSFORMATION_TRANSFORM_FILE:str = 'transform'
DATA_TRANSFORMATION_OBJECT_DIR:str = 'transform_object' 

# data
RAW_DATA = 'raw.csv'
TRAIN_DATA = 'train.csv'
TEST_DATA = 'test.csv'
PREPROCESSOR_FILE = 'preprocessor.pkl'
# NOTE(review): placeholder value — presumably to be replaced by the real
# target column name before Data_Transformation can run; confirm.
TARGET_COLUMN = '------------------------------------------'
# Evaluated once when this cell runs; Data_Transformation subtracts the
# 'company_estabilish' column from it.
CURRENT_DATE = datetime.now()

class Data_Transformation_config:
    """Output paths of the transformation stage, held as class attributes.

    All values are computed once, while the class body runs, from
    ``train_config.artifact_dir``.
    """

    data_transformation_dir = os.path.join(
        train_config.artifact_dir, DATA_TRANSFORMATION_DIR
    )
    # 'csv' in the split file names is swapped for 'npy'.
    data_transformation_train_file = os.path.join(
        data_transformation_dir,
        DATA_TRANSFORMATION_TRANSFORM_FILE,
        TRAIN_DATA.replace('csv', 'npy'),
    )
    data_transformation_test_file = os.path.join(
        data_transformation_dir,
        DATA_TRANSFORMATION_TRANSFORM_FILE,
        TEST_DATA.replace('csv', 'npy'),
    )
    # Destination of the pickled preprocessor.
    data_transformation_object = os.path.join(
        data_transformation_dir, DATA_TRANSFORMATION_OBJECT_DIR, PREPROCESSOR_FILE
    )

from Network_Security.constant import SCHEMA_FILE_PATH
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,PowerTransformer
from sklearn.compose import ColumnTransformer
@dataclass
class DataTransformationArtifact:
    """File paths produced by the data-transformation stage.

    Attributes:
        transformed_object_file_path: Path of the saved preprocessor.
        transformed_train_file_path: Path of the saved train array.
        transformed_test_file_path: Path of the saved test array.
    """

    transformed_object_file_path: str
    transformed_train_file_path: str
    transformed_test_file_path: str

class Data_Transformation:
    """Encode/scale the ingested splits, rebalance them and persist the result."""

    # NOTE(review): the defaults below are the classes themselves, which
    # looks like annotations written as defaults. The signature is kept
    # unchanged for backward compatibility; pass real instances.
    def __init__(self,data_ingestion_artifact=Data_Ingestion_Artifact,
                 data_validation_artifact=Data_validation_Artifact,
                 data_transformation_config=Data_Transformation_config):
        """Store the stage inputs and load the schema YAML."""
        self.data_ingestion_artifact = data_ingestion_artifact
        self.data_validation_artifact= data_validation_artifact
        self.data_transformation_config= data_transformation_config
        self._sehema = read_yaml_file(file_path=SCHEMA_FILE_PATH)

    def get_data_transformation(self):
        """Build the unfitted ColumnTransformer described by the schema.

        The schema keys ``ohe_columns``, ``or_columns``, ``transform_columns``
        and ``numerical_columns`` select the columns for each transformer.
        """
        ohe_transform = OneHotEncoder()
        or_transform = OrdinalEncoder()
        scaler = StandardScaler()
        pw_transform = PowerTransformer(method='yeo-johnson')

        ohe_col = self._sehema['ohe_columns']
        or_col = self._sehema['or_columns']
        num_col = self._sehema['numerical_columns']
        power_tf_col = self._sehema['transform_columns']

        preprocessor = ColumnTransformer([
            ('OneHotEncoder',ohe_transform,ohe_col),
            ('OrdinalEncoder',or_transform,or_col),
            ('PowerTransformer',pw_transform,power_tf_col),
            ('StandardScaler',scaler,num_col)

        ])
        return preprocessor

    @staticmethod
    def read_data(dataframe: str):
        """Read a CSV; ``dataframe`` is the file path despite its name."""
        return pd.read_csv(dataframe)

    def _prepare_split(self, frame, columns_to_drop, target_mapping):
        """Split ``frame`` into engineered features and the mapped target."""
        features = frame.drop(columns=[TARGET_COLUMN])
        target = frame[TARGET_COLUMN].replace(target_mapping)

        # NOTE(review): CURRENT_DATE is a datetime; this presumably expects
        # 'company_estabilish' to hold datetimes as well — confirm.
        features['company_age'] = CURRENT_DATE - features['company_estabilish']
        features = features.drop(columns=columns_to_drop)
        return features, target

    def init_data_transformation(self):
        """Transform both splits and save arrays plus the fitted preprocessor.

        Returns:
            A ``DataTransformationArtifact`` with the three output paths, or
            None when ``validation_status`` of the validation artifact is
            falsy (nothing is transformed in that case).
        """
        if not self.data_validation_artifact.validation_status:
            return None

        config = self.data_transformation_config
        preprocessor = self.get_data_transformation()
        train_df = Data_Transformation.read_data(self.data_ingestion_artifact.train_file_path)
        test_df = Data_Transformation.read_data(self.data_ingestion_artifact.test_file_path)

        # The local used to be called drop_col, which shadowed the helper of
        # the same name and made the call fail; the frame is now dropped
        # directly with a distinct local name.
        columns_to_drop = self._sehema['drop_col']
        target_mapping = TargetValueMapping()._asdict()

        xtrain, ytrain = self._prepare_split(train_df, columns_to_drop, target_mapping)
        xtest, ytest = self._prepare_split(test_df, columns_to_drop, target_mapping)

        # Fit on train only, then apply the fitted transformer to test.
        xtrain_arr = preprocessor.fit_transform(xtrain)
        xtest_arr = preprocessor.transform(xtest)

        from imblearn.combine import SMOTEENN
        smt = SMOTEENN(sampling_strategy="minority")
        xtrain_arr,ytrain = smt.fit_resample(xtrain_arr,ytrain)

        # NOTE(review): the test split is resampled as well (kept as in the
        # original); resampling evaluation data changes its distribution —
        # confirm this is intended.
        smt = SMOTEENN(sampling_strategy="minority")
        xtest_arr,ytest = smt.fit_resample(xtest_arr,ytest)

        # Target goes in as the last column.
        train_arr = np.c_[xtrain_arr,np.array(ytrain)]
        test_arr = np.c_[xtest_arr,np.array(ytest)]

        save_object(config.data_transformation_object,preprocessor)
        save_numpy_array(config.data_transformation_train_file,array=train_arr)
        # The test array was previously written to the train path,
        # overwriting the train array.
        save_numpy_array(config.data_transformation_test_file,array=test_arr)

        # Use the attribute names the config class actually defines.
        return DataTransformationArtifact(
            transformed_object_file_path=config.data_transformation_object,
            transformed_train_file_path=config.data_transformation_train_file,
            transformed_test_file_path=config.data_transformation_test_file,
        )
                

class Training_Pipeline:
    """Pipeline driver; only the transformation stage is active in this draft."""

    def __init__(self):
    #   self.data_ingestion_config = Data_ingestion_Config()
    #   self.validation_config = Data_validation_config()
        self.data_transformation_config= Data_Transformation_config()



    # def start_data_ingestion(self)->Data_Ingestion_Artifact:
    #     data_ingestion = Data_Ingestion(ingestion_config=self.data_ingestion_config)
    #     data_ingestion_artifacet = data_ingestion.init_data_ingestion()
    #     return data_ingestion_artifacet

    # def start_data_validation(self,data_ingestion_artifacet:Data_Ingestion_Artifact)-> Data_validation_Artifact:
    #     data_valid = Data_validation(data_ingestion_artifacet=data_ingestion_artifacet,
    #                                   data_validation_config=self.data_validation_config)
    #     data_validation_Artifact = data_valid.init_data_ingestion()
    #     return data_validation_Artifact

    def start_data_transform(self,data_ingestion_artifacet:Data_Ingestion_Artifact,
                             data_validation_Artifact:Data_validation_Artifact)->DataTransformationArtifact:
        """Run the transformation stage and return its artifact.

        Args:
            data_ingestion_artifacet: Output of the ingestion stage.
            data_validation_Artifact: Output of the validation stage.
        """
        # Data_Transformation's constructor parameters are spelled
        # data_ingestion_artifact / data_validation_artifact; the previous
        # keyword names did not match and raised TypeError.
        transformer = Data_Transformation(
            data_ingestion_artifact=data_ingestion_artifacet,
            data_validation_artifact=data_validation_Artifact,
            data_transformation_config=self.data_transformation_config,
        )
        return transformer.init_data_transformation()

# Model_Trainer

In [None]:
# constants
# Path components combined by Model_Trainer_Config into the model path.
MODEL_TRAINER_DIR: str = 'model_trainer'
MODEL_TRAINER_FILE_NAME: str = 'trained_model'
MODEL_TRAINER_TRAINED_MODEL_NAME: str = 'model.pkl'
# Relative path of the YAML passed to ModelFactory by the trainer.
MODEL_TRAINER_CONFIG_PARAM_PATH: str = os.path.join('data_schema', 'best_param.yaml')
# Passed to ModelFactory.get_best_model as base_accuracy by the trainer.
MODEL_TRAINER_EXCEPTED_RATIO: float = 0.6

# config
from dataclasses import dataclass

# Settings for the model-trainer stage. The attributes carry no annotations,
# so @dataclass registers no fields; they are plain class attributes computed
# once while the class body runs.
@dataclass 
class Model_Trainer_Config:
    model_trainer_dir = os.path.join(train_config.artifact_dir, MODEL_TRAINER_DIR)
    # Full path of the pickled trained model.
    model_trained_path = os.path.join(model_trainer_dir, MODEL_TRAINER_FILE_NAME, MODEL_TRAINER_TRAINED_MODEL_NAME)
    model_trained_config_param_path = MODEL_TRAINER_CONFIG_PARAM_PATH
    excepted_ratio = MODEL_TRAINER_EXCEPTED_RATIO
    # NOTE(review): Metrics_Artifact is defined further down in this file, so
    # running the file top to bottom raises NameError here — presumably the
    # real config module imports it; confirm.
    metrics_artifact = Metrics_Artifact

# artifact
@dataclass
class Metrics_Artifact:
    """Classification metrics recorded for a trained model.

    Attributes:
        f1_score: F1 score.
        accuracy_score: Accuracy.
        recall_score: Recall.
        precision_score: Precision.
    """

    f1_score: float
    accuracy_score: float
    recall_score: float
    precision_score: float

@dataclass
class Model_Trainer_Artifact:
    """Output of the model-trainer stage.

    Attributes:
        model_pkl: Path of the pickled model.
        metrics_artifact: Metrics recorded for that model.
    """

    model_pkl: str
    metrics_artifact: Metrics_Artifact


# model_trainer
from Network_Security.entity.artifact import (Data_Transformation_Artifact,
                                              Metrics_Artifact,
                                              Model_Trainer_Arifact)
from Network_Security.entity.config import Model_Trainer_Config
from Network_Security.utils import load_numpy_array,load_object,save_object
from Network_Security.logging.logger import logging
from Network_Security.exception.exception import NetworkSecurityException

from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.pipeline import Pipeline
from neuro_mf import ModelFactory
from typing import Tuple
import numpy as np 
import pandas as pd
import sys



class Network_model:
    """Pair a preprocessing object with a model so both run in one call.

    ``predict`` first applies ``transform_object.transform`` to the incoming
    frame and then feeds the result to ``best_model_details.predict``.
    """

    def __init__(self, transform_object: Pipeline, best_model_details: object) -> None:
        """Store the two collaborators used by :meth:`predict`.

        Args:
            transform_object: Object whose ``transform`` method is applied to
                the input data.
            best_model_details: Object whose ``predict`` method is called on
                the transformed features.
        """
        # __init__ always returns None; the previous Tuple annotation was wrong.
        self.transform_object = transform_object
        self.best_model_details = best_model_details

    def predict(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Transform ``dataframe`` and return the model's predictions.

        Args:
            dataframe: Raw input rows to score.

        Returns:
            A DataFrame with a single column named ``prediction``.

        Raises:
            NetworkSecurityException: Wraps any error raised while
                transforming or predicting.
        """
        try:
            transformed_features = self.transform_object.transform(dataframe)
            predictions = self.best_model_details.predict(transformed_features)

            return pd.DataFrame(predictions, columns=['prediction'])
        except Exception as e:
            # Chain explicitly so the original error is kept as __cause__.
            raise NetworkSecurityException(e, sys) from e

class Model_Train:
    """Pick the best model for the transformed data, score it and save it."""

    def __init__(self, data_transformation_artifact: Data_Transformation_Artifact,
                 model_trainer_config: Model_Trainer_Config):
        """Keep references to the upstream artifact and this stage's config.

        Args:
            data_transformation_artifact: Supplies ``transform_train_file``,
                ``transform_test_file`` and ``transform_object``.
            model_trainer_config: Supplies ``model_trained_config_param_path``,
                ``excepted_ratio`` and ``model_trained_path``.
        """
        self.data_transformation_artifact = data_transformation_artifact
        self.model_trainer_config = model_trainer_config

    def get_best_model_indentify(self, train_arr: np.ndarray, test_arr: np.ndarray):
        """Search for the best model on the train split and score it on test.

        Args:
            train_arr: 2-D array; the last column is used as the target and
                all preceding columns as features.
            test_arr: 2-D array with the same column layout as ``train_arr``.

        Returns:
            A ``(best_model_details, metrics_artifact)`` tuple, where
            ``best_model_details`` is whatever ``ModelFactory.get_best_model``
            returned and ``metrics_artifact`` is a ``Metrics_Artifact`` built
            from the test-split predictions.

        Raises:
            NetworkSecurityException: Wraps any error raised along the way.
        """
        try:
            model_factory = ModelFactory(self.model_trainer_config.model_trained_config_param_path)

            # Last column is the target, everything before it is a feature.
            xtrain, ytrain = train_arr[:, :-1], train_arr[:, -1]
            xtest, ytest = test_arr[:, :-1], test_arr[:, -1]

            best_model_details = model_factory.get_best_model(
                X=xtrain,
                y=ytrain,
                base_accuracy=self.model_trainer_config.excepted_ratio,
            )

            best_model = best_model_details.best_model
            logging.info(f"Best model: {best_model}")
            pred = best_model.predict(xtest)

            metrics_artifact = Metrics_Artifact(
                f1_score=f1_score(ytest, pred),
                accuracy_score=accuracy_score(ytest, pred),
                recall_score=recall_score(ytest, pred),
                precision_score=precision_score(ytest, pred),
            )
            # Report through the logger used elsewhere in this class rather
            # than writing to stdout.
            logging.info(f"Test metrics: {metrics_artifact}")
            logging.info(f"Best score: {best_model_details.best_score}")
            logging.info(f"Best parameters: {best_model_details.best_parameters}")

            return best_model_details, metrics_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def init_best_model(self):
        """Train, wrap and persist the best model; return the stage artifact.

        Loads the transformed train/test arrays and the saved transform
        object, selects the best model, wraps the transform object and the
        fitted estimator in a ``Network_model`` and saves that wrapper to
        ``model_trainer_config.model_trained_path``.

        Returns:
            A ``Model_Trainer_Arifact`` carrying the saved model path and the
            computed metrics.

        Raises:
            NetworkSecurityException: Wraps any error raised along the way.
        """
        try:
            train_arr = load_numpy_array(self.data_transformation_artifact.transform_train_file)
            test_arr = load_numpy_array(self.data_transformation_artifact.transform_test_file)

            best_model_details, metrics_artifact = self.get_best_model_indentify(train_arr, test_arr)
            transform_object = load_object(self.data_transformation_artifact.transform_object)

            # NOTE(review): a score below the threshold is only logged; the
            # model is still saved below. Confirm that this is intended.
            if best_model_details.best_score < self.model_trainer_config.excepted_ratio:
                logging.info("Best model not found with expected accuracy.")

            # Network_model.predict calls .predict on its second argument, so
            # it must receive the fitted estimator (best_model), not the
            # best_model_details wrapper that merely carries it.
            network_model_obj = Network_model(transform_object, best_model_details.best_model)
            save_object(self.model_trainer_config.model_trained_path, network_model_obj)

            # NOTE(review): the keyword is ``metrics`` here while the
            # Model_Trainer_Artifact dataclass declared in this file names the
            # field ``metrics_artifact``; verify against the imported
            # Model_Trainer_Arifact definition.
            model_trainer_artifact = Model_Trainer_Arifact(
                model_pkl=self.model_trainer_config.model_trained_path,
                metrics=metrics_artifact
            )

            return model_trainer_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e


def strat_model_trainer(self,data_transformation_artifact:Data_Transformation_Artifact):
    """Run the model-trainer stage and return the artifact it produces.

    Builds a ``Model_Train`` from the given transformation artifact and
    ``self.model_trainer_config``, then delegates to ``init_best_model``.
    Any exception is re-raised wrapped in ``NetworkSecurityException``.
    """
    try:
        trainer = Model_Train(
            model_trainer_config=self.model_trainer_config,
            data_transformation_artifact=data_transformation_artifact,
        )
        return trainer.init_best_model()
    except Exception as e:
        raise NetworkSecurityException(e,sys)

def run_pipeline(self)->None:
    """Execute ingestion, validation, transformation and training in order.

    Each stage receives the artifact(s) returned by the stages before it.
    Nothing is returned; any exception is re-raised wrapped in
    ``NetworkSecurityException``.
    """
    try:
        ingested = self.start_data_ingestion()
        validated = self.start_data_validation(ingested)
        transformed = self.start_data_transformation(ingested, validated)
        # The trainer's artifact is not used any further here.
        self.strat_model_trainer(transformed)
    except Exception as e:
        raise NetworkSecurityException(e,sys)

from Network_Security.pipeline.train_pipeline import Training_Pipeline
from Network_Security.logging.logger import logging
from Network_Security.exception.exception import NetworkSecurityException
import sys 

# Script entry point: runs the four training-pipeline stages one after another,
# logging before and after each stage. Any failure is re-raised wrapped in
# NetworkSecurityException.
if __name__ == '__main__':
    try:
        logging.info('Starting Training Pipeline...')
        pipeline = Training_Pipeline()

        # Stage 1 - data ingestion; its artifact feeds validation and transformation.
        logging.info('>>> Starting Data Ingestion')
        data_ingestion_artifact = pipeline.start_data_ingestion()
        logging.info(f'>>> Data Ingestion Completed: {data_ingestion_artifact}')

        # Stage 2 - data validation, driven by the ingestion artifact.
        logging.info('---------->>> Starting Data Validation-------------->>>')
        data_validation_artifact = pipeline.start_data_validation(data_ingestion_artifact)
        logging.info(f'>>> Data Validation Completed: {data_validation_artifact}')

        # Stage 3 - data transformation; needs both earlier artifacts.
        logging.info('>>> Starting Data Transformation')
        data_transformation_artifact = pipeline.start_data_transformation(data_ingestion_artifact,data_validation_artifact)
        logging.info(f'>>> Data Transformation Completed: {data_transformation_artifact}')

        # Stage 4 - model training on the transformation artifact
        # (the method name is spelled "strat_model_trainer" on the pipeline).
        logging.info('---------->>> Starting Model Trainer -------------->>>')
        model_trainer_artifact = pipeline.strat_model_trainer(data_transformation_artifact)
        logging.info(f'>>> Model Trainer Completed: {model_trainer_artifact}')


        logging.info('Pipeline finished successfully')
        
    except Exception as e:
        # Wrap whatever went wrong in the project's exception type.
        raise NetworkSecurityException(e, sys)