# DataIngestion

In [None]:
from dataclasses import dataclass
from datetime import datetime
import os
import numpy as np 
import os
import sys
from sklearn.model_selection import train_test_split
import pandas as pd


# --- MongoDB source ---------------------------------------------------------
DATABASE_NAME = "NETWORK_SECURITY"
COLLECTION_NAME = "NETWORK_DATA"
# NOTE(review): presumably the *name* of the environment variable that holds
# the connection string rather than the URL itself -- confirm with the reader.
MONGODB_URL = "MONGODB_URL"

# --- Output directory names -------------------------------------------------
ARTIFACTS_DIR = "artifacts"
PIPELINE_DIR = "security"

# --- Data-ingestion layout --------------------------------------------------
DATA_INGESTION_DIR_NAME: str = "data_ingestion"
DATA_INGESTION_COLLECTION_NAME: str = "NETWORK_DATA"
DATA_INGESTION_FEATURE_STORED_NAME: str = "feature"
DATA_INGESTION_INGESTED_NAME: str = "ingested"
# Default for Data_ingestion_configeration.split_ratio.
DATA_INGESTION_SPLIT_RATIO: float = 0.2

# --- File names written by ingestion ----------------------------------------
RAW_DATA = "security.csv"
TRAIN_DATA = "train.csv"
TEST_DATA = "test.csv"

# Captured once, when this module is first executed (month_day_year_H_M_S).
TIMESTAMP = f"{datetime.now():%m_%d_%Y_%H_%M_%S}"

@dataclass
class TrainingConfiguration:
    """Shared settings for a training run.

    Every field defaults to a module-level constant, so calling
    ``TrainingConfiguration()`` with no arguments yields the standard values.
    """

    # Directory name that other config classes join their paths onto.
    artifact_dir: str = ARTIFACTS_DIR
    # NOTE(review): looks like a misspelling of ``pipeline_dir``; left as is
    # because the field name is also the constructor keyword.
    piprline_dir: str = PIPELINE_DIR
    # Defaults to the module-level TIMESTAMP string.
    timestamp: str = TIMESTAMP

# Single shared instance; its artifact_dir anchors the ingestion paths below.
trainingconfig: TrainingConfiguration = TrainingConfiguration()


@dataclass
class Data_ingestion_configeration:
    """Default locations and settings for the data-ingestion stage.

    The path defaults are computed once, while the class body executes, from
    ``trainingconfig.artifact_dir`` and the ``DATA_INGESTION_*`` constants.

    NOTE(review): ``Data_Ingestion`` reads ``data_ingestion_collection_path``
    and ``data_ingestion_feature_path`` from its config, which do not exist
    here -- confirm whether this class or ``Data_ingestion_Config`` is the one
    actually passed in.
    """

    # <artifact_dir>/<DATA_INGESTION_DIR_NAME>
    data_ingestion_dir: str = os.path.join(
        trainingconfig.artifact_dir, DATA_INGESTION_DIR_NAME
    )
    # Name of the collection to read.
    data_ingestion_collection: str = DATA_INGESTION_COLLECTION_NAME
    # Path of the raw CSV inside the feature-store folder.
    data_ingestion_feature: str = os.path.join(
        data_ingestion_dir, DATA_INGESTION_FEATURE_STORED_NAME, RAW_DATA
    )
    # Train and test CSVs share the "ingested" folder.
    train_data_path: str = os.path.join(
        data_ingestion_dir, DATA_INGESTION_INGESTED_NAME, TRAIN_DATA
    )
    test_data_path: str = os.path.join(
        data_ingestion_dir, DATA_INGESTION_INGESTED_NAME, TEST_DATA
    )
    split_ratio: float = DATA_INGESTION_SPLIT_RATIO



# mongodb
from Network_Security.constant import MONGOBD_URL, DATA_BASE_NAME
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.logging.logger import logging
from dotenv import load_dotenv 
import certifi 
import pymongo
import sys
import os 


load_dotenv()
# MONGOBD_URL = os.getenv("MONGOBD_URL")
ca = certifi.where()  

class MongoDBClient:
    """Thin wrapper that exposes a MongoDB database handle.

    The underlying ``pymongo.MongoClient`` is stored on the class and created
    only once; later instances reuse it instead of opening another client
    (and another connection pool) per construction.

    Attributes:
        client: the shared ``pymongo.MongoClient``.
        database: ``client[database]`` for the requested database name.
        database_name: the database name passed to the constructor.
    """

    # Shared client; ``None`` until the first successful construction.
    client = None

    def __init__(self, database=DATA_BASE_NAME):
        """Connect (if not already connected) and select ``database``.

        Args:
            database: name of the database to select; defaults to
                ``DATA_BASE_NAME``.

        Raises:
            NetworkSecurityException: wrapping any failure, including the
                ``ValueError`` raised when the environment variable named by
                ``MONGOBD_URL`` is unset or empty.
        """
        try:
            if MongoDBClient.client is None:
                mongo_url = os.getenv(MONGOBD_URL)
                # Treat an empty value the same as a missing one: neither is a
                # usable connection string.
                if not mongo_url:
                    logging.info("MongoDB URL not found in environment variables")
                    raise ValueError("MongoDB URL is missing")

                MongoDBClient.client = pymongo.MongoClient(mongo_url, tlsCAFile=ca)

            self.client = MongoDBClient.client
            self.database = self.client[database]
            self.database_name = database

        except Exception as e:
            raise NetworkSecurityException(e, sys)
 

# networkdata_acess.py
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.logging.logger import logging
from Network_Security.configeration.mongodb import MongoDBClient 
from typing import Optional
import pandas as pd
import numpy as np 
import sys


class NetworkData:
    """Load MongoDB collections into pandas DataFrames."""

    def __init__(self):
        """Create the ``MongoDBClient`` used for all reads.

        Raises:
            NetworkSecurityException: wrapping any error from ``MongoDBClient()``.
        """
        try:
            self.mongo_client = MongoDBClient()
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def get_dataframe(self, collection_name: str, database_name: Optional[str] = None)->pd.DataFrame:
        """Return every document of a collection as a DataFrame.

        The ``_id`` column is removed when present and the string ``"na"`` is
        replaced with ``numpy.nan``.

        Args:
            collection_name: collection to read.
            database_name: optional database to read from; when falsy, the
                client's default ``database`` handle is used.

        Returns:
            The resulting DataFrame.

        Raises:
            NetworkSecurityException: wrapping any error raised while reading
                or converting the data.
        """
        try:
            # An explicit database goes through the raw client; otherwise use
            # the database the wrapper selected at construction time.
            source_db = (
                self.mongo_client.client[database_name]
                if database_name
                else self.mongo_client.database
            )
            documents = list(source_db[collection_name].find())

            frame = pd.DataFrame(documents)
            if "_id" in frame.columns:
                frame = frame.drop(columns=["_id"])
            frame = frame.replace("na", np.nan)

            logging.info("DataFrame Extract Successful")
            return frame

        except Exception as e:
            raise NetworkSecurityException(e, sys)


# @dataclass 
# class TrainingPipelineConfig:
#     TRAIN_DATA_PATH:str 
#     TEST_DATA:str 

# data_ingestion
import os
import sys
from sklearn.model_selection import train_test_split
import pandas as pd
from Network_Security.exception.exception import NetworkSecurityException
from Network_Security.logging.logger import logging
from Network_Security.entity.config import Data_ingestion_Config
from Network_Security.entity.artifact import Data_Ingestion_Artifact
from Network_Security.configeration.mongodb import MongoDBClient  
from Network_Security.data_acess.networkdata_acess import NetworkData 

class Data_Ingestion:
    """Export a MongoDB collection to CSV and split it into train/test files.

    The collection name, output paths and split ratio are all read from the
    ``ingestion_config`` object supplied at construction time.
    """

    def __init__(self, ingestion_config: Data_ingestion_Config):
        """Keep a reference to the ingestion configuration.

        Args:
            ingestion_config: object providing
                ``data_ingestion_collection_path``,
                ``data_ingestion_feature_path``, ``train_data_path``,
                ``test_data_path`` and ``split_ratio``.
        """
        self.ingestion_config = ingestion_config

    def get_feature_extract_data(self) -> pd.DataFrame:
        """Read the configured collection and store it as the feature CSV.

        Returns:
            The DataFrame read from MongoDB.

        Raises:
            NetworkSecurityException: wrapping any failure, including the
                ``ValueError`` raised when the collection yields no data.
        """
        try:
            logging.info("Extracting data from MongoDB...")
            networkdata = NetworkData()

            dataframe = networkdata.get_dataframe(
                collection_name=self.ingestion_config.data_ingestion_collection_path
            )
            # Fail here with a clear message instead of writing an empty CSV
            # and letting the later train/test split fail on zero samples.
            if dataframe.empty:
                raise ValueError(
                    "No data returned from collection "
                    f"{self.ingestion_config.data_ingestion_collection_path!r}"
                )

            # Persist the raw extract to the feature store.
            feature_data_path = self.ingestion_config.data_ingestion_feature_path
            os.makedirs(os.path.dirname(feature_data_path), exist_ok=True)
            dataframe.to_csv(feature_data_path, index=False, header=True)
            logging.info(f"Data stored at {feature_data_path}")
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def split_data(self, dataframe: pd.DataFrame):
        """Split ``dataframe`` and write the train and test CSV files.

        ``split_ratio`` from the config is passed to ``train_test_split`` as
        ``test_size``. No random seed is set, so the split differs per run.

        Args:
            dataframe: the full dataset to split.

        Returns:
            A ``(train_data, test_data)`` tuple of DataFrames.

        Raises:
            NetworkSecurityException: wrapping any failure while splitting or
                writing.
        """
        try:
            train_data, test_data = train_test_split(
                dataframe,
                test_size=self.ingestion_config.split_ratio
            )

            outputs = (
                (self.ingestion_config.train_data_path, train_data),
                (self.ingestion_config.test_data_path, test_data),
            )
            for file_path, subset in outputs:
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                subset.to_csv(file_path, index=False, header=True)

            logging.info("Train & Test datasets saved successfully.")
            return train_data, test_data
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def init_data_ingestion(self) -> Data_Ingestion_Artifact:
        """Run extraction and splitting, then describe the outputs.

        Returns:
            A ``Data_Ingestion_Artifact`` holding the train and test file paths.

        Raises:
            NetworkSecurityException: wrapping any failure in either step.
        """
        try:
            dataframe = self.get_feature_extract_data()
            # Logged rather than printed so the information reaches the
            # pipeline log instead of stdout.
            logging.info(f"Ingested dataframe shape: {dataframe.shape}")
            self.split_data(dataframe)

            data_ingestion_artifact = Data_Ingestion_Artifact(
                train_file_path=self.ingestion_config.train_data_path,
                test_file_path=self.ingestion_config.test_data_path
            )
            logging.info("Data Ingestion completed successfully.")
            return data_ingestion_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)
        

from Network_Security.components.data_ingestion import Data_Ingestion
from Network_Security.components.data_validation import Data_validation
from Network_Security.entity.config import (Data_ingestion_Config,
                                            Data_validation_config) 
from Network_Security.entity.artifact import (Data_Ingestion_Artifact,
                                              Data_validation_Artifact)



class Training_Pipeline:
    """Runs the training pipeline; this version has the ingestion stage only."""

    def __init__(self):
        """Build the default ingestion configuration."""
        self.data_ingestion_config = Data_ingestion_Config()

    def start_data_ingestion(self)->Data_Ingestion_Artifact:
        """Run data ingestion and return the artifact it produces."""
        ingestion_stage = Data_Ingestion(ingestion_config=self.data_ingestion_config)
        return ingestion_stage.init_data_ingestion()

    def run_pipeline(self)->None:
        """Execute every stage in order; the stage artifacts are not returned."""
        self.start_data_ingestion()

from Network_Security.pipeline.train_pipeline import Training_Pipeline
from Network_Security.logging.logger import logging
from Network_Security.exception.exception import NetworkSecurityException
import sys 

# Script entry point: run only the data-ingestion stage and log its artifact.
if __name__ == '__main__':
    logging.info('Starting Training Pipeline...')
    pipeline = Training_Pipeline()

    # Stage 1: data ingestion.
    logging.info('>>> Starting Data Ingestion')
    data_ingestion_artifact = pipeline.start_data_ingestion()
    logging.info(f'>>> Data Ingestion Completed: {data_ingestion_artifact}')



# Data_validation

In [None]:
# constant.__init__.py
# Folder and file names used to build the data-validation report path.
DATA_VALIDATION_DIR: str = "data_validation"
DATA_VALIDATION_REPORT_DIR: str = "drift_report"
DATA_VALIDATION_REPORT_YAML: str = "report.yaml"
# Relative path of the column schema file.
# NOTE(review): "SEHEMA" looks like a misspelling of "SCHEMA"; the name is kept
# because other modules import it under this spelling.
SEHEMA_FILE_PATH = os.path.join("data_schema", "column.yaml")

# config.py
from dataclasses import dataclass 
from datetime import datetime
from Network_Security.constant import *
# Captured once, when this module is first executed (month_day_year_H_M_S).
TIMESTAMP = f"{datetime.now():%m_%d_%Y_%H_%M_%S}"


@dataclass
class NS_Train_Configeration:
    """Settings for one training run, with a timestamped artifact directory.

    NOTE(review): ``ARTIFACTS`` and ``PIPELINE_DIR`` are expected to come from
    the ``Network_Security.constant`` star import -- confirm they are defined
    there under these names.
    """

    # <ARTIFACTS>/<TIMESTAMP>; uses the module-level TIMESTAMP because the
    # field of the same name is only defined further down the class body.
    artifact_dir: str = os.path.join(ARTIFACTS, TIMESTAMP)
    pipeline_dir: str = PIPELINE_DIR
    # Field default is the module-level TIMESTAMP string.
    TIMESTAMP: str = TIMESTAMP

# Shared training configuration; its artifact_dir anchors the paths below.
train_config = NS_Train_Configeration()


@dataclass
class Data_validation_config:
    """Default output locations for the data-validation stage.

    Declared as a dataclass with annotated fields so that it matches the other
    configuration classes: it gains a readable ``repr`` and optional keyword
    overrides, while ``Data_validation_config()`` with no arguments and
    class-level attribute access keep working as before.
    """

    # <artifact_dir>/<DATA_VALIDATION_DIR>
    data_validation_dir: str = os.path.join(
        train_config.artifact_dir, DATA_VALIDATION_DIR
    )
    # <data_validation_dir>/<DATA_VALIDATION_REPORT_DIR>/<DATA_VALIDATION_REPORT_YAML>
    data_validation_report: str = os.path.join(
        data_validation_dir, DATA_VALIDATION_REPORT_DIR, DATA_VALIDATION_REPORT_YAML
    )

# artifact.py
@dataclass
class Data_validation_Artifact:
    """Result record produced by the data-validation stage."""

    # True when no validation problem was recorded.
    validation_status: bool
    # NOTE(review): annotated as ``str`` but Data_validation.init_data_validation
    # passes a list of message strings -- confirm which type consumers expect.
    message_error: str
    # Path where the drift report is expected to be written.
    drift_report_file_path: str

from Network_Security.logging.logger import logging
from Network_Security.constant import SEHEMA_FILE_PATH
from Network_Security.utils import read_yaml_file, write_yaml_file
from Network_Security.entity.artifact import Data_Ingestion_Artifact, Data_validation_Artifact
from Network_Security.entity.config import Data_validation_config
from Network_Security.exception.exception import NetworkSecurityException
from evidently import Report
from evidently.presets import DataDriftPreset
import pandas as pd
import json
import sys

# data_validation.py
class Data_validation:
    """Check the ingested train/test CSVs against the schema and for drift.

    The schema is loaded from ``SEHEMA_FILE_PATH`` at construction time and is
    expected to provide the keys ``columns``, ``numeric_columns`` and
    ``categorical_columns``.
    """

    def __init__(self, data_ingestion_artifact: Data_Ingestion_Artifact,
                data_validation_config: Data_validation_config):
        """Store the inputs and load the schema file.

        Args:
            data_ingestion_artifact: provides ``train_file_path`` and
                ``test_file_path``.
            data_validation_config: provides ``data_validation_report``, the
                path the drift report is written to.

        Raises:
            NetworkSecurityException: wrapping any failure, including the
                ``ValueError`` raised when the schema is missing or empty.
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            self._schema_yaml = read_yaml_file(file_path=SEHEMA_FILE_PATH)
            # Falsy covers both ``None`` and an empty mapping, matching the
            # "not loaded or is empty" wording of the error.
            if not self._schema_yaml:
                raise ValueError(f"Schema file not loaded or is empty: {SEHEMA_FILE_PATH}")
        except Exception as e:
            raise NetworkSecurityException(e,sys)

    def valid_no_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True when the column count equals the schema's.

        Only the number of columns is compared here; names are checked by
        ``is_column_exists``.

        Raises:
            NetworkSecurityException: wrapping any failure (e.g. a missing
                ``columns`` key in the schema).
        """
        try:
            expected_columns = self._schema_yaml['columns']
            return len(dataframe.columns) == len(expected_columns)
        except Exception as e:
            raise NetworkSecurityException(e,sys)

    def is_column_exists(self, dataframe: pd.DataFrame) -> bool:
        """Return True when every numeric and categorical schema column exists.

        Missing column names are logged per group.

        Raises:
            NetworkSecurityException: wrapping any failure (e.g. a missing
                schema key).
        """
        try:
            present = set(dataframe.columns)
            missing_num_columns = [
                col for col in self._schema_yaml['numeric_columns'] if col not in present
            ]
            missing_cat_columns = [
                col for col in self._schema_yaml['categorical_columns'] if col not in present
            ]

            if missing_num_columns:
                logging.info(f'Missing numeric columns: {missing_num_columns}')
            if missing_cat_columns:
                logging.info(f'Missing categorical columns: {missing_cat_columns}')

            return not (missing_num_columns or missing_cat_columns)
        except Exception as e:
            raise NetworkSecurityException(e,sys)

    def detect_dataset_drift(self, reference_df: pd.DataFrame, current_df: pd.DataFrame) -> bool:
        """Run a data-drift report and tell whether any column drifted.

        Side effects: writes ``data_drift_report.html`` to the current working
        directory and writes the report as YAML to
        ``data_validation_config.data_validation_report``.

        Args:
            reference_df: baseline data.
            current_df: data compared against the baseline.

        Returns:
            True when the reported drifted-column count is greater than zero.

        Raises:
            NetworkSecurityException: wrapping any failure, including a report
                that has no ``DriftedColumnsCount`` metric.
        """
        try:
            # ``include_tests`` is a boolean flag; pass a real bool rather
            # than the string "True".
            report = Report([DataDriftPreset()], include_tests=True)
            snapshot = report.run(reference_data=reference_df, current_data=current_df)
            snapshot.save_html("data_drift_report.html")

            # Round-trip through JSON to get a plain dict for the YAML writer.
            report_dict = json.loads(snapshot.json())
            write_yaml_file(
                file_path=self.data_validation_config.data_validation_report,
                content=report_dict)

            metrics = report_dict["metrics"]
            n_features = sum(1 for m in metrics if "ValueDrift" in m["metric_id"])
            drift_metric = next(
                (m for m in metrics if "DriftedColumnsCount" in m["metric_id"]),
                None,
            )
            # Raise a descriptive error instead of a bare StopIteration.
            if drift_metric is None:
                raise ValueError("DriftedColumnsCount metric not found in drift report")
            n_drifted_features = drift_metric["value"]["count"]

            # Any drifted column marks the whole dataset as drifted.
            drift_status = n_drifted_features > 0
            logging.info(f"{n_drifted_features}/{n_features} features show drift.")
            return drift_status
        except Exception as e:
            logging.error(f"Error in dataset drift detection: {e}")
            raise NetworkSecurityException (e,sys)

    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        """Read a CSV file into a DataFrame."""
        return pd.read_csv(file_path)

    def init_data_validation(self) -> Data_validation_Artifact:
        """Validate both splits, run drift detection if they pass, build the artifact.

        Drift detection runs only when no schema problem was found. Its outcome
        ('Drift detected' / 'Drift not detected') is appended to the message
        list but does not change ``validation_status``, which reflects the
        schema checks only.

        Returns:
            A ``Data_validation_Artifact`` whose ``message_error`` is the list
            of collected messages.

        Raises:
            NetworkSecurityException: wrapping any failure.
        """
        try:
            valid_message_error = []
            train_data = self.read_data(self.data_ingestion_artifact.train_file_path)
            test_data = self.read_data(self.data_ingestion_artifact.test_file_path)

            # Same two checks for each split, train first then test.
            for split_name, frame in (("train", train_data), ("test", test_data)):
                if not self.valid_no_columns(frame):
                    valid_message_error.append(f'Error: Column Mismatch in {split_name} data')
                if not self.is_column_exists(frame):
                    valid_message_error.append(f'Error: Missing columns in {split_name} data')

            # Status is fixed before the drift message is appended.
            validation_status = len(valid_message_error) == 0
            if validation_status:
                drift_status = self.detect_dataset_drift(train_data, test_data)
                valid_message_error.append(
                    'Drift detected' if drift_status else 'Drift not detected'
                )
            else:
                logging.info(f'Validation errors: {valid_message_error}')

            return Data_validation_Artifact(
                validation_status=validation_status,
                message_error=valid_message_error,
                drift_report_file_path=self.data_validation_config.data_validation_report
            )

        except Exception as e:
            raise NetworkSecurityException(e, sys)

# train_pipeline.py 
from Network_Security.components.data_ingestion import Data_Ingestion
from Network_Security.components.data_validation import Data_validation
from Network_Security.entity.config import (Data_ingestion_Config,
                                            Data_validation_config) 
from Network_Security.entity.artifact import (Data_Ingestion_Artifact,
                                              Data_validation_Artifact)
class Training_Pipeline:
    """Runs the training pipeline: data ingestion, then data validation."""

    def __init__(self):
        """Build the default configuration object for each stage."""
        self.data_ingestion_config = Data_ingestion_Config()
        self.data_validation_config = Data_validation_config()

    def start_data_ingestion(self)->Data_Ingestion_Artifact:
        """Run data ingestion and return the artifact it produces."""
        ingestion_stage = Data_Ingestion(ingestion_config=self.data_ingestion_config)
        return ingestion_stage.init_data_ingestion()

    def start_data_validation(self, data_ingestion_artifact: Data_Ingestion_Artifact) -> Data_validation_Artifact:
        """Run data validation on the ingestion output and return its artifact."""
        validation_stage = Data_validation(
            data_ingestion_artifact=data_ingestion_artifact,
            data_validation_config=self.data_validation_config,
        )
        return validation_stage.init_data_validation()

    def run_pipeline(self)->None:
        """Execute both stages in order; the stage artifacts are not returned."""
        ingestion_artifact = self.start_data_ingestion()
        self.start_data_validation(ingestion_artifact)

# app.py 
from Network_Security.pipeline.train_pipeline import Training_Pipeline
from Network_Security.logging.logger import logging
from Network_Security.exception.exception import NetworkSecurityException
import sys 

# Script entry point: run ingestion then validation, logging each stage's
# artifact. Any exception is re-raised wrapped in NetworkSecurityException.
if __name__ == '__main__':
    try:
        logging.info('Starting Training Pipeline...')
        pipeline = Training_Pipeline()

        # Data Ingestion
        logging.info('>>> Starting Data Ingestion')
        data_ingestion_artifact = pipeline.start_data_ingestion()
        logging.info(f'>>> Data Ingestion Completed: {data_ingestion_artifact}')

        # Data Validation (consumes the ingestion artifact from the step above)
        logging.info('>>> Starting Data Validation')
        data_validation_artifact = pipeline.start_data_validation(data_ingestion_artifact)
        logging.info(f'>>> Data Validation Completed: {data_validation_artifact}')

        logging.info('Pipeline finished successfully')
        
    except Exception as e:
        # Stage failures are wrapped so the caller sees one exception type.
        raise NetworkSecurityException(e, sys)


