In [42]:
# ---------------------------------------------------------------------------
# MongoDB
# ---------------------------------------------------------------------------
DATA_BASE_NAME = "NETWORK_SECURITY"
COLLECTION_NAME = "NETWORK_DATA"
# NOTE(review): the identifier is spelled "MONGOBD"; it is kept unchanged so
# existing references keep working. Its value equals the environment-variable
# name that this notebook reads with os.getenv in a later cell.
MONGOBD_URL = "MONGODB_URL"

# ---------------------------------------------------------------------------
# Artifacts
# ---------------------------------------------------------------------------
ARTIFACTS = "artifacts"
PIPELINE_DIR = "network"

# ---------------------------------------------------------------------------
# Data ingestion
# ---------------------------------------------------------------------------
DATA_INGESTION_DIR: str = "data_ingestion"
DATA_INGESTION_COLLECTION_NAME: str = "NETWORK_DATA"
DATA_INGESTION_FEATURE_STORED_DIR: str = "feature"
DATA_INGESTION_INGESTED_DIR: str = "ingested"
# Passed as ``test_size`` to train_test_split by the ingestion step.
DATA_INGESTION_SPLIT_RATIO: float = 0.2

# ---------------------------------------------------------------------------
# Data file names
# ---------------------------------------------------------------------------
RAW_DATA = "raw.csv"
TRAIN_DATA = "train.csv"
TEST_DATA = "test.csv"


In [43]:
from dataclasses import dataclass 
from datetime import datetime

@dataclass
class NS_Train_Configeration:
    """Top-level settings shared by the training pipeline stages.

    Fields:
        artifact_dir: Root directory name for pipeline outputs
            (defaults to ``ARTIFACTS``).
        pipeline_dir: Pipeline name (defaults to ``PIPELINE_DIR``).
    """

    artifact_dir: str = ARTIFACTS
    pipeline_dir: str = PIPELINE_DIR

    # Deliberately left without an annotation: @dataclass therefore does NOT
    # treat it as a field. It is a plain class attribute whose value is
    # computed once, when this class body is executed.
    TIMESTAMP = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")


# Module-level instance used by the configuration classes defined below.
train_config = NS_Train_Configeration()
    

In [44]:
from dataclasses import dataclass
import os 


@dataclass
class Data_ingestion_Config:
    """Paths and settings used by the data-ingestion stage.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: values show up in ``repr``, take part in ``==`` and can
    be overridden through keyword arguments of the constructor. Because each
    field has a simple default, the defaults also remain readable directly on
    the class (``Data_ingestion_Config.train_data_path``).

    The defaults are evaluated once, when the class body runs, from
    ``train_config`` and the module-level constants. Overriding
    ``data_ingestion_path`` in the constructor does NOT recompute the paths
    that were derived from its default.

    Fields:
        data_ingestion_path: Base directory of the ingestion stage.
        data_ingestion_collection_path: Name of the MongoDB collection to read.
        data_ingestion_feature_path: File path of the raw CSV export.
        train_data_path: File path of the train split CSV.
        test_data_path: File path of the test split CSV.
        split_ratio: Value passed as ``test_size`` when splitting.
    """

    data_ingestion_path: str = os.path.join(
        train_config.artifact_dir, DATA_INGESTION_DIR
    )
    data_ingestion_collection_path: str = DATA_INGESTION_COLLECTION_NAME
    data_ingestion_feature_path: str = os.path.join(
        data_ingestion_path, DATA_INGESTION_FEATURE_STORED_DIR, RAW_DATA
    )
    train_data_path: str = os.path.join(
        data_ingestion_path, DATA_INGESTION_INGESTED_DIR, TRAIN_DATA
    )
    test_data_path: str = os.path.join(
        data_ingestion_path, DATA_INGESTION_INGESTED_DIR, TEST_DATA
    )
    split_ratio: float = DATA_INGESTION_SPLIT_RATIO


@dataclass
class Data_Ingestion_Artifact:
    """Record holding the two file paths produced by data ingestion.

    NOTE(review): a later cell in this notebook defines another class with the
    same name whose fields are called ``train_data_path`` / ``test_data_path``;
    when the cells run in order, that later definition replaces this one.
    """

    # Path of the training split file.
    train_file_path: str
    # Path of the test split file.
    test_file_path: str



In [45]:
import pymongo
import certifi 
import sys 
import os

In [57]:
import os
import pymongo
import certifi
import logging
from dotenv import load_dotenv

# Load variables from a .env file (when one is found) into the environment.
load_dotenv()

# Connection string taken from the environment; None when the variable is unset.
MONGODB_URL = os.environ.get("MONGODB_URL")

# Location of certifi's CA bundle, handed to MongoClient as ``tlsCAFile``.
ca = certifi.where()

class MongoDBClient:
    """Small wrapper that exposes one database of a shared MongoDB connection.

    The ``pymongo.MongoClient`` is created the first time the class is
    instantiated and stored on the class, so later instances reuse it instead
    of opening a new connection.

    Attributes:
        client: The shared ``pymongo.MongoClient``.
        database: Database handle obtained as ``client[database]``.
        database_name: Name of that database.
    """

    # Shared by all instances; populated lazily by the first __init__ call.
    client = None

    def __init__(self, database=DATA_BASE_NAME):
        """Connect (if not connected yet) and select ``database``.

        Args:
            database: Name of the database to expose; defaults to
                ``DATA_BASE_NAME``.

        Raises:
            ValueError: If the ``MONGODB_URL`` environment variable is unset,
                empty or only whitespace.
        """
        if MongoDBClient.client is None:
            mongo_url = os.getenv("MONGODB_URL")
            # An empty or blank value is as unusable as a missing one; reject
            # it here rather than letting pymongo fail with a vaguer error.
            if not mongo_url or not mongo_url.strip():
                logging.error("MONGODB_URL is missing in .env")
                raise ValueError("MongoDB URL not found in environment variables")

            MongoDBClient.client = pymongo.MongoClient(mongo_url, tlsCAFile=ca)

        self.client = MongoDBClient.client
        self.database = self.client[database]
        self.database_name = database


In [61]:
from typing import Optional
import pandas as pd
import numpy as np

class NetworkData:
    """Loads MongoDB collections into pandas DataFrames."""

    def __init__(self):
        """Create the ``MongoDBClient`` wrapper used for every read."""
        self.mongo_client = MongoDBClient()

    def get_dataframe(self, connection: str, database_name: Optional[str] = None):
        """Return the full contents of a collection as a DataFrame.

        Args:
            connection: Name of the collection to read.
            database_name: Database that holds the collection. When ``None``,
                the wrapper's default database (``mongo_client.database``) is
                used; otherwise the database is looked up on the underlying
                client (``mongo_client.client``).

        Returns:
            pandas.DataFrame with one row per document, the ``_id`` column
            removed when present, and every ``'na'`` value replaced by NaN.
        """
        if database_name is None:
            collection = self.mongo_client.database[connection]
        else:
            # The wrapper object itself is not subscriptable; databases are
            # reached through the MongoClient it holds.
            collection = self.mongo_client.client[database_name][connection]

        df = pd.DataFrame(list(collection.find()))
        if "_id" in df.columns:
            df = df.drop(columns="_id")

        # replace() returns a new frame and needs a mapping (not a set), so
        # the result has to be the value that is returned.
        return df.replace({"na": np.nan})

In [62]:
@dataclass
class Data_Ingestion_Artifact:
    """Record holding the two file paths produced by data ingestion.

    NOTE(review): this rebinds the ``Data_Ingestion_Artifact`` name that an
    earlier cell already defined; the earlier version named its fields
    ``train_file_path`` / ``test_file_path``, this one uses
    ``train_data_path`` / ``test_data_path``.
    """

    # Path of the training split file.
    train_data_path: str
    # Path of the test split file.
    test_data_path: str

from sklearn.model_selection import train_test_split

class Data_Ingestion:
    """Exports a MongoDB collection to CSV and writes train/test split files."""

    def __init__(self, ingestion_config=Data_ingestion_Config):
        """Store the ingestion settings.

        Args:
            ingestion_config: Object exposing ``data_ingestion_collection_path``,
                ``data_ingestion_feature_path``, ``train_data_path``,
                ``test_data_path`` and ``split_ratio``. The default is the
                ``Data_ingestion_Config`` class itself (not an instance); its
                values are then read as class attributes.
        """
        self.ingestion_config = ingestion_config

    @staticmethod
    def _write_csv(frame, file_path):
        """Write ``frame`` to ``file_path``, creating the parent directory if needed."""
        parent_dir = os.path.dirname(file_path)
        # os.makedirs('') raises, so only create a directory when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        frame.to_csv(file_path, index=False, header=True)

    def get_feature_extract_data(self):
        """Read the configured collection and save it as the raw CSV.

        Returns:
            The DataFrame that was read from MongoDB.
        """
        source = NetworkData()
        dataframe = source.get_dataframe(
            connection=self.ingestion_config.data_ingestion_collection_path
        )
        self._write_csv(dataframe, self.ingestion_config.data_ingestion_feature_path)
        return dataframe

    def split_data(self, dataframe, random_state=None):
        """Split ``dataframe`` into train/test parts and write both to CSV.

        Args:
            dataframe: Data to split.
            random_state: Optional seed forwarded to ``train_test_split`` so
                the split can be reproduced. ``None`` (the default) keeps the
                unseeded behaviour.

        Returns:
            Tuple ``(train_data, test_data)``.
        """
        train_data, test_data = train_test_split(
            dataframe,
            test_size=self.ingestion_config.split_ratio,
            random_state=random_state,
        )
        self._write_csv(train_data, self.ingestion_config.train_data_path)
        self._write_csv(test_data, self.ingestion_config.test_data_path)
        return train_data, test_data

    def init_data_ingestion(self):
        """Run export and split, then describe where the split files are.

        Returns:
            Data_Ingestion_Artifact built from the configured train and test
            file paths.
        """
        dataframe = self.get_feature_extract_data()
        print(dataframe.head())
        self.split_data(dataframe)

        # Positional arguments on purpose: this notebook defines
        # Data_Ingestion_Artifact twice with different field names
        # (train_file_path/test_file_path vs train_data_path/test_data_path).
        # Both put the train path first and the test path second, so this
        # call works with whichever definition is in effect.
        return Data_Ingestion_Artifact(
            self.ingestion_config.train_data_path,
            self.ingestion_config.test_data_path,
        )
        


In [None]:
class Training_Pipeline:
    """Drives the pipeline stages; only data ingestion is wired in here."""

    def __init__(self):
        """Instantiate the configuration for the data-ingestion stage."""
        self.data_ingestion_config = Data_ingestion_Config()

    def start_data_ingestion(self) -> Data_Ingestion_Artifact:
        """Run data ingestion with the stored config and return its artifact."""
        ingestion_step = Data_Ingestion(ingestion_config=self.data_ingestion_config)
        return ingestion_step.init_data_ingestion()

    def run_pipeline(self) -> None:
        """Execute the pipeline; the ingestion artifact is not returned."""
        self.start_data_ingestion()


In [None]:
if __name__ == '__main__':
    # Exceptions are left to propagate on their own so the traceback is shown
    # as-is; wrapping this in a handler that only re-raises adds nothing.
    data_ingestion = Training_Pipeline()
    # run_pipeline() is declared to return None, so this name ends up bound to
    # None; it is kept because it is a module-level name of this cell.
    data_ingestion_artifact = data_ingestion.run_pipeline()