In [1]:
import os
# Step one directory up from wherever the kernel started so that the relative
# paths used by later cells (e.g. "config/config.yaml", "artifacts/...") resolve.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel moves up one more level each time.
os.chdir("../")
# IPython shell escape: print the resulting working directory as a sanity check.
!pwd

/home/aditya/network_security


In [15]:
from pathlib import Path
from pydantic import BaseModel,HttpUrl

class ModelTrainerConfig(BaseModel):
    """Validated settings consumed by the model-training stage.

    Instances are assembled by ``ConfigurationManager.get_model_trainer_config``
    from three YAML sources (config, params and schema).
    """

    ## from config
    # Directory that receives the pickled scaler, the pickled best model and its metadata file.
    root_dir: Path
    # CSV read as the training set.
    train_data_path: Path
    # CSV read as the hold-out test set.
    test_data_path: Path
  
    # Used as both the MLflow tracking URI and the model-registry URI.
    mlflow_uri: HttpUrl
    # Name of the MLflow experiment that runs are recorded under.
    mlflow_experiment: str
    # File name (inside root_dir) for the pickled StandardScaler.
    standard_scaler_name: str
    
    ## from params
    # Model names to train: the keys of the `model` mapping in the params file.
    models: list
    # Per-model hyper-parameter grid, keyed by model name.
    hyperparams: dict

    ## from schema 
    # Name of the label column that is separated from the feature columns.
    target_column: str

In [16]:
from src.NetworkSecurity.constants import *
from src.NetworkSecurity.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Parse the config, params and schema files, then create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyper-parameter YAML.
            schema_filepath: Location of the dataset schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self)->ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's output directory as a side effect.

        Returns:
            A ``ModelTrainerConfig`` populated from the ``model_trainer`` sections
            of the config and params files and from the schema's ``TARGET_COLUMN``.
        """
        trainer_section = self.config.model_trainer
        trainer_params = self.params.model_trainer
        target_spec = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        # Values copied verbatim from the config file's `model_trainer` section.
        copied_fields = (
            "root_dir",
            "train_data_path",
            "test_data_path",
            "mlflow_uri",
            "mlflow_experiment",
            "standard_scaler_name",
        )
        settings = {field: getattr(trainer_section, field) for field in copied_fields}

        # Model names and their grids both come from the params file's `model` mapping.
        settings["models"] = list(trainer_params.model.keys())
        settings["hyperparams"] = trainer_params.model.to_dict()

        # Label column name as declared in the schema file.
        settings["target_column"] = target_spec.name

        return ModelTrainerConfig(**settings)

In [17]:
# Load the YAML files from their default locations and build the trainer settings.
cm = ConfigurationManager()
model_trainer_config = cm.get_model_trainer_config()
# Bare last expression: display the validated config so its values can be inspected.
model_trainer_config

[2025-03-31 21:20:40,652: INFO: common : Yaml File: config/config.yaml loaded successfully]
[2025-03-31 21:20:40,656: INFO: common : Yaml File: params.yaml loaded successfully]
[2025-03-31 21:20:40,663: INFO: common : Yaml File: schema.yaml loaded successfully]
[2025-03-31 21:20:40,664: INFO: common : created directory at: artifacts]
[2025-03-31 21:20:40,665: INFO: common : created directory at: artifacts/model_trainer]


ModelTrainerConfig(root_dir=PosixPath('artifacts/model_trainer'), train_data_path=PosixPath('artifacts/data_transformation/train.csv'), test_data_path=PosixPath('artifacts/data_transformation/test.csv'), mlflow_uri=HttpUrl('https://dagshub.com/adityaav80/E2E-Network-Security.mlflow'), mlflow_experiment='Model_training', standard_scaler_name='ss.pkl', models=['LogisticRegression'], hyperparams={'LogisticRegression': {'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga'], 'C': [0.01, 0.1, 1]}}, target_column='CLASS_LABEL')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.NetworkSecurity.logging.logger import logger
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import mlflow
from sklearn.preprocessing import StandardScaler
import pickle
from urllib.parse import urlparse
from mlflow.models import infer_signature
from sklearn.model_selection import GridSearchCV
from dotenv import load_dotenv
import datetime

# python-dotenv: copy variables from a .env file (if one is found) into os.environ.
# NOTE(review): presumably intended to supply secrets such as MLflow credentials — confirm.
load_dotenv()

class ModelTrainer:
    """Grid-search the configured classifiers, track them in MLflow and persist the winner.

    Every hyper-parameter combination evaluated by ``GridSearchCV`` is recorded as
    its own MLflow run. The estimator with the highest mean cross-validation
    accuracy is then scored on the test set, pickled to ``config.root_dir`` and
    logged (and, for non-file tracking stores, registered) in a final MLflow run.
    """

    # Environment variables MLflow reads for HTTP basic authentication.
    _CREDENTIAL_ENV_VARS = ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD")

    def __init__(self,config: ModelTrainerConfig):
        """Store the configuration and check that MLflow credentials are available.

        Credentials are deliberately NOT set here: they must come from the process
        environment (e.g. a git-ignored ``.env`` file loaded with ``load_dotenv()``).
        SECURITY: an access token used to be hardcoded at this point; any token that
        was ever committed should be revoked and replaced.

        Args:
            config: Validated settings for the training stage.
        """
        self.config = config

        missing = [name for name in self._CREDENTIAL_ENV_VARS if not os.environ.get(name)]
        if missing:
            # Warn instead of raising: an unauthenticated tracking server is still usable.
            logger.warning(
                f"MLflow credentials not set in the environment: {', '.join(missing)}. "
                "Requests to an authenticated tracking server will be rejected."
            )

    def train(self):
        """Run the grid search, log all candidates and persist the best model.

        Returns:
            The MLflow run id of the grid-search candidate run that achieved the
            best mean cross-validation accuracy. Note this is not the id of the
            final run that holds the logged model artifact.

        Raises:
            ValueError: If no models are configured, or if no candidate produced
                a valid (non-NaN) cross-validation score.
            KeyError: If a configured model name is not a supported estimator.
        """
        # Map names to classes (not instances) so only requested models are constructed.
        model_mapping = {
            "LogisticRegression": LogisticRegression,
            "DecisionTreeClassifier": DecisionTreeClassifier,
            "GradientBoostingClassifier": GradientBoostingClassifier,
            "AdaBoostClassifier": AdaBoostClassifier,
            "KNeighborsClassifier": KNeighborsClassifier,
            "SVC": SVC,
            "XGBClassifier": XGBClassifier,
        }

        models = self.config.models
        hyperparams = self.config.hyperparams

        # Fail fast on configuration mistakes, before any data loading or fitting.
        if not models:
            raise ValueError("No models configured for training.")
        unknown = [name for name in models if name not in model_mapping]
        if unknown:
            raise KeyError(
                f"Unsupported model(s) {unknown}; supported: {sorted(model_mapping)}"
            )

        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # Split features & target
        x_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]
        x_test = test_data.drop(columns=[self.config.target_column])
        y_test = test_data[self.config.target_column]

        # Standardize features: fit on train only, then apply the same scaling to test.
        ss = StandardScaler()
        x_train = ss.fit_transform(x_train)
        x_test = ss.transform(x_test)

        # Persist the fitted scaler so inference can reproduce the preprocessing.
        scaler_path = os.path.join(self.config.root_dir, self.config.standard_scaler_name)
        with open(scaler_path, "wb") as f:
            pickle.dump(ss, f)

        mlflow.set_tracking_uri(str(self.config.mlflow_uri))  # For tracking runs
        mlflow.set_registry_uri(str(self.config.mlflow_uri))  # For model registry
        mlflow.set_experiment(self.config.mlflow_experiment)  # Set the experiment

        # The model registry is unavailable for plain file stores; checked when logging below.
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        signature = infer_signature(x_train, y_train)

        best_model = None
        best_param = None
        best_model_name = None
        best_score = -float("inf")
        best_run_id = None

        for model_name in models:
            model = model_mapping[model_name]()
            params = hyperparams.get(model_name, {})

            grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring="accuracy", n_jobs=-1)
            grid_search.fit(x_train, y_train)

            # One MLflow run per evaluated hyper-parameter combination.
            for param, score in zip(grid_search.cv_results_["params"], grid_search.cv_results_["mean_test_score"]):
                with mlflow.start_run() as run:
                    print(f"{param}: {score:.4f}")

                    mlflow.log_params(param)
                    # "train_accuracy" holds the mean cross-validation accuracy on the
                    # training set; logged as a number, not a formatted string.
                    mlflow.log_metric("train_accuracy", round(float(score), 4))
                    mlflow.log_param("model_name", model_name)

                    # Track the overall winner. best_estimator_ is this search's refit
                    # estimator, i.e. the one whose params reach this search's top score.
                    if score > best_score:
                        best_score = score
                        best_model = grid_search.best_estimator_
                        best_param = param
                        best_model_name = model_name
                        best_run_id = run.info.run_id  # Store the run ID

        if best_model is None:
            # Only reachable when every candidate score was NaN (all fits failed).
            raise ValueError("Grid search produced no valid cross-validation score.")

        # Now test the best model on the test set
        test_acc = best_model.score(x_test, y_test)

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        model_filename = f"{self.config.root_dir}/{best_model_name}_{timestamp}.pkl"

        # Save the best model
        with open(model_filename, "wb") as f:
            pickle.dump(best_model, f)

        # Save metadata (best parameters)
        metadata_filename = f"{self.config.root_dir}/{best_model_name}_{timestamp}_metadata.txt"
        with open(metadata_filename, "w") as f:
            f.write(f"Model: {best_model_name}\n")
            f.write(f"Best Params: {best_param}\n")
            f.write(f"Train Accuracy: {best_score}\n")

        # Final run: record the winner's test accuracy and upload the model artifact.
        with mlflow.start_run():
            mlflow.log_params(best_param)
            mlflow.log_metric("test_accuracy", round(float(test_acc), 4))
            mlflow.log_param("model_name",best_model_name)

            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(best_model, "model", registered_model_name="Best Model", signature=signature)
            else:
                mlflow.sklearn.log_model(best_model, "model", signature=signature)

        return best_run_id

In [19]:
# Run the full training pipeline using the config built in an earlier cell.
mt = ModelTrainer(model_trainer_config)
# train() returns the MLflow run id of the best-scoring grid-search candidate.
run_id = mt.train()
print(run_id)



{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}: 0.9264
🏃 View run rare-elk-427 at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3/runs/5b17947f6b064f0f8bf5e943a188e7c5
🧪 View experiment at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3
{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}: 0.9275
🏃 View run blushing-trout-386 at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3/runs/d6076970fd604491ad5d1912f9f7b11a
🧪 View experiment at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3
{'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}: 0.9355
🏃 View run peaceful-roo-108 at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3/runs/1b41ac064a0e4eeba5f52334fccde7d0
🧪 View experiment at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3
{'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}: 0.9355
🏃 View run adventurous-colt-569 at: https://da

Registered model 'Best Model' already exists. Creating a new version of this model...
2025/03/31 21:22:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Model, version 6
Created version '6' of model 'Best Model'.


🏃 View run funny-koi-577 at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3/runs/2d644d0e5b024e9fa2deb964e4cf0b5a
🧪 View experiment at: https://dagshub.com/adityaav80/E2E-Network-Security.mlflow/#/experiments/3
496077d2d57647449a991e4225abd986
