### Model Selection

In [25]:
import os

In [26]:
%pwd

'd:\\Silent_Night\\mlops\\AML_Classification'

In [29]:
os.chdir("AML_Classification")

In [30]:
%pwd

'd:\\Silent_Night\\mlops\\AML_Classification'

In [31]:
import pandas as pd
df=pd.read_csv("D:/data/classification_data/stress_data/stress_300.csv")
print(df.head())

   Humidity  Temperature  Step count  Stress Level
0     21.33        90.33         123             1
1     21.41        90.41          93             1
2     27.12        96.12         196             2
3     27.64        96.64         177             2
4     10.87        79.87          87             0


In [40]:
#Update entity(Specifiy the data type all the config and params yaml file variable)
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PrepareBaseModelConfig:
    root_dir: Path
    data_dir: Path
    base_model_path: Path
    params_n_estimators: list
    params_max_depth: list
    params_min_sample_split: list
    params_min_sample_leaf: list
    params_max_features: list
    params_class_weight: str

In [41]:
from AML_Classifier.constants.__init__ import CONFIG_FILE_PATH,PARAMS_FILE_PATH
from AML_Classifier.utils.common import read_yaml, create_directories
from pathlib import Path

In [42]:
#Update the configration manager
from AML_Classifier.constants.__init__ import CONFIG_FILE_PATH,PARAMS_FILE_PATH
from AML_Classifier.utils.common import read_yaml, create_directories
from pathlib import Path

class ConfigurationManager:
    def __init__(self,config_filepath = CONFIG_FILE_PATH,params_filepath = PARAMS_FILE_PATH):
        self.config= read_yaml(config_filepath)
        self.params=read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    def get_prepare_base_model_config(self) -> PrepareBaseModelConfig:
        config = self.config.prepare_base_model
        training_data = Path(self.config.data_prepocess.save_processed_data)

        create_directories([config.root_dir])

        prepare_base_model_config = PrepareBaseModelConfig(
            root_dir = Path(config.root_dir),
            base_model_path = Path(config.base_model_path),
            data_dir=Path(training_data),
            params_n_estimators = self.params.n_estimators,
            params_max_depth = self.params.max_depth,
            params_min_sample_split = self.params.min_samples_split,
            params_min_sample_leaf = self.params.min_samples_leaf,
            params_max_features = self.params.max_features,
            params_class_weight = self.params.class_weight
        )

        return prepare_base_model_config

In [35]:
#Update the compontents
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import os
import pickle
import numpy as np

In [46]:
class PrepareBaseModel:
    def __init__(self,config: PrepareBaseModelConfig):
        self.config = config

    def load_model_data(self):
        #Need to make this data ingestion dynamic
        data=pd.read_csv(Path(self.config.data_dir))
        y=data[["is_laundering"]]
        print(y.isnull().sum())
        x=data.drop(["is_laundering"],axis=1)
        y=np.array(y)
        y=y.ravel()

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
        print(x.isnull().sum())
        

        # Define the parameter grid
        param_grid = {
            'n_estimators': self.config.params_n_estimators,
            'max_depth': self.config.params_max_depth,
            'min_samples_split': self.config.params_min_sample_split,
            'min_samples_leaf': self.config.params_min_sample_leaf,
            'max_features': self.config.params_max_features
            }
        
        # Initialize the classifier
        model = RandomForestClassifier()

        # Initialize GridSearchCV(HYPERPARAMETER TUNING)
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2)
        # Fit the model
        grid_search.fit(X_train, y_train)
        # Print the best parameters
        print("Best Parameters:", grid_search.best_params_)

        # Evaluate the best model on the test set
        tuned_model=RandomForestClassifier(**grid_search.best_params_)
        tuned_model.fit(X_train,y_train)

        #Save the ML model
        with open(self.config.base_model_path,"wb") as file:
            pickle.dump(tuned_model, file)

        
        # Prediction Value
        y_pred=tuned_model.predict(X_test)
        #eval accuracy
        accuracy=metrics.accuracy_score(y_pred,y_test)
        print("Test Set Accuracy:", accuracy)
        

In [47]:
#pipeline
try:
    config = ConfigurationManager()
    prepare_base_model_config=config.get_prepare_base_model_config()
    prepare_base_model= PrepareBaseModel(config=prepare_base_model_config)
    prepare_base_model.load_model_data()
except Exception as e:
    raise e

[2024-06-29 10:00:27,257: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-29 10:00:27,276: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-29 10:00:27,281: INFO: common: created directory at: artifacts]


[2024-06-29 10:00:27,287: INFO: common: created directory at: artifacts/prepare_base_model]
is_laundering    0
dtype: int64
receiving_currency    0
Payment_Currency      0
Payment Format        0
from_bank             0
to_bank               0
amount_received       0
amount_paid           0
dtype: int64
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Test Set Accuracy: 0.8895663956639567
