In [3]:
import os
os.chdir("../")

In [4]:
os.getcwd()

'c:\\TheBritishCollege\\DataScience\\CreditCard_fault_detection'

In [29]:
from src.Credit_card_project import logger
from src.Credit_card_project.constant import *

In [30]:
from dataclasses import dataclass
@dataclass(frozen=True)
class ModelTrainerConfig: 
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    final_model: Path

In [36]:
from src.Credit_card_project.utils.common import read_yaml, create_directories, save_binaryFile

In [37]:
class ConfigurationManager: 
    def __init__(self,config_file_path= CONFIG_FILE_PATH,
                 params_file_path= PARAMS_FILE_PATH,
                 schema_file_path=SCHEMA_FILE_PATH
                 ):
        self.config=read_yaml(config_file_path)
        self.sparams=read_yaml(params_file_path)
        self.schema=read_yaml(schema_file_path)
        
        create_directories([self.config.artifacts_root])
        
    def get_model_trainer_config(self)-> ModelTrainerConfig: 
        
        config=self.config.model_trainer
        schema=self.schema.COLUMNS

        create_directories([config.root_dir])
        
        model_train_config=ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            final_model=config.Final_model)
        
        return model_train_config
    

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, mean_absolute_error,accuracy_score
from sklearn.model_selection import GridSearchCV

In [39]:
class ModelTrainer: 
    def __init__(self, config: ModelTrainerConfig): 
        self.config=config
        logger.info(f"final model : {config.final_model}")
        self.Model={
            'Random_Forest': RandomForestClassifier(),
            'Decision_Tree': DecisionTreeClassifier(),
            'NaiveBayes': GaussianNB()
            }  
                
          
    def Model_evaluation(self, Model, x_train, y_train, x_test, y_test): 
        report={}
        mean_error={}
        
        for mod in range(len(Model)): 
            
            model=list(Model.values())[mod]
            model.fit(x_train, y_train)
            
            logger.info(f"Model fitted")
            
            y_pred=model.predict(x_test)
            
            score=accuracy_score(y_test, y_pred)
            mqe=mean_squared_error(y_test, y_pred)
            
            logger.info(f"accuracy score {score} and mean sq error {mqe}")
            
            report[list(Model.keys())[mod]]=score
            mean_error[list(Model.keys())[mod]]=mqe
        
        return report, mean_error
    
    def fine_tune_model(self, best_model: object, x_train, y_train): 
        # 'n_estimators': Number of trees in the forest.
        # max_depth: Definition: Maximum depth of the individual trees.
        # min_samples_split:  Minimum number of samples required to split an internal node.
        # min_samples_leaf: Minimum number of samples required to be at a leaf node.
        # max_features:  The number of features to consider when looking for the best split.
        params={
            'n_estimators': [50, 100],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features':  ['auto', None]
            
        }
        y_train=np.array(y_train).ravel()
        
        clf=GridSearchCV(best_model, param_grid=params, cv=5, verbose=3)
        clf.fit(x_train, y_train)
        
        best_param=clf.best_params_
        
        logger.info(f"best params :{best_param}")
        
        fine_tune_model=best_model.set_params(**best_param)
        
        return fine_tune_model
        
        
        
             
    def initiate_model_trainer(self): 
        train_path=self.config.train_data_path
        test_path=self.config.test_data_path
        
        ## Loading model path: 
        x=pd.read_csv(train_path)
        y=pd.read_csv(test_path)
        
        # logger.info(f"x_train_ data {x.head()}")
        # logger.info(f"x_test data {y.head()}")   
        
        x_train=x.iloc[:,:-1]
        y_train=x.iloc[:,[-1]]
        x_test=y.iloc[:,:-1]  
        y_test=y.iloc[:,[-1]]
        
        # logger.info(f"x_train_data: \n\n {x_train.shape}")
        # logger.info(f"x_test data:  \n\n {y_train.shape}")
        # logger.info(f"x_test data:  \n\n {x_test.shape}")
        # logger.info(f"x_test data:  \n\n {y_test.shape}")
        
        Model_acc_report, mean_sq_error=self.Model_evaluation(self.Model, x_train, y_train, x_test, y_test)
        
        logger.info(f"Model accuracy report: {Model_acc_report}\n\n")
        logger.info(f"Mean squared error: {mean_sq_error}")
        
        best_model_score=max(sorted(Model_acc_report.values()))
        
        logger.info(f"Best model accuracy {best_model_score}")
        
        best_model_name=list((Model_acc_report.keys()))[list(Model_acc_report.values()).index(best_model_score)]
        
        logger.info(f"Best model Name: {best_model_name}")
        
        best_model_obj=self.Model[best_model_name]
        logger.info(f"best Model {best_model_obj}")
        
        # Fine tuning best model
        Good_model=self.fine_tune_model(best_model_obj, x_train, y_train)
        
        logger.info(f"best model: {Good_model}")
        
        # save_binaryFile(path=Path(self.config.final_model), data=Good_model)
        
        
        y_pre=Good_model.predict(x_test)
        
        score=accuracy_score(y_test, y_pre)
        logger.info(f"After fine tune score {score}")
        
        save_binaryFile(path=Path(self.config.final_model), data=Good_model)
        logger.info(f"Save model sucessful")
        
        
        
        
        

In [40]:
configmanager=ConfigurationManager()
get_model_config=configmanager.get_model_trainer_config()
model_trainer=ModelTrainer(get_model_config)
model_trainer.initiate_model_trainer()

[2024-02-04 22:25:50,475: INFO, common : Yaml file read config\config.yaml successfully]
[2024-02-04 22:25:50,483: INFO, common : Yaml file read params.yaml successfully]
[2024-02-04 22:25:50,495: INFO, common : Yaml file read schema.yaml successfully]
[2024-02-04 22:25:50,501: INFO, common : Directories created ['artifacts']]
[2024-02-04 22:25:50,507: INFO, common : Directories created ['artifacts/model_trainer']]
[2024-02-04 22:25:50,512: INFO, 3960069360 : final model : artifacts/data_transformation/model.pkl]


  return fit_method(estimator, *args, **kwargs)


[2024-02-04 22:25:51,619: INFO, 3960069360 : Model fitted]
[2024-02-04 22:25:51,732: INFO, 3960069360 : accuracy score 0.8366533864541833 and mean sq error 0.16334661354581673]
[2024-02-04 22:25:51,758: INFO, 3960069360 : Model fitted]
[2024-02-04 22:25:51,776: INFO, 3960069360 : accuracy score 0.7569721115537849 and mean sq error 0.24302788844621515]
[2024-02-04 22:25:51,790: INFO, 3960069360 : Model fitted]
[2024-02-04 22:25:51,901: INFO, 3960069360 : accuracy score 0.33067729083665337 and mean sq error 0.6693227091633466]
[2024-02-04 22:25:51,904: INFO, 3960069360 : Model accuracy report: {'Random_Forest': 0.8366533864541833, 'Decision_Tree': 0.7569721115537849, 'NaiveBayes': 0.33067729083665337}

]
[2024-02-04 22:25:51,908: INFO, 3960069360 : Mean squared error: {'Random_Forest': 0.16334661354581673, 'Decision_Tree': 0.24302788844621515, 'NaiveBayes': 0.6693227091633466}]
[2024-02-04 22:25:51,913: INFO, 3960069360 : Best model accuracy 0.8366533864541833]
[2024-02-04 22:25:51,916: 

  y = column_or_1d(y, warn=True)


[CV 3/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 2/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 3/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 4/5] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 5/5] END max_depth=None, max_features=aut

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "c:\TheBritishCollege\DataScience\CreditCard_fault_detection\credit_env\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\TheBritishCollege\DataScience\CreditCard_fault_detection\credit_env\lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "c:\TheBritishCollege\DataScience\CreditCard_fault_detection\credit_env\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\TheBr

[2024-02-04 22:27:33,583: INFO, 3960069360 : best params :{'max_depth': 20, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}]
[2024-02-04 22:27:33,586: INFO, 3960069360 : best model: RandomForestClassifier(max_depth=20, max_features=None, min_samples_leaf=2)]
[2024-02-04 22:27:33,614: INFO, 3960069360 : After fine tune score 0.8366533864541833]


TypeError: isinstance() arg 2 must be a type or tuple of types

In [62]:
Model_accuracy_report={'Random_Forest': 0.8406374501992032, 'Decision_Tree': 0.7609561752988048, 'NaiveBayes': 0.33067729083665337}


In [68]:
best_model_score=max(sorted(Model_accuracy_report.values()))

In [70]:
list((Model_accuracy_report.keys()))[list(Model_accuracy_report.values()).index(best_model_score)]

'Random_Forest'