In [1]:
import os
os.chdir("../")
os.getcwd()

'c:\\Users\\arunk\\FraudDetection'

In [2]:
import sys

from fraudDetection.components import DataIngestion, DataValidation, DataTransformation, ModelTrainer, ModelEvaluation, \
    ModelPusher
from fraudDetection.config.configuration import ConfigurationManager
from fraudDetection.constants import CONFIG_FILE_PATH
from fraudDetection.exception import FraudDetectionException
from fraudDetection.logger import logging

config = ConfigurationManager(CONFIG_FILE_PATH)
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion_artifacts = DataIngestion(data_ingestion_config).initiate_data_ingestion()
data_validation_config = config.get_data_validation_config()
data_validation_artifacts = DataValidation(data_ingestion_artifacts,
                                           data_validation_config).initiate_data_validation()
data_transformation_config = config.get_data_transformation_config()
data_transformation_artifacts = DataTransformation(data_validation_artifacts, data_ingestion_artifacts,
                                                           data_transformation_config).initiate_data_transformation()

====>>>File already exists
Do you want to download data again? Options (Yes/No): 
You selected: No


In [3]:
model_trainer_config = config.get_model_trainer_config()

model_trainer = ModelTrainer(model_trainer_config=model_trainer_config, data_transformation_artifact=data_transformation_artifacts)

In [4]:
from fraudDetection.utils import load_numpy_array_data, save_object
from fraudDetection.entity import MetricInfoArtifact, ModelFactory
from fraudDetection.entity import evaluate_classification_model

import sys
from pathlib import Path
from fraudDetection.exception import FraudDetectionException
from fraudDetection.logger import logging

In [5]:

transformed_train_file_path = Path(data_transformation_artifacts.transformed_train_file_path)
train_array = load_numpy_array_data(transformed_train_file_path)

transformed_test_file_path = Path(data_transformation_artifacts.transformed_test_file_path)
test_array = load_numpy_array_data(transformed_test_file_path)

X_train, y_train, X_test, y_test = train_array[:, :-1], train_array[:, -1], test_array[:, :-1], test_array[:, -1]

model_config_file_path = model_trainer_config.model_config_file_path

model_factory = ModelFactory(model_config_path=model_config_file_path)
base_score = model_trainer_config.base_score
threshold =model_trainer_config.threshold_diff_train_test_acc



In [6]:
best_model = model_factory.get_best_model(X=X_train, y=y_train, base_score=base_score)
best_model

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


GridSearchedBestModel(model_serial_number='module_0', model=LogisticRegression(class_weight='balanced', max_iter=1000), best_model=LogisticRegression(class_weight='balanced', max_iter=1000), best_parameters={'penalty': 'l2'}, best_score=0.9017267986874256)

In [7]:
grid_searched_best_model_list = model_factory.grid_search_best_model_list

In [8]:
grid_searched_best_model_list

[GridSearchedBestModel(model_serial_number='module_0', model=LogisticRegression(class_weight='balanced', max_iter=1000), best_model=LogisticRegression(class_weight='balanced', max_iter=1000), best_parameters={'penalty': 'l2'}, best_score=0.9017267986874256),
 GridSearchedBestModel(model_serial_number='module_1', model=RandomForestClassifier(min_samples_leaf=3), best_model=RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=3), best_parameters={'class_weight': 'balanced_subsample'}, best_score=0.9950191025270068),
 GridSearchedBestModel(model_serial_number='module_2', model=LinearSVC(class_weight={70: None, '0:1': None}, dual=False, random_state=42), best_model=LinearSVC(class_weight='balanced', dual=False, random_state=42), best_parameters={'class_weight': 'balanced'}, best_score=0.9009416865150637)]

In [9]:
model_list = [model.best_model for model in grid_searched_best_model_list]
model_list

[LogisticRegression(class_weight='balanced', max_iter=1000),
 RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=3),
 LinearSVC(class_weight='balanced', dual=False, random_state=42)]

In [10]:
model_list[0].__class__.__name__
model_list

[LogisticRegression(class_weight='balanced', max_iter=1000),
 RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=3),
 LinearSVC(class_weight='balanced', dual=False, random_state=42)]

In [25]:
import numpy as np 
import mlflow
from mlflow.entities import ViewType
from sklearn.metrics import f1_score, fbeta_score, roc_auc_score, precision_recall_curve, recall_score, \
    classification_report, precision_score, accuracy_score, make_scorer, auc

index_number = 0
base_score = 0.6
threshold=.1
initial_base_score = 0
metric_info_artifact = None
for model in model_list:
    model_name = str(model)
    print(f"Starting evaluating model:{type(model).__name__}{'<<' * 20} ")
    # getting prediction for training and testing dataset
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # calculating f1 score on training and testing dataset
    index_number += 1
    train_f1_score = f1_score(y_train, y_train_pred)
    test_f1_score = f1_score(y_test, y_test_pred)
    # calculating f-beta-score on training and testing dataset
    train_fbeta_score = fbeta_score(y_train, y_train_pred, beta=2)
    test_fbeta_score = fbeta_score(y_test, y_test_pred, beta=2)
    # calculating f-betas-core on training and testing dataset
    train_roc_auc_score = roc_auc_score(y_train, y_train_pred)
    test_roc_auc_score = roc_auc_score(y_test, y_test_pred)
    # calculating precision  score for training and testing dataset
    train_precision_score = precision_score(y_train, y_train_pred)
    test_precision_score = precision_score(y_test, y_test_pred)
    # calculating recall score on training and testing dataset
    train_recall_score = recall_score(y_train, y_train_pred)
    test_recall_score = recall_score(y_test, y_test_pred)
    # getting classification report on training and testing dataset
    train_classification_report = classification_report(y_train, y_train_pred)
    test_classification_report = classification_report(y_test, y_test_pred)
    train_precision_recall_curve = precision_recall_curve(y_train, y_train_pred)
    test_precision_recall_curve = precision_recall_curve(y_test, y_test_pred)
    # Accuracy Score on train and test dataset 
    train_accuracy_score = accuracy_score(y_train, y_train_pred)
    test_accuracy_score = accuracy_score(y_test, y_test_pred)

    # calculating harmonic mean of train and test accuracy_score
    model_accuracy = (2 * train_accuracy_score * test_accuracy_score) / (
            train_accuracy_score + test_accuracy_score)
    diff_test_train_acc = abs(train_accuracy_score - test_accuracy_score)

    if train_fbeta_score >= base_score and diff_test_train_acc < threshold:
        base_score = train_fbeta_score
    
        metric_info_artifact = MetricInfoArtifact(
            model_name=model_name,
            model_object=model,
            train_f1_score=train_f1_score,
            test_f1_score=test_f1_score,
            train_fbeta_score=train_fbeta_score,
            test_fbeta_score=test_fbeta_score,
            train_roc_auc_score=train_roc_auc_score,
            test_roc_auc_score=test_roc_auc_score,
            train_precision_score=train_precision_score,
            test_precision_score=test_precision_score,
            train_recall_score=train_recall_score,
            test_recall_score=test_recall_score,
            model_accuracy=model_accuracy,
            train_accuracy_score=train_accuracy_score,
            test_accuracy_score=test_accuracy_score,
            model_index=index_number
        )
        mlflow.sklearn.log_model(sk_model=model,
                         artifact_path="model_training",
                         registered_model_name=model_name,
                         input_example=X_train[:1, :],
                         metadata=dict(stage="training",
                                       index_number=index_number))

        if initial_base_score == 0:
            initial_base_score = base_score
        query = f"params.base_score ='{initial_base_score}'"
        all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
        runs = mlflow.search_runs(
        experiment_ids=all_experiments,
        filter_string=query,
        run_view_type=ViewType.ACTIVE_ONLY,
        )
        if runs.empty:
            mlflow.log_params({"base_score": base_score, "threshold": threshold})

Starting evaluating model:LogisticRegression<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
Starting evaluating model:RandomForestClassifier<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 


Registered model 'RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=3)' already exists. Creating a new version of this model...
2023/05/19 16:15:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=3), version 10
Created version '10' of model 'RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=3)'.


                             run_id experiment_id   status   
0  bc56be73d2d242ee847acb53891605ba             0  RUNNING  \

                                        artifact_uri   
0  file:///c:/Users/arunk/FraudDetection/mlruns/0...  \

                        start_time end_time  metrics.test_recall_score   
0 2023-05-19 09:49:10.928000+00:00     None                        1.0  \

   metrics.test_precision_score  metrics.train_f-beta_score   
0                      0.002346                    0.910794  \

   metrics.train_roc_auc_score  ...  metrics.train_f1_score   
0                     0.929116  ...                0.901369  \

   metrics.test_f-beta_score  metrics.test_roc_auc_score  params.threshold   
0                   0.011619                    0.934937               0.1  \

    params.base_score  tags.mlflow.user   
0  0.9993508276946894         arunkhare  \

                             tags.mlflow.source.name  tags.mlflow.source.type   
0  c:\Users\arunk\anaconda3\envs\F

In [29]:
mlflow.get_artifact_uri()

False

In [13]:

mlflow.log_metric("train_f1_score", train_f1_score)
mlflow.log_metric("test_f1_score", test_f1_score)
mlflow.log_metric("train_f-beta_score", train_fbeta_score)
mlflow.log_metric("test_f-beta_score", test_fbeta_score)
mlflow.log_metric("train_roc_auc_score", train_roc_auc_score)
mlflow.log_metric("test_roc_auc_score", test_roc_auc_score)
mlflow.log_metric("train_precision_score", train_precision_score)
mlflow.log_metric("test_precision_score", test_precision_score)
mlflow.log_metric("train_recall_score", train_recall_score)
mlflow.log_metric("test_recall_score", test_recall_score)
mlflow.log_metric("model_accuracy", model_accuracy)
mlflow.log_metric("train_accuracy_score", train_accuracy_score)
mlflow.log_metric("test_accuracy_score", test_accuracy_score)

In [8]:
import yaml
from pathlib import Path

path = Path(r"C:\Users\arunk\FraudDetection\artifacts\model_evaluation\model_evaluation.yaml")
with open (file=path, mode='r') as f:
    content = yaml.load(f,Loader=yaml.Loader)

In [9]:
content

{'best_model': {'model_path': WindowsPath('C:/Users/arunk/FraudDetection/artifacts/model_trainer/2023-05-19-18-38-47/trained_model/model.pkl')}, 'history': {'2023-05-19-18-38-47': {'model_path': 'C:\\Users\\arunk\\FraudDetection\\artifacts\\model_trainer\\2023-05-19-18-38-47\\trained_model\\model.pkl'}}}

In [15]:
data.params

{}

In [16]:
len(data.tags)

4