In [1]:
import os

# Move from the notebook's own folder up to the project root so the relative
# paths used by later cells (config/config.yaml, params.yaml, artifacts/...)
# resolve correctly.
#
# The move is guarded so that re-running this cell is a no-op instead of
# climbing one directory further each time. params.yaml is loaded via a path
# relative to the project root, so its presence in the current directory is
# used as the "already at the root" marker.
if not os.path.exists('params.yaml'):
    os.chdir('../')

In [3]:
pwd

'e:\\Github repositories\\end-to-end-fake-news-detection'

In [15]:
from pathlib import Path
from dataclasses import dataclass

@dataclass
class EvaluationConfig:
    """Filesystem locations needed by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_evaluation_config`` from
    the ``evaluation`` section of the project configuration, with every value
    wrapped in ``pathlib.Path``.
    """
    # Directory belonging to the evaluation stage (created when the config is built).
    root_dir: Path
    # CSV file with the held-out test set.
    test_data_path: Path
    # Serialized model to evaluate (loaded with joblib).
    model_path: Path
    # Destination file for the computed metrics, written as JSON.
    evaluation_metrics_path: Path

In [16]:
from FakeNewsDetection.utils.common import read_yaml, create_directories
from FakeNewsDetection.constants import *

class ConfigurationManager:
    """Read the project's YAML files once and hand out per-stage configs.

    The three YAML documents (main config, params, schema) are kept on the
    instance as ``config``, ``params`` and ``schema``. Constructing the
    manager also makes sure the top-level artifact directory exists.
    """

    def __init__(
        self,
        config_path=CONFIG_FILE,
        parama_path=PARAMS_FILE,
        schema_path=SCHEMA_FILE,
    ):
        """Load the YAML files and create the artifact root directory.

        Args:
            config_path: Location of the main configuration YAML.
            parama_path: Location of the parameters YAML. (The spelling is
                kept as-is because callers may pass it by keyword.)
            schema_path: Location of the schema YAML.
        """
        settings = read_yaml(config_path)
        hyperparams = read_yaml(parama_path)
        data_schema = read_yaml(schema_path)

        self.config = settings
        self.params = hyperparams
        self.schema = data_schema

        # Every stage writes below this directory, so create it up front.
        create_directories([settings.artifact_root])

    def get_evaluation_config(self) -> EvaluationConfig:
        """Build the ``EvaluationConfig`` from the ``evaluation`` config section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            An ``EvaluationConfig`` whose fields are ``Path`` objects.
        """
        section = self.config.evaluation
        create_directories([section.root_dir])

        # Field names match the keys of the YAML section one-to-one.
        path_fields = (
            'root_dir',
            'test_data_path',
            'model_path',
            'evaluation_metrics_path',
        )
        resolved = {name: Path(getattr(section, name)) for name in path_fields}
        return EvaluationConfig(**resolved)

In [17]:
import os
from FakeNewsDetection import logger
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import json



class Evaluation:
    """Score a persisted classifier on the test CSV and save the metrics.

    All locations come from an ``EvaluationConfig``: the serialized model
    (``model_path``), the test data (``test_data_path``) and the JSON file
    the metrics are written to (``evaluation_metrics_path``).
    """

    def __init__(self, config: EvaluationConfig):
        """Store the stage configuration for use by :meth:`evaluate`."""
        self.config = config

    def _vectorize(self, texts):
        """Convert raw text into the TF-IDF matrix that is fed to the model.

        The model can only be scored meaningfully on features produced by the
        vectorizer it was trained with. If the config exposes an optional
        ``vectorizer_path`` attribute, that fitted vectorizer is loaded and
        only ``transform`` is applied. Otherwise this falls back to the
        previous behaviour (fitting a fresh vectorizer on the test set) and
        logs a warning, because the resulting columns are not guaranteed to
        line up with the training features.

        Args:
            texts: Iterable of raw documents (the ``text`` column).

        Returns:
            The TF-IDF feature matrix for ``texts``.
        """
        vectorizer_path = getattr(self.config, 'vectorizer_path', None)
        if vectorizer_path is not None:
            # joblib.load unpickles arbitrary objects: trusted artifacts only.
            tfidf_vectorizer = joblib.load(vectorizer_path)
            logger.info('Fitted vectorizer loaded successfully')
            return tfidf_vectorizer.transform(texts)

        logger.warning(
            'No fitted vectorizer configured; fitting TF-IDF on the test data. '
            'Features may not match those used in training, so the reported '
            'metrics can be unreliable.'
        )
        tfidf_vectorizer = TfidfVectorizer(max_features=200, stop_words='english')
        return tfidf_vectorizer.fit_transform(texts)

    def evaluate(self):
        """Load model and test data, predict, and write the metrics as JSON.

        The test CSV must contain a ``text`` column (input) and a ``label``
        column (ground truth). Accuracy, precision, recall and F1 are computed
        with scikit-learn's default settings and dumped to
        ``evaluation_metrics_path``.

        Raises:
            Exception: Any error raised while loading, predicting or saving is
                logged and re-raised unchanged.
        """
        try:
            # load model
            # joblib.load unpickles arbitrary objects: trusted artifacts only.
            model = joblib.load(self.config.model_path)
            logger.info('Model loaded successfully')
            # load test data
            test_data = pd.read_csv(self.config.test_data_path)
            logger.info('Test data loaded successfully')

            # vectorize data
            X = self._vectorize(test_data['text'])
            y = test_data['label']
            logger.info('Data vectorized successfully')
            # drop the reference to the full frame; only X and y are needed now
            del test_data

            # predict
            y_pred = model.predict(X)
            # calculate metrics
            # NOTE(review): the default (binary) averaging is used, which
            # presumably expects 0/1 labels with 1 as the positive class -
            # confirm against the dataset.
            metrics = {
                'accuracy': float(accuracy_score(y, y_pred)),
                'precision': float(precision_score(y, y_pred)),
                'recall': float(recall_score(y, y_pred)),
                'f1': float(f1_score(y, y_pred)),
            }
            logger.info('Metrics calculated successfully')

            # save metrics to evaluation_metrics_path in JSON format
            with open(self.config.evaluation_metrics_path, 'w') as f:
                json.dump(metrics, f)
            logger.info('Metrics saved successfully')

        except Exception as e:
            logger.error(f'Error in evaluation: {e}')
            # bare raise keeps the original traceback intact
            raise



In [18]:
# start evaluation
# Run the evaluation stage end to end: load the YAML settings, build the
# stage's EvaluationConfig, then score the model and write the metrics JSON.
config_manager = ConfigurationManager()
evaluation_config = config_manager.get_evaluation_config()
evaluation = Evaluation(evaluation_config)
# evaluate() logs its progress and re-raises any failure after logging it.
evaluation.evaluate()

[2024-11-21 17:51:51,407] [INFO] [common.py:26] [Loaded yaml file from config\config.yaml]
[2024-11-21 17:51:51,407] [INFO] [common.py:26] [Loaded yaml file from params.yaml]
[2024-11-21 17:51:51,426] [INFO] [common.py:26] [Loaded yaml file from schema.yaml]
[2024-11-21 17:51:51,428] [INFO] [common.py:48] [created directory at: artifacts]
[2024-11-21 17:51:51,430] [INFO] [common.py:48] [created directory at: artifacts/evaluation]
[2024-11-21 17:51:51,448] [INFO] [4081662381.py:19] [Model loaded successfully]
[2024-11-21 17:51:51,688] [INFO] [4081662381.py:22] [Test data loaded successfully]
[2024-11-21 17:51:52,523] [INFO] [4081662381.py:28] [Data vectorized successfully]
[2024-11-21 17:51:52,540] [INFO] [4081662381.py:39] [Metrics calculated successfully]
[2024-11-21 17:51:52,540] [INFO] [4081662381.py:50] [Metrics saved successfully]


In [14]:
# Shuffle the ingested dataset and write it back over the same file.
# NOTE: this overwrites the ingestion output; every run permutes whatever is
# currently on disk, so run it once per fresh ingestion.
DATA_CSV_PATH = 'artifacts/data_ingestion/data.csv'
SHUFFLE_SEED = 42  # fixed seed so the resulting row order is reproducible

temp = pd.read_csv(DATA_CSV_PATH)

# shuffle the rows of the dataframe
temp = temp.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# save to the same place with the same name
temp.to_csv(DATA_CSV_PATH, index=False)