In [1]:
import os
os.chdir('../')

In [2]:
pwd

'e:\\Github repositories\\end-to-end-fake-news-detection'

In [3]:
from pathlib import Path
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    root_dir: Path
    train_data_path: Path
    model_path: Path


In [4]:
from FakeNewsDetection.utils.common import read_yaml, create_directories
from FakeNewsDetection.constants import *

class ConfigurationManager:
    def __init__(self, 
                 config_path= CONFIG_FILE,
                 parama_path= PARAMS_FILE,
                 schema_path= SCHEMA_FILE):
        
        self.config = read_yaml(config_path)
        self.params = read_yaml(parama_path)
        self.schema = read_yaml(schema_path)

        create_directories([self.config.artifact_root])

    def get_training_config(self) -> TrainingConfig:
        config = self.config.training
        create_directories([config.root_dir])

        return TrainingConfig(
            root_dir = Path(config.root_dir),
            train_data_path = Path(config.train_data_path),
            model_path = Path(config.model_path)
        )

In [5]:
import os
from FakeNewsDetection import logger
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config

    def train(self):
        try:
            # load training data
            train_data = pd.read_csv(self.config.train_data_path)
            logger.info(f"Training data loaded from {self.config.train_data_path}")
            # vectorize it
            vectorizer = TfidfVectorizer(max_features=200)
            X = vectorizer.fit_transform(train_data['text']).toarray()
            y = train_data['label'].values
            # remove train data from memory
            del train_data

            # create logidtic regression model
            model = LogisticRegression()
            model.fit(X, y)
            logger.info("Model trained successfully")
            # save model in pickle, path is in config
            joblib.dump(model, self.config.model_path)
            logger.info(f"Model saved at {self.config.model_path}")
        except Exception as e:
            logger.error(f"Training failed: {str(e)}")
            raise e

In [6]:
# start pupline
config_manager = ConfigurationManager() 
training_config = config_manager.get_training_config()
trainer = Training(training_config)
trainer.train()

[2024-11-21 14:58:14,329] [INFO] [common.py:26] [Loaded yaml file from config\config.yaml]
[2024-11-21 14:58:14,332] [INFO] [common.py:26] [Loaded yaml file from params.yaml]
[2024-11-21 14:58:14,337] [INFO] [common.py:26] [Loaded yaml file from schema.yaml]
[2024-11-21 14:58:14,340] [INFO] [common.py:48] [created directory at: artifacts]
[2024-11-21 14:58:14,343] [INFO] [common.py:48] [created directory at: artifacts/training]
[2024-11-21 14:58:16,191] [INFO] [2081724299.py:17] [Training data loaded from artifacts\data_preprocessing\train.csv]
[2024-11-21 14:58:34,207] [INFO] [2081724299.py:28] [Model trained successfully]
[2024-11-21 14:58:34,229] [INFO] [2081724299.py:31] [Model saved at artifacts\training\model.pkl]
