In [1]:
import os
# Move the kernel's working directory to the parent folder; the recorded output
# of this cell shows it landing in the repository root.
# NOTE: a relative chdir is not idempotent -- re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir('../')
# IPython magic: display the current working directory to confirm the move.
%pwd

'e:\\Github repositories\\end-to-end-fake-news-detection'

In [6]:
from pathlib import Path
from dataclasses import dataclass

@dataclass
class PreprocessingConfig:
    """Filesystem locations used by the data-preprocessing stage.

    All fields are annotated as ``pathlib.Path``. The dataclass performs no
    validation or conversion itself; callers are expected to pass ``Path``
    objects.
    """
    # Working directory of the stage (created by ConfigurationManager).
    root_dir: Path
    # CSV file that the preprocessing step reads as its input.
    raw_data_path: Path
    # Destination CSV for the training portion of the split.
    train_data_path: Path
    # Destination CSV for the held-out test portion of the split.
    test_data_path: Path
    # NOTE(review): not read or written by any code visible in this notebook;
    # presumably a status/marker file for the stage -- confirm against callers.
    status_file_path: Path

In [8]:
from FakeNewsDetection.utils.common import read_yaml, create_directories
from FakeNewsDetection.constants import *

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the config, params and schema YAML files are read and the
    directory named by ``config.artifact_root`` is created.
    """

    def __init__(self,
                 config_path=CONFIG_FILE,
                 parama_path=PARAMS_FILE,
                 schema_path=SCHEMA_FILE):
        """Read the three YAML files and create the artifact root directory.

        Args:
            config_path: Location of the main configuration YAML.
            parama_path: Location of the parameters YAML. The spelling of this
                keyword is kept as-is because callers may pass it by name.
            schema_path: Location of the schema YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(parama_path)
        self.schema = read_yaml(schema_path)

        artifact_root = self.config.artifact_root
        create_directories([artifact_root])

    def get_data_preproconfig(self) -> "PreprocessingConfig":
        """Build the config object for the data-preprocessing stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``PreprocessingConfig`` whose fields are the corresponding
            entries of the ``data_preprocessing`` section wrapped in ``Path``.
        """
        section = self.config.data_preprocessing
        create_directories([section.root_dir])

        # Every field of the section is a path; convert them in declaration order.
        field_names = (
            "root_dir",
            "raw_data_path",
            "train_data_path",
            "test_data_path",
            "status_file_path",
        )
        as_paths = {name: Path(getattr(section, name)) for name in field_names}
        return PreprocessingConfig(**as_paths)

In [11]:
import os
from FakeNewsDetection import logger
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


class DataPreprocessing:
    """Clean the raw news CSV and write a train/test split to disk.

    The raw file must contain the columns ``title``, ``text``, ``subject``,
    ``date`` and ``label``; these are the columns the cleaning steps access.
    """

    def __init__(self, config: "PreprocessingConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``raw_data_path``, ``train_data_path`` and
                ``test_data_path``.
        """
        self.config = config

    def preprocess_data(self, test_size: float = 0.05, random_state: int = 42):
        """Run the cleaning steps and save the train and test CSV files.

        Steps: drop duplicate rows, drop rows missing ``title`` or ``text``,
        merge the title into the text, drop the ``title``/``subject``/``date``
        columns, strip every character that is not an ASCII letter, digit or
        whitespace, then split ``text``/``label`` into train and test sets.

        Args:
            test_size: Passed to ``train_test_split``; defaults to the
                previously hard-coded 0.05.
            random_state: Seed passed to ``train_test_split``; defaults to the
                previously hard-coded 42.

        Raises:
            Exception: Any error raised while reading, transforming or writing
                the data is logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.config.raw_data_path)
            logger.info(f"Data read successfully from {self.config.raw_data_path}")

            # Each step builds a new frame rather than mutating in place, which
            # keeps the steps independent and avoids chained-assignment issues.
            df = df.drop_duplicates()
            logger.info("Duplicates dropped")

            # Rows need both a title and a body to form the merged text.
            df = df.dropna(subset=['title', 'text'])
            logger.info("Rows with missing values dropped")

            # Merge the headline into the body, then drop the columns that are
            # no longer needed; 'text' and 'label' are what gets used below.
            df = df.assign(text=df['title'] + " " + df['text'])
            df = df.drop(columns=['title', 'subject', 'date'])
            logger.info("Columns dropped")

            # Keep only ASCII letters, digits and whitespace.
            df = df.assign(
                text=df['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
            )

            X_train, X_test, y_train, y_test = train_test_split(
                df['text'], df['label'],
                test_size=test_size, random_state=random_state,
            )
            logger.info("Train test split done")

            # Re-attach the labels so each output file is self-contained.
            train_data = pd.concat([X_train, y_train], axis=1)
            test_data = pd.concat([X_test, y_test], axis=1)

            outputs = (
                (train_data, self.config.train_data_path),
                (test_data, self.config.test_data_path),
            )
            for frame, destination in outputs:
                # to_csv does not create missing parent directories.
                os.makedirs(os.path.dirname(destination) or ".", exist_ok=True)
                frame.to_csv(destination, index=False)
            logger.info("Train and test data saved")

        except Exception as e:
            logger.error(f"Data preprocessing failed: {e}")
            # Bare raise re-raises the active exception without rebinding it.
            raise
            

In [12]:
# Start the pipeline: build the configuration, then run the preprocessing stage.
# The __main__ guard lets this cell run both in the notebook kernel and when the
# code is executed as a script.
if __name__ == "__main__":
    try:
        # Reads the YAML files and creates the artifact directories.
        config_mgr = ConfigurationManager()
        data_preproconfig = config_mgr.get_data_preproconfig()
        data_preprocessor = DataPreprocessing(data_preproconfig)
        # Reads the raw CSV, cleans it and writes the train/test CSV files.
        data_preprocessor.preprocess_data()
    except Exception as e:
        # preprocess_data logs its own failures before re-raising, so an error
        # from that step is logged a second time here.
        logger.error(f"Failed to preprocess data: {e}")
        raise e

[2024-11-21 14:30:48,606] [INFO] [common.py:26] [Loaded yaml file from config\config.yaml]
[2024-11-21 14:30:48,609] [INFO] [common.py:26] [Loaded yaml file from params.yaml]
[2024-11-21 14:30:48,614] [INFO] [common.py:26] [Loaded yaml file from schema.yaml]
[2024-11-21 14:30:48,617] [INFO] [common.py:48] [created directory at: artifacts]
[2024-11-21 14:30:48,619] [INFO] [common.py:48] [created directory at: artifacts/data_preprocessing]
[2024-11-21 14:30:50,709] [INFO] [542353865.py:15] [Data read successfully from artifacts\data_ingestion\data.csv]
[2024-11-21 14:30:51,200] [INFO] [542353865.py:18] [Duplicates dropped]
[2024-11-21 14:30:51,659] [INFO] [542353865.py:21] [Rows with missing values dropped]
[2024-11-21 14:30:52,091] [INFO] [542353865.py:26] [Columns dropped]
[2024-11-21 14:30:54,687] [INFO] [542353865.py:31] [Train test split done]
[2024-11-21 14:31:03,574] [INFO] [542353865.py:38] [Train and test data saved]
