In [1]:
import os

In [2]:
%pwd

'c:\\Users\\LENOVO\\OneDrive\\Desktop\\Miraffra\\Internship-Mlops-Project\\research'

In [3]:
# Step up one directory from the notebook's folder so that the relative paths
# used by the later cells resolve from the parent directory.
# NOTE(review): the target is relative to the current working directory, so
# running this cell again without restarting the kernel moves up another level.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\LENOVO\\OneDrive\\Desktop\\Miraffra\\Internship-Mlops-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable pair of paths describing the data-transformation stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    The annotations are not enforced at runtime.
    """

    # Directory associated with this stage.
    root_dir: Path
    # Location of the input data for this stage.
    data_path: Path

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and request the artifacts root directory.

        Args:
            config_filepath: Path passed to ``read_yaml`` for ``self.config``.
            params_filepath: Path passed to ``read_yaml`` for ``self.params``.
            schema_filepath: Path passed to ``read_yaml`` for ``self.schema``.
        """
        # Loaded in this fixed order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config, passes
        its ``root_dir`` to ``create_directories`` and returns both of its
        paths wrapped in a ``DataTransformationConfig``.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [8]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

from collections import Counter
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

from tqdm.auto import tqdm


from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class DataTransformation:
    """Clean the raw text, vectorise it and write train/test feature tables.

    The configuration supplies ``data_path`` (the CSV that is read) and
    ``root_dir`` (the directory that receives ``train.csv`` and ``test.csv``).
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def train_test_spliting(self):
        """Turn the ``text`` column into bag-of-words features and split them.

        Steps:
            1. Read the CSV at ``config.data_path``; it must contain the
               columns ``text`` and ``spam``.
            2. Clean every document: strip HTML-like tags, keep letters only,
               lower-case, tokenise, drop English stop words, Porter-stem.
            3. Vectorise with ``CountVectorizer(max_features=5000)``.
            4. Split 80/20 with ``random_state=42``.
            5. Write ``train.csv`` and ``test.csv`` under ``config.root_dir``;
               each holds the feature columns followed by ``spam``.

        Side effects:
            Calls ``nltk.download('punkt')``, writes the two CSV files, logs
            and prints the shapes of both tables.

        Returns:
            None.
        """
        data = pd.read_csv(self.config.data_path)

        # Build these once: reading the stop-word list and constructing the
        # stemmer for every document (and scanning a list for every token)
        # is pure repeated work. A set gives constant-time membership tests.
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()

        def clean_text(text):
            """Return ``text`` normalised to space-separated stemmed tokens."""
            text = re.sub('<.*?>', '', text)
            text = re.sub('[^a-zA-Z]', ' ', text).lower()
            tokens = nltk.word_tokenize(text)
            return ' '.join(
                stemmer.stem(token) for token in tokens if token not in stop_words
            )

        # word_tokenize needs the 'punkt' tokenizer data.
        nltk.download('punkt')

        # Registers DataFrame/Series.progress_apply.
        tqdm.pandas()

        data['cleaned_text'] = data['text'].progress_apply(clean_text)

        # NOTE(review): the vocabulary is fitted on the full data set before
        # the split, so test documents influence which feature columns exist.
        # Kept as-is so the output schema does not change.
        cv = CountVectorizer(max_features=5000)
        X = cv.fit_transform(data['cleaned_text']).toarray()
        y = data['spam']

        # Split the data into training and test sets. (0.8, 0.2) split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Convert the split data back to DataFrames. The label Series still
        # carry the shuffled row labels of `data`, whereas the new frames get
        # a fresh 0..n-1 index. Assigning the Series directly would align on
        # index, attach labels to the wrong rows and fill the rest with NaN,
        # so the labels are assigned positionally instead.
        train = pd.DataFrame(X_train)
        train['spam'] = y_train.to_numpy()
        test = pd.DataFrame(X_test)
        test['spam'] = y_test.to_numpy()

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [10]:
# Run the stage end to end: load the configuration, build the transformer and
# write the train/test splits.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting()
except Exception as e:
    # Record the failure in the project log, then re-raise with a bare `raise`
    # so the original traceback is propagated without an extra frame.
    logger.exception(e)
    raise

[2024-07-29 12:03:37,182: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-29 12:03:37,184: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-29 12:03:37,185: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-29 12:03:37,187: INFO: common: created directory at: artifacts]
[2024-07-29 12:03:37,188: INFO: common: created directory at: artifacts/data_transformation]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 5728/5728 [03:43<00:00, 25.64it/s]
  values = values.astype(str)


[2024-07-29 12:07:27,374: INFO: 176530273: Splited data into training and test sets]
[2024-07-29 12:07:27,375: INFO: 176530273: (4582, 5001)]
[2024-07-29 12:07:27,376: INFO: 176530273: (1146, 5001)]
(4582, 5001)
(1146, 5001)
