In [73]:
import os
# Make the project root the working directory so the relative 'artifacts\...' path
# passed to the pipeline later in this notebook resolves from there.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook will
# fail at this line on any other checkout location — consider deriving it instead.
os.chdir(r'D:\Python\Industry level\NLP\NLP-IMDB-sentiment-analysis-End-to-end')
# IPython magic: echo the current working directory to confirm the chdir took effect.
%pwd


'D:\\Python\\Industry level\\NLP\\NLP-IMDB-sentiment-analysis-End-to-end'

In [74]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem locations used by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration and are
    consumed by ``DataTransformation``. ``frozen=True`` makes the fields read-only
    after construction.
    """
    # Directory for this stage; created before the config object is built.
    root_dir: str
    # CSV that receives the full transformed dataset (target column + TF-IDF features).
    transformed_data_file: str
    # CSV that receives the training split.
    train_data_path: str
    # CSV that receives the test split.
    test_data_path: str
    # File the fitted TF-IDF vectorizer is pickled to.
    vectorizer_file: str

In [75]:
from sentiment_analysis.constants import *
from sentiment_analysis.utils.common import read_yaml, create_directories

In [76]:
class ConfigurationManager:
    """Load the project configuration and hand out per-stage config objects.

    The configuration is read once at construction time with ``read_yaml`` and is
    then accessed by attribute (``self.config.<section>.<key>``).
    """

    def __init__(self, config_filepath = CONFIG_FILEPATH):
        """Read the configuration file and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the configuration file; defaults to the
                project-level ``CONFIG_FILEPATH`` constant.
        """
        self.config = read_yaml(config_filepath)
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect, then copies the five
        path settings from the ``data_transformation`` section.

        Returns:
            A populated ``DataTransformationConfig``.
        """
        section = self.config.data_transformation

        # The stage directory has to exist before anything is written into it.
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            transformed_data_file=section.transformed_data_file,
            train_data_path=section.train_data_path,
            test_data_path=section.test_data_path,
            vectorizer_file=section.vectorizer_file,
        )

In [77]:
%pip install lxml

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import pickle
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re
import pandas as pd


# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer models,
# the 'stopwords' corpus (read via stopwords.words('english')) and the 'wordnet'
# corpus (presumably what WordNetLemmatizer looks up — confirm).
# NOTE(review): newer NLTK releases appear to look for 'punkt_tab' instead of 'punkt'
# when word_tokenize is called — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [78]:
class DataTransformation:
    """Clean raw review text, convert it to TF-IDF features and persist the results.

    The stage reads a CSV with ``review`` and ``sentiment`` columns, normalises the
    review text, fits a TF-IDF vectorizer on the training rows only, and writes the
    full transformed table, the train/test splits and the pickled vectorizer to the
    locations given by the supplied ``DataTransformationConfig``.
    """

    def __init__(self,config: DataTransformationConfig):
        """Store the config and build the text-processing helpers.

        Args:
            config: Output locations for this stage.
        """
        self.config = config
        # A set gives O(1) membership checks inside the per-token filter.
        self.stopwords = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def preprocess_text(self,text):
        """Normalise one review into a space-joined string of lemmatised tokens.

        Steps: strip HTML markup, lowercase, drop every character that is not an
        ASCII letter or whitespace, tokenize, remove English stopwords, lemmatise.

        Args:
            text: Raw review text. Any non-string value (e.g. NaN from a missing
                cell) yields an empty string.

        Returns:
            The cleaned review as a single string; may be empty.
        """
        if not isinstance(text, str):
            return ""
        # separator=" " keeps the words on either side of a tag apart; without it
        # "end.<br /><br />The" collapses to "end.The" and, once punctuation is
        # removed, to the bogus token "endthe".
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stopwords]
        return ' '.join(tokens)

    @staticmethod
    def _stratify_target(labels):
        """Return ``labels`` if they can drive a stratified split, else ``None``.

        Stratification needs a fully populated target with at least two rows per
        class; otherwise fall back to a plain shuffled split rather than failing.
        """
        if labels.isna().any():
            return None
        class_counts = labels.value_counts()
        if class_counts.empty or class_counts.min() < 2:
            return None
        return labels

    def initiate_data_transformation(self, data_path: str, test_size: float = 0.2, random_state: int = 42):
        """Run the full transformation and write every artifact to disk.

        Args:
            data_path: CSV file containing ``review`` and ``sentiment`` columns.
            test_size: Fraction of rows held out for the test split.
            random_state: Seed for the shuffled split, for reproducibility.

        Raises:
            KeyError: If ``review`` or ``sentiment`` is missing from the input file.
        """
        df = pd.read_csv(data_path)

        missing = [col for col in ('review', 'sentiment') if col not in df.columns]
        if missing:
            raise KeyError(f"{data_path} is missing required column(s): {missing}")

        cleaned_reviews = df['review'].apply(self.preprocess_text)
        # Keep the target as its own Series so it stays unambiguous even if the
        # word "sentiment" also ends up as a TF-IDF feature column.
        labels = df['sentiment']

        # Split row positions first so the vectorizer can be fitted on training
        # rows only. Stratifying on the target keeps the label ratio equal in
        # both splits, which is what the original comment promised.
        row_positions = list(range(len(df)))
        train_rows, test_rows = train_test_split(
            row_positions,
            test_size=test_size,
            random_state=random_state,
            stratify=self._stratify_target(labels),
        )

        # Fit on the training rows only: learning the vocabulary/IDF weights from
        # the held-out rows as well would leak test information into the features
        # (and into the pickled vectorizer used at inference time).
        self.vectorizer.fit(cleaned_reviews.iloc[train_rows])
        tfidf_matrix = self.vectorizer.transform(cleaned_reviews)
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=self.vectorizer.get_feature_names_out(),
            index=df.index,  # align explicitly with the label rows
        )
        # Target column first, followed by one column per TF-IDF feature.
        transformed_df = pd.concat([labels.to_frame(), tfidf_df], axis=1)

        # Full transformed table, then the two splits.
        transformed_df.to_csv(self.config.transformed_data_file, index=False)
        transformed_df.iloc[train_rows].to_csv(self.config.train_data_path, index=False)
        transformed_df.iloc[test_rows].to_csv(self.config.test_data_path, index=False)

        # Persist the fitted vectorizer so the same vocabulary can be reused later.
        with open(self.config.vectorizer_file, 'wb') as f:
            pickle.dump(self.vectorizer, f)

In [79]:
# Run the data-transformation stage end to end.
# The input path is assembled with os.path.join instead of a hand-written
# 'artifacts\data_ingestion\...' literal: in a non-raw string "\d" is an invalid
# escape sequence (a warning today, an error in future Python versions), and the
# backslashes only work on Windows. The previous `except Exception as e: raise e`
# wrapper re-raised unchanged, so it has been dropped; errors propagate as before.
config_manager = ConfigurationManager()
data_transformation_config = config_manager.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.initiate_data_transformation(
    os.path.join('artifacts', 'data_ingestion', 'data_imdb.csv')
)