In [1]:
# Use the %pip magic explicitly so the install targets this kernel's environment
# and does not depend on IPython's automagic being enabled.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pickle
import re
from functools import lru_cache
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fetch the NLTK resources this notebook relies on: the stop-word lists and WordNet.
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Compiled once at import time; both patterns are applied to every review.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


@lru_cache(maxsize=None)
def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_text(text):
    """Normalize one raw review into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Replace HTML tags with a space.
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Lowercase.
      4. Remove English stop words and lemmatize the remaining words with
         ``WordNetLemmatizer`` using its default settings.

    Args:
        text: The raw review. Non-string values (e.g. ``None`` or the ``NaN``
            pandas produces for an empty CSV cell) are treated as an empty review.

    Returns:
        The cleaned review as a single string; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise make re.sub raise TypeError.
        return ''

    # Tags become a space, not '': otherwise "great.<br /><br />The" collapses
    # into the single bogus token "greatthe".
    text = _HTML_TAG_RE.sub(' ', text)
    # Remove special characters and numbers
    text = _NON_LETTER_RE.sub('', text)
    # Convert to lowercase
    text = text.lower()

    # The stop-word set and lemmatizer are cached, so applying this function
    # row by row does not reload them for every review.
    stop_words, lemmatizer = _text_resources()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

def preprocess_data(
    input_path='../data/raw/IMDB Dataset.csv',
    output_path='../data/processed/processed_data.pkl',
    num_words=10000,
    max_length=200,
    random_state=42,
):
    """Clean, split, tokenize and pad the review dataset, then pickle the result.

    The CSV is expected to provide a ``review`` column (raw text) and a
    ``sentiment`` column; rows whose sentiment equals ``'positive'`` are
    labelled 1 and every other row 0.

    The data is split 70/15/15 into train/val/test. The tokenizer is fitted on
    the training split only, so validation and test vocabulary does not leak
    into the word index.

    Args:
        input_path: Location of the raw CSV file.
        output_path: Where the pickle is written. Missing parent directories
            are created.
        num_words: Vocabulary size passed to the Keras ``Tokenizer``.
        max_length: Length every sequence is padded/truncated to (both applied
            at the end of the sequence).
        random_state: Seed used for both train/val/test splits.

    Returns:
        None. Writes a pickled dict with keys ``'train'``, ``'val'``, ``'test'``
        (each a dict with ``'texts'`` and ``'labels'``) and ``'tokenizer'``.
        Only unpickle this file from a trusted location: ``pickle.load`` can
        execute arbitrary code.
    """
    # Load data
    df = pd.read_csv(input_path)

    # Clean reviews
    df['cleaned_review'] = df['review'].apply(clean_text)

    # Convert sentiment to binary (0 for negative, 1 for positive); vectorized
    # comparison, explicit int64 so the dtype is the same on every platform.
    df['sentiment_label'] = (df['sentiment'] == 'positive').astype('int64')

    # Split data: 70% train, then the remaining 30% halved into val and test.
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=random_state)
    splits = {'train': train_df, 'val': val_df, 'test': test_df}

    # Tokenization (vocabulary learned from the training split only)
    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_df['cleaned_review'])

    # Convert texts to sequences and pad them, identically for every split.
    processed_data = {}
    for split_name, split_df in splits.items():
        sequences = tokenizer.texts_to_sequences(split_df['cleaned_review'])
        processed_data[split_name] = {
            'texts': pad_sequences(
                sequences, maxlen=max_length, padding='post', truncating='post'
            ),
            'labels': split_df['sentiment_label'].values,
        }
    processed_data['tokenizer'] = tokenizer

    # Save processed data; create the target directory first so a fresh
    # checkout without it does not fail with FileNotFoundError.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open('wb') as f:
        pickle.dump(processed_data, f)

    print("Data preprocessing completed and saved.")

# Run the whole preprocessing pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    preprocess_data()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data preprocessing completed and saved.
