In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [2]:
# Load the dataset
data = pd.read_csv('../data/raw/IMDB Dataset.csv')

### Data Cleaning and Preprocessing

In [3]:
# Initialize lemmatizer and stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
# Function to clean and preprocess the reviews
def preprocess_review(review):
    # Remove HTML tags and special characters
    review = re.sub(r'<.*?>', ' ', review)  # Remove HTML tags
    review = re.sub(r'[^a-zA-Z\s]', '', review)  # Remove special characters
    review = review.lower().strip()  # Convert to lowercase and strip whitespace

    # Tokenization
    words = review.split()

    # Stopword removal and lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

In [5]:
# Apply preprocessing to the review column
data['cleaned_review'] = data['review'].apply(preprocess_review)

In [6]:
# Display the first few rows of the cleaned data
data[['review', 'cleaned_review', 'sentiment']].sample(10)

Unnamed: 0,review,cleaned_review,sentiment
12723,"Going by the good words of my friends, I hired...",going good word friend hired movie hoping woul...,negative
42624,******************SPOILER********************S...,spoilerspoiler movie stunk let say totally agr...,negative
10899,Even though Ian Kershaw distanced himself from...,even though ian kershaw distanced project due ...,positive
37869,Someone told me that this was one of the best ...,someone told one best adult movie date since d...,negative
16926,"As others have said, ""No, Luciano"" is a more a...",others said luciano apt title response movie t...,negative
2365,"There is so much that is wrong with this film,...",much wrong film sum terrible acting bad must p...,negative
26284,"Is this a bad movie?<br /><br />Of course, wha...",bad movie course expecting movie called beach ...,positive
6763,"Now, i hired this movie because Brad Dourif wa...",hired movie brad dourif excellent actor brilli...,negative
20386,Rented this out from my local because it was t...,rented local new british film available week n...,positive
15725,This movie really shows its age. The print I s...,movie really show age print saw terrible due a...,negative


### Feature Extraction

In [7]:
# Prepare features and labels
X = data['cleaned_review']  # Features (cleaned reviews)
y = data['sentiment']  # Labels (positive/negative)

In [8]:
# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [9]:
# Train-Test-Validation Split
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
# Save the datasets
train_data = pd.DataFrame(X_train.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
train_data['sentiment'] = y_train.reset_index(drop=True)

val_data = pd.DataFrame(X_val.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
val_data['sentiment'] = y_val.reset_index(drop=True)

test_data = pd.DataFrame(X_test.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
test_data['sentiment'] = y_test.reset_index(drop=True)

In [12]:
# Define the paths for saving the zip files
train_zip_path = '../data/processed/train_data.zip'
val_zip_path = '../data/processed/val_data.zip'
test_zip_path = '../data/processed/test_data.zip'

# Save the datasets as CSV files inside a zip file
train_data.to_csv(train_zip_path, index=False, compression={'method': 'zip', 'archive_name': 'train_data.csv'})
val_data.to_csv(val_zip_path, index=False, compression={'method': 'zip', 'archive_name': 'val_data.csv'})
test_data.to_csv(test_zip_path, index=False, compression={'method': 'zip', 'archive_name': 'test_data.csv'})

In [13]:
# Display the shapes of the splits
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (40000, 5000)
Validation data shape: (5000, 5000)
Testing data shape: (5000, 5000)


In [14]:
# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.joblib')

['../models/tfidf_vectorizer.joblib']