In [1]:
import tarfile
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Define file paths
tar_file_path = '/content/drive/My Drive/aclImdb_v1.tar.gz'
extracted_path = '/content/aclImdb'


In [4]:
# Extract tar file
with tarfile.open(tar_file_path, 'r') as tar_ref:
    tar_ref.extractall(extracted_path)

In [5]:
# Directories for train and test data
data_dir = os.path.join(extracted_path, 'aclImdb')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')


In [6]:
# Load data from directory
def load_data_from_directory(directory):
    texts = []
    labels = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(directory, label_type)
        for fname in os.listdir(dir_name):
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), 'r', encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(0 if label_type == 'neg' else 1)
    return texts, labels

In [7]:
# Load training and testing data
train_texts, train_labels = load_data_from_directory(train_dir)
test_texts, test_labels = load_data_from_directory(test_dir)

In [8]:
# Create DataFrames for better visualization
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

# Display first few rows of the DataFrames
print(train_df.head())
print(test_df.head())

                                                text  label
0  The plot of 7EVENTY 5IVE involves college kids...      0
1  This movie was a mess. It had the absolute wor...      0
2  This is definitely one of the weirder 70's mov...      0
3  - A Mexican priest becomes a wrestler to save ...      0
4  I saw the film tonight at a free preview scree...      0
                                                text  label
0  OK everybody is so enthused by this film I har...      0
1  It beats me how anyone can rate this film very...      0
2  First off, I saw another reviewer said this mo...      0
3  OK, I've now seen George Zucco in at least fou...      0
4  Spoof films have come so far since Mel Brooks ...      0


In [9]:
#  label distribution
print("Training data label distribution:")
print(f"Positive reviews: {np.sum(train_labels)}")
print(f"Negative reviews: {len(train_labels) - np.sum(train_labels)}")

print("Testing data label distribution:")
print(f"Positive reviews: {np.sum(test_labels)}")
print(f"Negative reviews: {len(test_labels) - np.sum(test_labels)}")

Training data label distribution:
Positive reviews: 12500
Negative reviews: 12500
Testing data label distribution:
Positive reviews: 12500
Negative reviews: 12500


In [10]:
# Tokenize text data
max_features = 20000
max_len = 250
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)

In [11]:
# Convert text to sequences of integers
x_train = tokenizer.texts_to_sequences(train_texts)
x_test = tokenizer.texts_to_sequences(test_texts)

In [12]:
# Pad sequences to ensure uniform input length
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)


In [13]:
# Convert labels to numpy arrays
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [14]:
# Split training data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Display data shapes
print("Training data shape:", x_train.shape)
print("Validation data shape:", x_val.shape)
print("Test data shape:", x_test.shape)

Training data shape: (20000, 250)
Validation data shape: (5000, 250)
Test data shape: (25000, 250)


In [15]:
# Train a Word2Vec model
sentences = [text.split() for text in train_texts]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [16]:
# Create an embedding matrix
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)

print("Word2Vec embedding matrix created.")

Word2Vec embedding matrix created.
