<a href="https://colab.research.google.com/github/AKookani/NLP/blob/main/HW_GloVe_VS_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import tensorflow_datasets as tfds
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
# Load the IMDb dataset
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

# Split the dataset into training and testing sets
train_dataset, test_dataset = dataset['train'], dataset['test']

# Convert the datasets to lists of texts and labels
train_reviews = []
train_labels = []
for review, label in tfds.as_numpy(train_dataset):
    train_reviews.append(review.decode('utf-8'))
    train_labels.append(label)

test_reviews = []
test_labels = []
for review, label in tfds.as_numpy(test_dataset):
    test_reviews.append(review.decode('utf-8'))
    test_labels.append(label)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MPI9XA_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MPI9XA_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MPI9XA_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [6]:
# Tokenization and cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

processed_train_reviews = [preprocess(review) for review in train_reviews]
processed_test_reviews = [preprocess(review) for review in test_reviews]

In [7]:
# Word2Vec
word2vec_model = Word2Vec(sentences=processed_train_reviews, vector_size=100, window=5, sg=1, min_count=1)

In [8]:
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2025-01-16 23:45:32--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-01-16 23:45:32--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-01-16 23:45:32--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [9]:
# Unzip the downloaded file
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [10]:
# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs  # Define word embedding for each word
    return embeddings_index

# Update the file path to the unzipped GloVe file
glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')

In [11]:
# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_train_reviews)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(processed_train_reviews)
test_sequences = tokenizer.texts_to_sequences(processed_test_reviews)

# Pad sequences
max_length = max(len(seq) for seq in train_sequences)
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [12]:
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector  # Assign embedding vector to its corresponding index
    return embedding_matrix

In [13]:
# GloVe embedding matrix
glove_embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, 100)

In [14]:
# Word2Vec embedding matrix
word2vec_embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        word2vec_embedding_matrix[i] = word2vec_model.wv[word]

In [15]:
# Build LSTM model
def build_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        trainable=False))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [16]:
# GloVe model
glove_model = build_model(glove_embedding_matrix)

In [17]:
# Word2Vec model
word2vec_model = build_model(word2vec_embedding_matrix)

In [18]:
# Train and evaluate models
# apply at least 3 epochs while training the models
glove_model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.2)
word2vec_model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.2)

# Predictions
glove_predictions = glove_model.predict(X_test)
word2vec_predictions = word2vec_model.predict(X_test)

# Convert predictions to binary with a 0.5 threshold
glove_predictions = (glove_predictions > 0.5).astype(int)
word2vec_predictions = (word2vec_predictions > 0.5).astype(int)

# Evaluation metrics
metrics = ['accuracy', 'precision', 'recall', 'f1']
results = {}
for metric in metrics:
    results[f'glove_{metric}'] = eval(f'{metric}_score')(y_test, glove_predictions)
    results[f'word2vec_{metric}'] = eval(f'{metric}_score')(y_test, word2vec_predictions)

print(results)

Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m984s[0m 2s/step - accuracy: 0.4970 - loss: 0.6934 - val_accuracy: 0.4938 - val_loss: 0.6935
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1019s[0m 2s/step - accuracy: 0.4958 - loss: 0.6933 - val_accuracy: 0.5062 - val_loss: 0.6931
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m989s[0m 2s/step - accuracy: 0.4951 - loss: 0.6932 - val_accuracy: 0.5062 - val_loss: 0.6931
Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1020s[0m 2s/step - accuracy: 0.4893 - loss: 0.6935 - val_accuracy: 0.5062 - val_loss: 0.6931
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m995s[0m 2s/step - accuracy: 0.4976 - loss: 0.6936 - val_accuracy: 0.5062 - val_loss: 0.6931
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1014s[0m 2s/step - accuracy: 0.4920 - loss: 0.6933 - val_accuracy: 0.5062 - val_loss: 0.6933
[1m782/782[0m [32m━━━━