<a href="https://colab.research.google.com/github/Brotherswords/Sentiment-Analysis-Recall-Assignment/blob/main/CSCI_4931_Assignment_3_Vivekanandasarma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.metrics import Precision, Recall
import tensorflow as tf
import os
import zipfile
import numpy as np
import pandas as pd
import numpy as np
import pickle

In [None]:
# Mount Google Drive and make sure the assignment data folder is reachable.
drive.mount('/content/drive')

directory_path = '/content/drive/My Drive/Projects Things Useful/University/CU Denver 2023-2024 Sem 1/Deep Learning/Assignment_3_Data'
pickle_path = directory_path + "/Pickle_Files"

# Report a missing folder first; otherwise list everything inside it.
if not os.path.exists(directory_path):
    print("The directory does not exist")
else:
    files = os.listdir(directory_path)
    print("Files and directories in '", directory_path, "' :")
    for entry in files:
        print(entry)

# CSV locations for the three dataset splits.
train_path = directory_path + '/train.csv'
test_path = directory_path + '/test.csv'
validation_path = directory_path + '/validation.csv'

Mounted at /content/drive
Files and directories in ' /content/drive/My Drive/Projects Things Useful/University/CU Denver 2023-2024 Sem 1/Deep Learning/Assignment_3_Data ' :
train.csv
validation.csv
test.csv
BERT_assignment_3_version_1.h5
BERT_assignment_3_version_3.h5


# Approach 1 (for fun not really for the assignment haha): RandomForestClassifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
import numpy as np

# Load the three splits. Only train and test are used by this baseline;
# validation_df is read but not consumed in this section.
train_df = pd.read_csv(directory_path + '/train.csv')
validation_df = pd.read_csv(directory_path + '/validation.csv')
test_df = pd.read_csv(directory_path + '/test.csv')

# TF-IDF features over the raw tweets, capped at 5000 terms. The vectorizer is
# fitted on the training tweets only and then reused on the test tweets.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['Tweet'])
# Every column from index 2 onward is treated as a label column.
y_train = train_df.iloc[:, 2:]

X_test = vectorizer.transform(test_df['Tweet'])
y_test = test_df.iloc[:, 2:]

# One random forest per label column. The fixed random_state makes the fitted
# forests (and therefore the recall numbers below) repeatable across runs.
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train, y_train)



In [None]:
# Score the baseline on the test tweets.
predictions = model.predict(X_test)

# One recall value per label column.
recall_scores = recall_score(y_test, predictions, average=None)
print('Recall scores per class:', recall_scores)

# Recall pooled across all label columns.
micro_recall = recall_score(y_test, predictions, average='micro')
print('Micro Recall:', micro_recall)

# Number of labels whose recall reaches at least 0.50.
count = sum(1 for score in recall_scores if score >= 0.50)
print(count,"of the",len(recall_scores),"labels have a recall over 50")



Recall scores per class: [0.48319709 0.01647059 0.41674249 0.55257732 0.59916782 0.24418605
 0.34470691 0.048      0.32708333 0.06470588 0.        ]
2 of the 11 labels have a recall over 50
Micro Recall: 0.3802262040920066


Yikes! 😱 We can probably do better with something else.

# Approach 2 (for real this time): LSTM

## Step 2.1 Lemmatizing/Removing Stopwords

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd

# Fetch the NLTK resources this section relies on: the WordNet corpus,
# the stopword lists, and the punkt tokenizer data.
for nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Single lemmatizer instance shared by preprocess_text.
lemmatizer = WordNetLemmatizer()

# Build the English stopword lookup once. Evaluating stopwords.words('english')
# inside the comprehension repeated that call for every token and turned each
# membership test into a list scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one tweet for the LSTM pipeline.

    Steps: lowercase, replace every non-alphabetic character with a space,
    tokenize with NLTK, drop English stopwords, lemmatize the remaining
    tokens, and join them back into a single space-separated string.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned tweet as a space-separated string of lemmatized tokens.
    """
    # Lowercasing
    text = text.lower()

    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    lemmatized_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in _ENGLISH_STOPWORDS
    ]

    # Reconstruct the sentence
    return ' '.join(lemmatized_tokens)

# Read the three splits from disk, then clean the 'Tweet' column of each
# one with preprocess_text.
df_train, df_test, df_validation = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, validation_path)
)
for split_frame in (df_train, df_test, df_validation):
    split_frame['Tweet'] = split_frame['Tweet'].apply(preprocess_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Step 2.2 Text Vectorization & Padding

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the word-index vocabulary on the cleaned training tweets only.
max_vocab = 10000  # Number of unique words to consider
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(df_train['Tweet'])

# Turn every split into integer sequences using that training vocabulary.
X_train_seq, X_test_seq, X_val_seq = (
    tokenizer.texts_to_sequences(frame['Tweet'])
    for frame in (df_train, df_test, df_validation)
)

# Use the 90th percentile of the training sequence lengths as the common length.
sequence_lengths = list(map(len, X_train_seq))
max_sequence_length = int(np.percentile(sequence_lengths, 90))

# Pad or truncate every sequence to max_sequence_length.
X_train_pad, X_test_pad, X_val_pad = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_seq, X_test_seq, X_val_seq)
)


## Step 2.3 Preparing the Labels

In [None]:
# Label matrices: every column from index 2 onward, as NumPy arrays.
# (The first two columns are assumed to be 'ID' and 'Tweet'.)
y_train, y_test, y_val = (
    frame.iloc[:, 2:].values for frame in (df_train, df_test, df_validation)
)

# Number of label columns, i.e. everything except the first two columns.
num_labels = len(df_train.columns) - 2
print("num_labels",num_labels)

num_labels 11


## Step 2.4 Building the LSTM Model

Building the Embedding Matrix with word2vec

In [None]:
import gensim.downloader as api

# Load Google's pre-trained Word2Vec model through the gensim downloader.
# The next cell treats these vectors as 300-dimensional (embedding_dim = 300).
word2vec_model = api.load("word2vec-google-news-300")



In [None]:
import numpy as np
import pickle

embedding_dim = 300  # Dimension of Google's Word2Vec embeddings
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding token

# Start from all zeros; rows for words without a Word2Vec vector stay zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pre-trained vector into the row matching each word's tokenizer index.
for token, row in tokenizer.word_index.items():
    if token not in word2vec_model:
        continue
    vector = word2vec_model[token]
    if vector is None:
        continue
    embedding_matrix[row] = vector


In [None]:
# Build the path with os.path.join. Plain string concatenation dropped the
# separator, so the file was written next to the data folder as
# '...Assignment_3_Dataembedding_matrix.pkl' instead of inside it.
pickle_file_path = os.path.join(directory_path, 'embedding_matrix.pkl')

# Write the embedding matrix to the pickle file
with open(pickle_file_path, 'wb') as file:
    pickle.dump(embedding_matrix, file)

print(f"Embedding matrix saved to {pickle_file_path}")

Embedding matrix saved to /content/drive/My Drive/Projects Things Useful/University/CU Denver 2023-2024 Sem 1/Deep Learning/Assignment_3_Dataembedding_matrix.pkl


Actual model architecture (based off of my quiz 5 model)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.metrics import Precision, Recall



# Stacked bidirectional-LSTM classifier for multi-label prediction.
# The embedding layer here is learned from scratch (128-dimensional,
# max_vocab rows); the word2vec embedding_matrix built earlier is not
# passed to it.
model = Sequential([
    Embedding(input_dim=max_vocab, output_dim=128, input_length=max_sequence_length),
    # First recurrent stage returns the full sequence for the next LSTM.
    Bidirectional(LSTM(units=128, return_sequences=True)),
    Dropout(0.3),
    # Second recurrent stage returns only its final output.
    Bidirectional(LSTM(units=64)),
    Dropout(0.3),
    Dense(units=128, activation='relu'),
    # One sigmoid unit per label, so each label is scored independently.
    Dense(units=num_labels, activation='sigmoid'),
])

# Binary cross-entropy over the label vector; precision and recall are tracked.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Precision(), Recall()])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 14, 128)           1280000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 14, 256)           263168    
 onal)                                                           
                                                                 
 dropout_542 (Dropout)       (None, 14, 256)           0         
                                                                 
 bidirectional_5 (Bidirecti  (None, 128)               164352    
 onal)                                                           
                                                                 
 dropout_543 (Dropout)       (None, 128)               0         
                                                                 
 dense_31 (Dense)            (None, 128)              

## Step 2.5 Training the model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Training configuration.
epochs = 30
batch_size = 32

# Stop when val_loss has not improved for 3 epochs, restoring the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit on the padded training sequences, validating on the validation split.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping],
    validation_data=(X_val_pad, y_val),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


Saving the model 🎊

In [None]:
#Version 1 best performing rn
model_filename = "LSTM_assignment_3_version_4.h5"
# Join with os.path.join: concatenating directory_path and a filename that has
# no leading slash glued the name onto the folder name, saving the model
# outside the data folder.
full_path = os.path.join(directory_path, model_filename)
model.save(full_path)

  saving_api.save_model(


## Step 2.6 Test the model

In [None]:
import tensorflow as tf
import numpy as np

# Reload the saved LSTM model from disk.
model = tf.keras.models.load_model(full_path)

# Evaluate on the padded test sequences. The model was compiled with
# metrics=[Precision(), Recall()], so evaluate() yields
# [loss, precision, recall]. There is no accuracy metric, which is why the
# second value is named test_precision rather than test_accuracy.
evaluation_results = model.evaluate(X_test_pad, y_test)
test_loss = evaluation_results[0]
test_precision = evaluation_results[1]
test_recall = evaluation_results[2]

# Get model predictions
predictions = model.predict(X_test_pad)
# Convert predictions to binary (0 or 1)
binary_predictions = np.round(predictions)




In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score

# Micro-averaged scores pool the decisions of every label before scoring.
micro_average = {'average': 'micro'}

micro_recall = recall_score(y_test, binary_predictions, **micro_average)
print("Micro Recall:", micro_recall)

micro_precision = precision_score(y_test, binary_predictions, **micro_average)
print("Micro Precision:", micro_precision)

micro_f1 = f1_score(y_test, binary_predictions, **micro_average)
print("Micro F1-Score:", micro_f1)

Micro Recall: 0.5061634261024273
Micro Precision: 0.6827219746314707
Micro F1-Score: 0.5813325549149821


Per-class recall

In [None]:
from sklearn.metrics import recall_score

# Predict on the test set and round each score to 0 or 1.
predictions = model.predict(X_test_pad)
binary_predictions = np.round(predictions)

# One recall value per label column.
class_recall = recall_score(y_test, binary_predictions, average=None)

# Label names are taken from the training frame, skipping its first two columns.
emotion_classes = df_train.columns[2:]
for emotion, recall in zip(emotion_classes, class_recall):
    print(f"Recall for {emotion}: {recall}")

# Count the labels whose recall reaches at least 0.5.
over_50 = sum(1 for _, score in zip(emotion_classes, class_recall) if score >= 0.5)
print(over_50,"of the",num_labels,"labels have a recall over 50")


Recall for anger: 0.620345140781108
Recall for anticipation: 0.002352941176470588
Recall for disgust: 0.5832575068243858
Recall for fear: 0.5010309278350515
Recall for joy: 0.6456310679611651
Recall for love: 0.27325581395348836
Recall for optimism: 0.6019247594050744
Recall for pessimism: 0.192
Recall for sadness: 0.6072916666666667
Recall for surprise: 0.0
Recall for trust: 0.0
6 of the 11 labels have a recall over 50


### LETS GO WE GOT 6 OF 11 🎊

# Approach 3: BERT Time 😈

## Step 3.1 Loading BERT Model & Tokenizer

In [None]:
from transformers import BertTokenizer, TFBertModel

# The tokenizer and the encoder are loaded from the same pre-trained checkpoint.
# NOTE(review): `tokenizer` rebinds the name that held the Keras Tokenizer in
# Approach 2, so the LSTM cells need that tokenizer re-created before re-running.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = TFBertModel.from_pretrained(bert_checkpoint)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

## Step 3.2 Tokenize and Prep Data for BERT!

In [None]:
# Re-read the raw (uncleaned) splits for the BERT pipeline.
train_df, validation_df, test_df = (
    pd.read_csv(directory_path + csv_name)
    for csv_name in ('/train.csv', '/validation.csv', '/test.csv')
)

# Token count of every training tweet, special tokens included.
tokenized_tweets = [tokenizer.encode(tweet, add_special_tokens=True) for tweet in train_df['Tweet']]
sequence_lengths = list(map(len, tokenized_tweets))

# Use the 90th percentile of those lengths, capped at 512 tokens.
percentile_90 = int(np.percentile(sequence_lengths, 90))
max_length = min(percentile_90, 512)

def encode_tweets(tweets):
    """Tokenize a list of tweets into fixed-length inputs for BERT.

    Uses the module-level ``tokenizer``; every tweet is padded or truncated
    to the module-level ``max_length`` and returned as TensorFlow tensors.

    Args:
        tweets: List of tweet strings.

    Returns:
        The ``.data`` mapping of the batch encoding. Callers select the keys
        they need from it (e.g. ``input_ids`` and ``attention_mask``).
    """
    batch = tokenizer.batch_encode_plus(
        tweets,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    return batch.data


# Encode the 'Tweet' column of every split with encode_tweets.
train_encodings, test_encodings, validation_encodings = (
    encode_tweets(frame['Tweet'].tolist())
    for frame in (train_df, test_df, validation_df)
)


## Step 3.3 Prepare the Labels

In [None]:
# Label matrices for the BERT model: every column from index 2 onward.
y_train, y_test, y_val = (
    frame.iloc[:, 2:].values for frame in (train_df, test_df, validation_df)
)

## Step 3.4 Create TensorFlow Datasets

In [None]:
# Keys of the encoding that are fed to the model.
_MODEL_INPUT_KEYS = ['input_ids', 'attention_mask']


def _as_model_inputs(encodings):
    """Return {key: NumPy array} for the model input keys found in `encodings`."""
    return {key: val.numpy() for key, val in encodings.items() if key in _MODEL_INPUT_KEYS}


# Prepare the data for training, validation and testing.
train_inputs = _as_model_inputs(train_encodings)
validation_inputs = _as_model_inputs(validation_encodings)
test_inputs = _as_model_inputs(test_encodings)




## Step 3.5 Build the BERT-Based Model

In [None]:
pip install tensorflow-addons


Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/611.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/611.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3


In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1_l2
from transformers import TFBertModel
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf

# BERT inputs: token ids and attention mask, each of length max_length.
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_masks = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Load the pre-trained encoder and freeze every layer, so only the
# classification head defined below is trained.
bert = TFBertModel.from_pretrained('bert-base-uncased')
for layer in bert.layers:
    layer.trainable = False

# # Unfreeze the top N layers
# N = 1  # Define the number of layers to unfreeze
# for layer in bert.layers[-N:]:
#     layer.trainable = True

# Run the encoder; index 1 of its output is the pooled output used for classification.
bert_outputs = bert(input_ids, attention_mask=attention_masks)
bert_output = bert_outputs[1]

# Classification head: three Dense -> LayerNormalization -> Dropout stages
# (128, 64, 32 units), each Dense with L1/L2 kernel regularization.
x = bert_output
for units in (128, 64, 32):
    x = Dense(units, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(x)
    x = LayerNormalization()(x)
    x = Dropout(0.1)(x)

# Output layer: one sigmoid unit per label column. The width is taken from
# y_train instead of a hard-coded 11 so it follows the dataset.
output = Dense(y_train.shape[1], activation='sigmoid')(x)

# Construct the model
model = Model(inputs=[input_ids, attention_masks], outputs=output)

# Plain Adam optimizer (not AdamW) with a small learning rate.
optimizer = Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[Precision(), Recall()])

# Add a learning rate scheduler
def scheduler(epoch, lr):
    """Learning-rate schedule for Keras' LearningRateScheduler callback.

    Keeps the learning rate unchanged for the first 10 epochs (epoch index
    0-9), then multiplies the current rate by exp(-0.1) on every later epoch.

    Args:
        epoch: Zero-based epoch index supplied by the callback.
        lr: Current learning rate.

    Returns:
        The learning rate to use for this epoch. For epochs >= 10 this is a
        plain Python float; the previous tf.math.exp version returned a
        TensorFlow tensor, which is not the float the callback expects.
    """
    import math  # local import keeps this cell self-contained

    if epoch < 10:
        return lr
    return float(lr * math.exp(-0.1))

# Wrap the schedule function in a Keras callback.
# NOTE(review): the training cell passes only early_stopping to model.fit, so
# this callback has no effect unless it is added to that callbacks list.
callback = LearningRateScheduler(scheduler)

# Print the layer-by-layer summary of the assembled model.
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 40)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 40)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_11 (TFBertMo  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 del)                        ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 40, 76                                     

## Step 3.6 Train the BERT model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import load_model
# Training configuration.
epochs = 50
batch_size = 32

# Stop when val_loss has not improved for 5 epochs, restoring the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the classification head on the encoded training tweets.
history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, y_val),
)

# Save the model inside the data folder. model_filename carries its own
# leading slash, so plain concatenation yields a valid path here.
# Version 6 currently best performing
model_filename = "/BERT_assignment_3_version_7.h5"
full_path = directory_path + model_filename
model.save(full_path)
print("Training Complete")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


Training Complete


## Step 3.7 Evaluating on Test Set

In [None]:
from tensorflow.keras.models import load_model
from transformers import TFBertModel
from sklearn.metrics import recall_score
import numpy as np

# To evaluate an earlier checkpoint instead, re-point full_path at it, e.g.:
# model_filename = "/BERT_assignment_3_version_4.h5"
# full_path = directory_path + model_filename

# The saved model contains a TFBertModel layer, so it is registered as a
# custom object for loading.
custom_objects = {"TFBertModel": TFBertModel}

print("Loading from", full_path)
with tf.keras.utils.custom_object_scope(custom_objects):
    model = load_model(full_path)

# Predict on the encoded test tweets and round each score to 0 or 1.
test_predictions = model.predict(test_inputs)
binary_predictions = np.round(test_predictions)

Loading from /content/drive/My Drive/Projects Things Useful/University/CU Denver 2023-2024 Sem 1/Deep Learning/Assignment_3_Data/BERT_assignment_3_version_7.h5






In [None]:

# One recall value per label column.
class_recalls = recall_score(y_test, binary_predictions, average=None)

# Label names are taken from the training frame, skipping its first two columns.
emotion_classes = train_df.columns[2:]
for emotion, recall in zip(emotion_classes, class_recalls):
    print(f"Recall for {emotion}: {recall}")

# Count the labels whose recall reaches at least 0.50.
over_50 = sum(1 for score in class_recalls if score >= 0.50)
print(over_50,"of the",len(class_recalls),"labels have a recall over 50")



Recall for anger: 0.5967302452316077
Recall for anticipation: 0.0
Recall for disgust: 0.6069153776160146
Recall for fear: 0.016494845360824743
Recall for joy: 0.7337031900138696
Recall for love: 0.0755813953488372
Recall for optimism: 0.6202974628171478
Recall for pessimism: 0.0
Recall for sadness: 0.18020833333333333
Recall for surprise: 0.0
Recall for trust: 0.0
4 of the 11 labels have a recall over 50
