# **Homework 12 - Bert LLM**
# KDD Tuesdays 12:30 PM - 2:45 PM
## Jake Brulato

## **Extract Data from SNLI Tar file**

In [1]:
import tarfile
import pandas as pd
import os

# Extract the SNLI dataset
tar_path = 'SNLI_Corpus.tar.gz'
extract_path = 'SNLI_Corpus'
with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall(path=extract_path)

# List files in the extracted directory
os.listdir(extract_path)


['SNLI_Corpus']

In [2]:
# List files inside the SNLI_Corpus folder
os.listdir(os.path.join(extract_path, 'SNLI_Corpus'))

['snli_1.0_dev.csv', 'snli_1.0_test.csv', 'snli_1.0_train.csv']

## **Assign the listed files to Train and Validation Data**

In [3]:
# Load training and development data
train_path = os.path.join(extract_path, 'SNLI_Corpus', 'snli_1.0_train.csv')
dev_path = os.path.join(extract_path, 'SNLI_Corpus', 'snli_1.0_dev.csv')

train_df = pd.read_csv(train_path)
dev_df = pd.read_csv(dev_path)

# Preview the training data
train_df.head()

Unnamed: 0,similarity,sentence1,sentence2
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,entailment,Children smiling and waving at camera,There are children present


## **Filter for 10,000 Samples**

In [4]:
# Filter the data for valid labels and limit to the first 10,000 samples
valid_labels = ['entailment', 'contradiction', 'neutral']
train_df = train_df[train_df['similarity'].isin(valid_labels)].head(10000)

# Encode labels as categorical integers
label_mapping = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
train_df['label'] = train_df['similarity'].map(label_mapping)

# Check the size of the filtered training data and display the first few rows
train_df.shape, train_df.head()

((10000, 4),
       similarity                                          sentence1  \
 0        neutral  A person on a horse jumps over a broken down a...   
 1  contradiction  A person on a horse jumps over a broken down a...   
 2     entailment  A person on a horse jumps over a broken down a...   
 3        neutral              Children smiling and waving at camera   
 4     entailment              Children smiling and waving at camera   
 
                                            sentence2  label  
 0  A person is training his horse for a competition.      2  
 1      A person is at a diner, ordering an omelette.      1  
 2                  A person is outdoors, on a horse.      0  
 3                  They are smiling at their parents      2  
 4                         There are children present      0  )

## **Check to see if Tensorflow and Transformers are installed**

In [5]:
pip install tensorflow transformers

Note: you may need to restart the kernel to use updated packages.


## **Map the loaded datasets and then tokenize from the pre-trained bert-base-uncased**

In [6]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Load the dataset
train_df = pd.read_csv('/Users/jakebrulato/Documents/GitHub/KDD/Homework_12/SNLI_Corpus/SNLI_Corpus/snli_1.0_train.csv')
train_df = train_df[train_df['similarity'].isin(['entailment', 'contradiction', 'neutral'])].head(10000)

# Map labels to integers
label_mapping = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
train_df['label'] = train_df['similarity'].map(label_mapping)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def bert_encode(sentences, max_len=128):
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent, add_special_tokens=True, max_length=max_len, pad_to_max_length=True,
            return_attention_mask=True, return_tensors='tf')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return input_ids, attention_masks

sentence1 = bert_encode(train_df['sentence1'].values)
sentence2 = bert_encode(train_df['sentence2'].values)
labels = train_df['label'].values


  from .autonotebook import tqdm as notebook_tqdm
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2024-04-16 13:45:04.246259: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## **Create the BiLSTM Layer and Classifer for model**

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional, TimeDistributed, AveragePooling1D, Flatten
from transformers import TFBertModel

# Load pretrained BERT
bert = TFBertModel.from_pretrained('bert-base-uncased')
bert.trainable = False  # Freeze BERT

# Inputs
input_ids = Input(shape=(128,), dtype='int32')
attention_masks = Input(shape=(128,), dtype='int32')

# BERT embeddings
embeddings = bert(input_ids, attention_mask=attention_masks)[0]

# BiLSTM layer
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embeddings)
avg_pool = AveragePooling1D(pool_size=4, strides=1)(bilstm)
flat = Flatten()(avg_pool)

# Classifier
classifier = Dense(3, activation='softmax')(flat)

# Construct model
model = Model(inputs=[input_ids, attention_masks], outputs=classifier)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summarize model
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_1[0][0]',             
 )                           ngAndCrossAttentions(last_   40         'input_2[0][0]']             
                             hidden_state=(None, 128, 7                                           
                             68),                                                             

## **Train the model for Train and Validation Accuracy Loss**

In [8]:
import tensorflow as tf

def bert_encode(sentences, tokenizer, max_len=128):
    all_input_ids = []
    all_attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf',  # Make sure to use 'tf' to get TensorFlow tensors directly
        )
        
        all_input_ids.append(encoded_dict['input_ids'][0])  # Access the tensor inside the batch
        all_attention_masks.append(encoded_dict['attention_mask'][0])

    # Convert lists to tensors
    all_input_ids = tf.stack(all_input_ids, axis=0)
    all_attention_masks = tf.stack(all_attention_masks, axis=0)

    return all_input_ids, all_attention_masks

# Usage
input_ids, attention_masks = bert_encode(train_df['sentence1'].values, tokenizer)
labels = train_df['label'].values

print('Input IDs shape:', input_ids.shape)
print('Attention Masks shape:', attention_masks.shape)
print('Labels shape:', labels.shape)

# If shapes are correct, proceed to train
history = model.fit([input_ids, attention_masks], labels, batch_size=32, epochs=1, validation_split=0.1)


Input IDs shape: (10000, 128)
Attention Masks shape: (10000, 128)
Labels shape: (10000,)


## **Print Desired Outputs**

In [9]:
print("Training Loss:", history.history['loss'])
print("Training Accuracy:", history.history['accuracy'])
print("Validation Loss:", history.history['val_loss'])
print("Validation Accuracy:", history.history['val_accuracy'])


Training Loss: [1.1252553462982178]
Training Accuracy: [0.30666667222976685]
Validation Loss: [1.0991119146347046]
Validation Accuracy: [0.33500000834465027]


In [10]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional, AveragePooling1D, Flatten
from sklearn.model_selection import train_test_split

# Load the dataset
train_df = pd.read_csv('/Users/jakebrulato/Documents/GitHub/KDD/Homework_12/SNLI_Corpus/SNLI_Corpus/snli_1.0_train.csv')
train_df = train_df[train_df['similarity'].isin(['entailment', 'contradiction', 'neutral'])].head(10000)

# Map labels to integers
label_mapping = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
train_df['label'] = train_df['similarity'].map(label_mapping)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to encode sentences
def bert_encode(sentences, tokenizer, max_len=128):
    all_input_ids = []
    all_attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf'
        )
        all_input_ids.append(encoded_dict['input_ids'][0])
        all_attention_masks.append(encoded_dict['attention_mask'][0])

    all_input_ids = tf.stack(all_input_ids, axis=0)
    all_attention_masks = tf.stack(all_attention_masks, axis=0)

    return all_input_ids, all_attention_masks

# Encode the sentences
input_ids, attention_masks = bert_encode(train_df['sentence1'].values, tokenizer)
labels = train_df['label'].values

# Load the pretrained BERT model
bert = TFBertModel.from_pretrained('bert-base-uncased')
bert.trainable = False  # Freeze BERT

# Define the model architecture
input_ids_layer = Input(shape=(128,), dtype='int32', name='input_ids')
attention_masks_layer = Input(shape=(128,), dtype='int32', name='attention_masks')

# BERT embeddings
embeddings = bert(input_ids_layer, attention_mask=attention_masks_layer)[0]

# BiLSTM layer
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embeddings)
avg_pool = AveragePooling1D(pool_size=4, strides=1)(bilstm)
flat = Flatten()(avg_pool)

# Classifier
classifier = Dense(3, activation='softmax')(flat)

# Construct the model
model = Model(inputs=[input_ids_layer, attention_masks_layer], outputs=classifier)

# Compile the model
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Training the model
history = model.fit([input_ids, attention_masks], labels, 
                    batch_size=32, epochs=1, validation_split=0.1)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBer



In [11]:
print("Training Loss:", history.history['loss'])
print("Training Accuracy:", history.history['accuracy'])
print("Validation Loss:", history.history['val_loss'])
print("Validation Accuracy:", history.history['val_accuracy'])

Training Loss: [1.130107045173645]
Training Accuracy: [0.3118889033794403]
Validation Loss: [1.0989935398101807]
Validation Accuracy: [0.3310000002384186]
