# Sentiment Analysis with BERT

In [None]:
# Reference for our model: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671#:~:text=Sentiment%20Analysis%20with%20BERT&text=Load%20the%20BERT%20Classifier%20and,with%20the%20Fine%2Dtuned%20Model

In [None]:
# Installing transformers
pip install transformers

In [None]:
# Setup
# We will build our model with the pre-trained BERT tokenizer and sequence classifier.
import csv
import os
from random import shuffle

import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

In [None]:
# Mount Google Drive at /content/drive; every dataset/output path configured
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Setting file paths for training data and testing data.
# The shared Drive prefix is written once so that relocating the project only
# requires editing a single line.
DRIVE_ROOT = "/content/drive/MyDrive/Colab Notebooks"
DATASET_DIR = DRIVE_ROOT + "/twitter-datasets"

pos_file_path = DATASET_DIR + "/train_pos.txt"
neg_file_path = DATASET_DIR + "/train_neg.txt"
test_file_path = DATASET_DIR + "/test_data.txt"
# Trailing slash is intentional: the output file name is appended by plain
# string concatenation.
output_dir = DRIVE_ROOT + "/output/"

In [None]:
def _read_lines(path):
    """Return every line of the text file at ``path`` as a list of strings.

    Lines are returned exactly as read, including their trailing newline.
    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine running the notebook.
    """
    with open(path, encoding="utf-8") as f:
        return list(f)


documents1 = _read_lines(pos_file_path)  # lines of the positive-tweets file
documents2 = _read_lines(neg_file_path)  # lines of the negative-tweets file

In [None]:
# Label every positive tweet with 1 and every negative tweet with 0.
# (The provided files are expected to hold 100000 tweets each, but the label
# columns are sized from the data so a different line count does not crash.)
def _labelled_rows(docs, label):
    """Return an ``(len(docs), 2)`` object array of ``[text, label]`` rows.

    An object array is used on purpose: stacking a string array with an int
    array (e.g. via ``np.c_``) would silently turn the integer labels into
    strings.
    """
    rows = np.empty((len(docs), 2), dtype=object)
    rows[:, 0] = docs
    rows[:, 1] = label
    return rows


pos = _labelled_rows(documents1, 1)
neg = _labelled_rows(documents2, 0)

data = np.concatenate((pos, neg))

# Seeded shuffle (rows are permuted in place) so the train/validation split
# made from `data` is reproducible across kernel restarts.
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(data)

In [None]:
# Splitting data into training set and testing set:
# the first 80% of the rows are used for training, the remainder for testing.
split_at = int(0.8 * len(data))
train, test = data[:split_at], data[split_at:]

# Convert each split to a two-column Pandas dataframe (text, label)
train_feat, train_lab = train[:, 0], train[:, 1]
train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

test_feat, test_lab = test[:, 0], test[:, 1]
test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

In [None]:
# Build model using pre-trained BERT tokenizer and sequence classifier.
# Both are loaded from the same checkpoint name.
checkpoint = "bert-base-uncased"
model = TFBertForSequenceClassification.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [None]:
# Summary of our BERT model
# (prints the summary of the `model` object loaded from the pre-trained checkpoint)
model.summary()

In [None]:
# Converts each row of our datasets into an InputExample object
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Turn every row of ``train`` and ``test`` into an ``InputExample``.

    Args:
        train: dataframe whose rows become the training examples.
        test: dataframe whose rows become the validation examples.
        DATA_COLUMN: name of the column holding the text (used as ``text_a``).
        LABEL_COLUMN: name of the column holding the label.

    Returns:
        A pair ``(train_InputExamples, validation_InputExamples)``, each the
        result of a row-wise ``DataFrame.apply``.
    """
    def row_to_example(row):
        # Single-sentence task: there is no second text segment and no guid.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    train_InputExamples = train.apply(row_to_example, axis=1)
    validation_InputExamples = test.apply(row_to_example, axis=1)
    return train_InputExamples, validation_InputExamples

In [None]:
# Creates an input dataset that can be fed into the model
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects with a ``text_a`` (text to encode) and a
            ``label`` attribute.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: every sequence is truncated and padded to exactly this
            many tokens.

    Returns:
        A dataset whose elements are ``(inputs, label)`` where ``inputs`` is a
        dict with the int32 vectors ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and ``label`` is an int64 scalar.
    """
    features = []

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                # The dataset declares labels as int64; cast here so labels
                # that arrive as numeric strings or NumPy scalars are accepted.
                label=int(e.label),
            )
        )

    def gen():
        # Yield one (inputs, label) pair per tokenized example.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

# Names of the text and label columns of the `train` / `test` dataframes.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [None]:
# Configuring the BERT model
## Optimizer: Adam
## Accuracy metric: SparseCategoricalAccuracy
## Loss function: SparseCategoricalCrossentropy (computed from logits)

BATCH_SIZE = 64
EPOCHS = 2

# No explicit `tf.device(...)` pin: TensorFlow chooses the device itself.
# Errors are deliberately not caught here, so a failed training run stops the
# notebook instead of letting later cells predict with an untrained model.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# NOTE: `repeat(2)` makes one Keras epoch cover the training data twice, so
# with EPOCHS = 2 the model sees every example four times in total.
train_data = train_data.shuffle(100).batch(BATCH_SIZE).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(BATCH_SIZE)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)
model.fit(train_data, epochs=EPOCHS, validation_data=validation_data)

In [None]:
# Actual testing data
# Everything up to and including the first comma of each line is dropped
# (presumably a leading "<id>," prefix -- confirm against the data file); a
# line without a comma becomes an empty string.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# locale of the machine running the notebook.
with open(test_file_path, encoding="utf-8") as f:
    documents3 = [line.partition(',')[2] for line in f]

In [None]:
# Making predictions using our model.
# Tweets are scored in batches instead of one forward pass per tweet.
PREDICT_BATCH_SIZE = 64
labels = []  # predicted class index (0 or 1) per tweet, as plain Python ints

for start in range(0, len(documents3), PREDICT_BATCH_SIZE):
    print(start)  # progress: index of the first tweet in the current batch
    batch_texts = documents3[start:start + PREDICT_BATCH_SIZE]
    # padding=True pads each batch to its longest sequence
    tf_batch = tokenizer(batch_texts, max_length=128, padding=True, truncation=True, return_tensors='tf')
    tf_output = model(tf_batch)
    # tf_output[0] holds the logits; softmax is monotonic, so taking the
    # argmax of the logits selects the same class as argmax of the softmax.
    batch_labels = tf.argmax(tf_output[0], axis=1).numpy()
    labels.extend(int(label) for label in batch_labels)

# Preview only; the full list has one entry per test tweet.
labels[:10]

In [None]:
# Create submission
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in .csv format for submission.

    Writes a header row ``Id,Prediction`` followed by one row per
    ``(id, prediction)`` pair; iteration stops at the shorter of the two
    inputs.

    Arguments: ids (event ids; scalars or one-element arrays convertible to int)
          y_pred (predicted labels; scalars or one-element arrays convertible to int)
          name (string name of .csv output file to be created)
    Raises: ValueError if an id or prediction is an array with more than one element
    """
    # newline='' is required by the csv module so that it controls the line
    # endings itself (otherwise extra blank rows appear on some platforms).
    with open(name, 'w', newline='') as csvfile:
        names = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=names)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            # .item() unwraps NumPy scalars and one-element arrays explicitly;
            # calling int() directly on an array is deprecated in NumPy.
            writer.writerow({
                'Id': int(np.asarray(r1).item()),
                'Prediction': int(np.asarray(r2).item()),
            })

# Map the predicted class indices {0, 1} to the submission labels {-1, +1}.
# reshape(-1) flattens the predictions whether `labels` holds plain ints or
# one-element arrays, so every entry of y_pred ends up a scalar.
y_pred = np.asarray(labels).reshape(-1) * 2 - 1
y_pred = y_pred.tolist()

# Create the output folder if needed so the write does not fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)
create_csv_submission(np.arange(1, len(y_pred)+1), y_pred, output_dir+'submission_bert_100000.csv')

# Preview only; the full list has one entry per test tweet.
y_pred[:10]