In [67]:
# Mount Google Drive at ./gdrive; the dumps, data and model used by later
# cells are all read from paths under ./gdrive/MyDrive.
from google.colab import drive
drive.mount('./gdrive')

Drive already mounted at ./gdrive; to attempt to forcibly remount, call drive.mount("./gdrive", force_remount=True).


In [None]:
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoModelForSequenceClassification, TrainingArguments
import pandas as pd

In [None]:
import os
import pickle
from collections import defaultdict
from pathlib import Path

DUMP_DIR = "./gdrive/MyDrive/dumps"

# exist_ok=True makes this a no-op when the directory is already there, so no
# separate existence check is needed.
os.makedirs(DUMP_DIR, exist_ok=True)

path = DUMP_DIR + '/adv-text.dat'

# The dump has to be generated beforehand. Raise a real exception type:
# raising a bare string is itself a TypeError in Python 3 and would hide the
# intended message.
if not Path(path).is_file():
    raise FileNotFoundError("File not found. Please generate adversarial text first")

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load dumps that were produced by a trusted run.
with open(path, "rb") as f:
    adversarial_texts = pickle.load(f)
len(adversarial_texts)

7779

In [None]:
# Peek at the first two records to see how each entry is laid out.
adversarial_texts[:2]

[[' sooo melancholic i will miss you here in san clemente!!! ',
  0.9918066,
  0,
  -1,
  ' sooo sad i will miss you here in san diego!!! '],
 ['my boss is bullying me... ',
  0.9966323,
  0,
  -1,
  'my boss is bullying me... ']]

## Load Tweets

In [None]:
# Build two parallel lists from the dump: field 4 of each record goes to the
# "original" list and field 0 to the "adversarial" list. Records whose field 3
# equals -1 are left out of both.
# NOTE(review): -1 presumably marks records without a usable adversarial
# example -- confirm against the code that generated the dump.
original_neg_sentences = []
adv_sentences = []
for record in adversarial_texts:
    if record[3] == -1:
        continue
    original_neg_sentences.append(record[4])
    adv_sentences.append(record[0])
len(original_neg_sentences), len(adv_sentences)

(5046, 5046)

In [None]:
# Load the preprocessed tweets; later cells read the `text` and `label` columns.
df = pd.read_csv('./gdrive/MyDrive/data/tweets_preprocessed.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,label
0,cb774db0d1,"i`d have responded, if i were going","I`d have responded, if I were going",neutral,1
1,549e992a42,sooo sad i will miss you here in san diego!!!,Sooo SAD,negative,0
2,088c60f138,my boss is bullying me...,bullying me,negative,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,0
4,358bd9e861,"sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0


In [None]:
# Spot-check one tweet before the adversarial substitution; the same row is
# displayed again after the replacement cell.
df.iloc[11415]['text']

'1am, quietly tidying up after dinner party. whilst closing tupperware lid, centre island in my kitchen collapses! wakes baby. mayhem ensues '

In [None]:
# Replace each tweet that has an adversarial counterpart with that rewrite;
# tweets without one keep their original text.

# Map original sentence -> adversarial sentence. setdefault keeps the first
# occurrence of a duplicated original, which is the entry list.index() would
# have picked. The two lists were already filtered to usable records, so no
# further per-row check is needed here.
adv_by_original = {}
for original_sentence, adversarial_sentence in zip(original_neg_sentences, adv_sentences):
    adv_by_original.setdefault(original_sentence, adversarial_sentence)

# Exactly one entry per row, so the result always lines up with df.
newText = [adv_by_original.get(text, text) for text in df['text']]

df['text'] = newText

In [None]:
# Same row as inspected before the substitution, to compare the text after replacement.
df.iloc[11415]['text']

'1am, peaceably tidying up after dinner party. whilst closing tupperware lid, centre island in my kitchen crumble! wakes baby. mayhem ensues '

# Load model

In [None]:
# NOTE(review): these imports repeat names already imported in the import cell
# near the top of the notebook.
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# The tokenizer is loaded by checkpoint name, while the classifier weights are
# read from a directory on the mounted Drive.
# NOTE(review): presumably the Drive model was fine-tuned from that checkpoint
# so both share one vocabulary -- confirm.
tokenizer = AutoTokenizer.from_pretrained("MarieAngeA13/Sentiment-Analysis-BERT")
model = TFAutoModelForSequenceClassification.from_pretrained("./gdrive/MyDrive/tweet_classification")

model

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


<transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification at 0x7e589d3117b0>

In [None]:
# Cast every value in `text` to str so each entry is a string.
# NOTE(review): depending on the pandas version, missing values may become the
# literal string 'nan' here -- confirm the column has none.
df['text'] = df['text'].astype('str')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27480 entries, 0 to 27479
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
 4   label          27480 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


Now define the `predict` function, which returns the model's predicted class for a single text.

In [None]:
def predict(model, tokenizer, text):
    """Return the predicted class index for a single text.

    The text is tokenized to a fixed length of 128 (padded / truncated), the
    token ids are turned into embeddings by looking them up in the model's
    embedding table, and the model is called on those embeddings.

    Args:
        model: Classifier exposing ``model.bert.embeddings.weights`` and
            callable with ``inputs_embeds`` / ``attention_mask``, returning an
            object with a ``logits`` attribute.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` as TensorFlow tensors.
        text: The text to classify.

    Returns:
        Index of the largest logit for the first encoded sequence.
    """
    # First weight of the embeddings layer; its first dimension is the
    # vocabulary size (presumably the word-embedding table -- confirm).
    embedding_matrix = model.bert.embeddings.weights[0]
    encoded_tokens = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="tf")

    # Only the first encoded sequence is scored; keep the batch axis.
    token_ids = encoded_tokens["input_ids"][:1]

    # Row lookup selects the same embedding rows as multiplying a one-hot
    # encoding by the table, without allocating a dense (1, 128, vocab) tensor
    # on every call. Per the tf.gather docs, an id outside the table raises on
    # CPU (and yields zeros on GPU) instead of always mapping to zeros.
    inputs_embeds = tf.gather(embedding_matrix, token_ids)

    pred_scores = model({"inputs_embeds": inputs_embeds, "attention_mask": encoded_tokens["attention_mask"]}).logits
    max_class = tf.argmax(pred_scores, axis=1).numpy()[0]

    return max_class

In [None]:
pip install datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3,
    random_state=42,
)

In [None]:
from datasets import Dataset, DatasetDict


def _to_dataset(texts, labels):
    """Pair texts with labels in a two-column frame and wrap it as a Dataset."""
    frame = pd.DataFrame(zip(texts, labels), columns=["text", "label"])
    return Dataset.from_pandas(frame)


# Bundle both splits under the "train" / "test" keys.
ds = DatasetDict(
    {
        "train": _to_dataset(X_train, y_train),
        "test": _to_dataset(X_test, y_test),
    }
)

ds["train"][10]

{'text': "ninja sushi for lunch but dominic's was out of sour gummy worms ",
 'label': 1}

In [None]:
# Sanity-check `predict` on the first test example.
# NOTE(review): if the saved output of this cell shows a tuple, it predates the
# current `predict`, which returns only the class index -- re-run the cell.
predict(model, tokenizer, ds['test']['text'][0])

(2, 0.00034027753)

In [None]:
# Ground-truth label of the same first test example, to compare with the prediction.
ds['test']['label'][0]

2

In [None]:
def calculate_accuracy(model, tokenizer, dataset):
    """Return the fraction of examples in ``dataset`` that ``predict`` labels correctly.

    Args:
        model: Classifier forwarded unchanged to ``predict``.
        tokenizer: Tokenizer forwarded unchanged to ``predict``.
        dataset: Mapping-like object whose ``"text"`` and ``"label"`` entries
            are equally long sequences.

    Returns:
        Correct predictions divided by the number of samples, or ``0.0`` when
        the dataset is empty.
    """
    # Read each column once: looking the column up inside the loop may
    # re-materialise it on every iteration, which makes the loop quadratic.
    texts = dataset["text"]
    labels = dataset["label"]

    correct_predictions = 0
    total_samples = len(labels)

    for i, (text, label) in enumerate(zip(texts, labels)):
        # Make prediction
        predicted_class = predict(model, tokenizer, text)

        # Check accuracy
        if predicted_class == label:
            correct_predictions += 1
        # Progress line on the 1st, 51st, 101st, ... sample.
        if i % 50 == 0:
            print(f"{i + 1} items prediced")
    print(f'correct predictions: {correct_predictions}')
    print(f'total samples: {total_samples}')

    # Guard the division so an empty dataset yields 0.0 instead of raising.
    accuracy = correct_predictions / total_samples if total_samples else 0.0
    return accuracy

In [66]:
# Score the model on the whole test split. `predict` is called once per
# example, so this can take a while.
accuracy = calculate_accuracy(model, tokenizer, ds['test'])

1 items prediced
51 items prediced
101 items prediced
151 items prediced
201 items prediced
251 items prediced
301 items prediced
351 items prediced
401 items prediced
451 items prediced
501 items prediced
551 items prediced
601 items prediced
651 items prediced
701 items prediced
751 items prediced
801 items prediced
851 items prediced
901 items prediced
951 items prediced
1001 items prediced
1051 items prediced
1101 items prediced
1151 items prediced
1201 items prediced
1251 items prediced
1301 items prediced
1351 items prediced
1401 items prediced
1451 items prediced
1501 items prediced
1551 items prediced
1601 items prediced
1651 items prediced
1701 items prediced
1751 items prediced
1801 items prediced
1851 items prediced
1901 items prediced
1951 items prediced
2001 items prediced
2051 items prediced
2101 items prediced
2151 items prediced
2201 items prediced
2251 items prediced
2301 items prediced
2351 items prediced
2401 items prediced
2451 items prediced
2501 items prediced
255

In [None]:
# Fraction of test examples classified correctly. The split was drawn after
# the adversarial substitution, so it includes rewritten tweets.
accuracy

0.7029702970297029