In [2]:
!pip install transformers
!pip install xlsxwriter

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 6.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 1.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 74.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 33.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments
from transformers import DistilBertTokenizerFast, BertForMaskedLM
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
from torch import nn
from transformers import Trainer

# from dataset import load_metric

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import xlsxwriter


import time

In [4]:
class SarcasimDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per sample.

    Args:
        encodings: mapping of field name -> per-sample values (indexed by sample),
            e.g. the object returned by a tokenizer call.
        labels: sequence of labels, indexed by sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return a fresh tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning; for tensors the
        recommended equivalent is ``clone().detach()``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for sample ``idx`` plus its ``labels``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)
    
## Test Dataset
class SarcasimTestDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings only (no labels).

    Args:
        encodings: mapping of field name -> per-sample values (indexed by sample).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict with every encoding field for sample ``idx``."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            # torch.tensor(existing_tensor) warns; clone().detach() is the
            # recommended way to copy a tensor.
            if isinstance(value, torch.Tensor):
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Number of samples.

        ``len(self.encodings)`` would count the encoding *fields*, so the length
        of the first field's per-sample container is used instead.
        """
        first_field = next(iter(self.encodings.values()), None)
        return 0 if first_field is None else len(first_field)

In [5]:
def compute_metrics(p):
    """Score a batch of predictions against their labels.

    Args:
        p: a pair that unpacks into (prediction scores, true labels); the
            predicted class is the argmax of the scores along axis 1.

    Returns:
        dict with keys ``accuracy``, ``f1_score``, ``recall`` and ``precision``;
        the last three use weighted averaging.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Shared keyword arguments for the weighted-average metrics.
    weighted = {"y_true": labels, "y_pred": predicted, "average": 'weighted'}

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
        "f1_score": f1_score(labels, predicted, average='weighted'),
        "recall": recall_score(**weighted),
        'precision': precision_score(**weighted),
    }

In [None]:
from random import sample, random, shuffle
from math import ceil
import json

class TextMutant():
    """Random text augmentation: drop words, swap in synonyms, shuffle word order.

    Sentences are split on whitespace and re-joined with single spaces.
    All randomness comes from the module-level ``random`` generator.

    Attributes:
        synoynms: mapping loaded from the synonym JSON file; it is looked up by
            word and each value is used as a list of replacement candidates.
    """

    def __init__(self, synm_filepath='/content/drive/MyDrive/isarcasm/isarcasm_datasets/synm3.json'):
        """Load the synonym table.

        Args:
            synm_filepath: path of the JSON synonym file. Defaults to the
                location on Google Drive this notebook was written against.
        """
        with open(synm_filepath, encoding='utf-8') as json_file:
            self.synoynms = json.load(json_file)

    def remove_words(self, sentence, mode = "k-random", k = 0.1, prob = 0.1):
        """Return ``sentence`` with some words removed.

        Args:
            sentence: input text.
            mode: ``"k-random"`` removes ``round(len(words) * k)`` randomly chosen
                words; ``"prob"`` removes each word independently with
                probability ``prob``.
            k: fraction of words to remove in ``"k-random"`` mode.
            prob: per-word removal probability in ``"prob"`` mode.

        Raises:
            ValueError: if ``mode`` is not one of the two supported values
                (previously this silently produced an empty string).
        """
        word_list = sentence.split()
        if mode == "prob":
            return " ".join(word for word in word_list if not random() < prob)
        if mode == "k-random":
            num = round(len(word_list) * k)
            # A set gives O(1) membership tests while filtering.
            dropped = set(sample(range(len(word_list)), num))
            return " ".join(word for i, word in enumerate(word_list) if i not in dropped)
        raise ValueError(f"unknown mode {mode!r}; expected 'prob' or 'k-random'")

    def shuffle_words(self, sentence, prob = 0.1):
        """With probability ``prob`` return ``sentence`` with its words shuffled.

        Otherwise the words are returned in their original order.
        """
        word_list = sentence.split()
        if random() < prob:
            shuffle(word_list)
        return " ".join(word_list)

    def replace_words(self, sentence, mode = "k-random", k = 0.1, prob = 0.1):
        """Return ``sentence`` with some words replaced by a random synonym.

        Words without any synonym candidates are always left unchanged. The
        synonym table is only read, never modified.

        Args:
            sentence: input text.
            mode: ``"k-random"`` replaces up to ``round(len(words) * k)`` words,
                visiting positions in random order; ``"prob"`` replaces each
                word independently with probability ``prob``.
            k: fraction of words to replace in ``"k-random"`` mode.
            prob: per-word replacement probability in ``"prob"`` mode.

        Raises:
            ValueError: if ``mode`` is not one of the two supported values
                (previously this silently produced an empty string).
        """
        word_list = sentence.split()
        if mode == "prob":
            new_words = []
            for word in word_list:
                candidates = self.synoynms.get(word, [])
                if random() < prob and len(candidates) > 0:
                    new_words.append(sample(candidates, 1)[0])
                else:
                    new_words.append(word)
            return " ".join(new_words)
        if mode == "k-random":
            num = round(len(word_list) * k)
            indexes = list(range(len(word_list)))
            shuffle(indexes)
            new_words = word_list[:]
            for i in indexes:
                if num <= 0:
                    # Replacement budget used up; remaining positions stay as-is.
                    break
                candidates = self.synoynms.get(word_list[i], [])
                if len(candidates) > 0:
                    new_words[i] = sample(candidates, 1)[0]
                    num -= 1
            return " ".join(new_words)
        raise ValueError(f"unknown mode {mode!r}; expected 'prob' or 'k-random'")

    def create_new_sentence(self, sentence, flags,  shuffle_prob = 1, replace_k = 0.5, remove_k = 0.3):
        """Apply the enabled mutations to ``sentence`` in a fixed order.

        Args:
            sentence: input text.
            flags: indexable of three characters; a ``'1'`` at position 0 enables
                word removal, at position 1 synonym replacement, at position 2
                shuffling. Mutations run in that order.
            shuffle_prob: probability passed to ``shuffle_words``.
            replace_k: fraction passed to ``replace_words`` (k-random mode).
            remove_k: fraction passed to ``remove_words`` (k-random mode).
        """
        if flags[0] == '1':
            sentence = self.remove_words(sentence, k = remove_k)
        if flags[1] == '1':
            sentence = self.replace_words(sentence, k = replace_k)
        if flags[2] == '1':
            sentence = self.shuffle_words(sentence, prob=shuffle_prob)
        return sentence

    def create_new_dataset(self, dataset, flags):
        """Return a copy of ``dataset`` whose ``'tweet'`` column has been mutated.

        The input frame is not modified. The whole column is assigned at once;
        the earlier ``copy['tweet'].iloc[i] = ...`` form is chained assignment,
        which pandas does not guarantee will write back to the frame.
        """
        dataset_copy = dataset.copy()
        dataset_copy['tweet'] = [
            self.create_new_sentence(tweet, flags) for tweet in dataset['tweet']
        ]
        return dataset_copy

In [None]:
# Build the text mutator; its constructor reads the synonym JSON from a path under
# /content/drive.
# NOTE(review): no drive.mount(...) call is visible in this notebook — presumably Drive
# must be mounted before this cell runs; confirm.
mutator = TextMutant()

In [6]:
# Load the Task C test pairs and print how many rows were read.
dataset_test = pd.read_csv('/content/taskC.En.input.csv')
print(dataset_test.shape[0])

200


In [7]:
# First training source: keep only text and label, drop rows without text.
df = pd.read_csv('/content/cleaned_dataset.csv')[['tweet', 'sarcastic']].dropna(subset=['tweet'])

# Second training source, filtered the same way.
df2 = pd.read_csv('/content/dataset_m_110.csv')[['tweet', 'sarcastic']].dropna(subset=['tweet'])

# Stack both sources with a fresh 0..n-1 index, then shuffle the rows
# reproducibly (fixed random_state) and renumber again.
df = pd.concat([df, df2], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,tweet,sarcastic
0,chloe and toby just mind their business 🤣,0
1,football a ruckus and exactly the above-mentio...,0
2,according to what get hands on kittle allow fa...,0
3,i compass breadth from drop weight build,0
4,how can you big gun directed toward be without...,0
...,...,...
6926,auscultate dylan petit in fact preservationist...,1
6927,be convinced ive price mystery ted cruz consta...,0
6928,hir favored a certain literally hir say it ann...,0
6929,people who fire in belly cover ground limit de...,0


In [8]:
# Display the test frame; each row holds a pair of texts in columns text_0 and text_1.
dataset_test

Unnamed: 0,text_0,text_1
0,I see that your team played well today!,I'm sorry that your team didn't win yesterday.
1,"Anthony Taylor is such a fair referee, I wish ...",I hope Anthony Taylor is never put in charge o...
2,"the weather is gloomy, just raining and dull.",What a glorious weather today
3,People going out to get there boosters without...,Nice to see the sheep getting their boosters t...
4,"Really great weather we're having, love a bit ...",Really cold January so far - looking forward t...
...,...,...
195,"the tories betrayed the nation, what a surprise!","the tories betrayed the nation, as expected"
196,Cant believe we have to spend the rest of our ...,Cant wait to spend the rest of my life waiting...
197,Isn't it just amazing how competent the govern...,"Everything is a total mess, how can anyone be ..."
198,Thanks Boris Johnson for restricting travel ab...,The reasoning behind the tightening of travel ...


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['sarcastic'], test_size=0.1, random_state=42)

In [None]:
# mutated = mutator.create_new_dataset(pd.DataFrame(X_train), '111')

In [None]:
# X_train = pd.concat([X_train, mutated['tweet']], axis=0)
# y_train = pd.concat([y_train, y_train], axis=0)

In [9]:
# Training inputs/labels come from the full shuffled frame (no hold-out split here).
X_train, y_train = df['tweet'], df['sarcastic']
# The two candidate texts of every test pair are kept as separate series.
X_test0, X_test1 = dataset_test['text_0'], dataset_test['text_1']

In [10]:
# Sanity check: inputs and labels must have the same number of rows.
print(len(X_train), len(y_train), sep='\n')

6931
6931


In [11]:
# Convert every pandas Series into a plain Python list before tokenization.
X_train, X_test0, X_test1, y_train = (
    series.tolist() for series in (X_train, X_test0, X_test1, y_train)
)
# y_test = y_test.tolist()

In [12]:
# model
model_name = 'detecting-sarcasim'
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# NOTE(review): num_labels and loss_function_params are passed to the tokenizer here,
# not to the model or the Trainer, so they presumably do not affect the classifier
# head or the loss — confirm what was intended.
tokenizer = AutoTokenizer.from_pretrained(MODEL,num_labels=2, loss_function_params={"weight": [0.75, 0.25]})

# tokenize
# max_length is given explicitly: without it the recorded run warned that no maximum
# length is known for this checkpoint and fell back to *no* truncation, making
# truncation=True a no-op. 512 is the usual input limit for RoBERTa-base models.
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512, return_tensors = 'pt')

# change to dataset
train_dataset = SarcasimDataset(train_encodings, y_train)

# trainer args
training_args = TrainingArguments(
  output_dir='./res', num_train_epochs=5, per_device_train_batch_size=32, warmup_steps=500, weight_decay=0.01,logging_dir='./logs4'
  )


# model
# Loaded with the checkpoint's own classification head (no num_labels override).
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# train
# No eval_dataset is supplied, so compute_metrics is only used when the trainer
# is later asked to evaluate/predict on a dataset that carries labels.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    compute_metrics = compute_metrics,
  )

trainer.train()

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 6931
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1085


Step,Training Loss
500,0.5854
1000,0.1705


Saving model checkpoint to ./res/checkpoint-500
Configuration saved in ./res/checkpoint-500/config.json
Model weights saved in ./res/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./res/checkpoint-1000
Configuration saved in ./res/checkpoint-1000/config.json
Model weights saved in ./res/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1085, training_loss=0.3521255216290874, metrics={'train_runtime': 478.501, 'train_samples_per_second': 72.424, 'train_steps_per_second': 2.267, 'total_flos': 1406909069956530.0, 'train_loss': 0.3521255216290874, 'epoch': 5.0})

In [13]:
# Tokenize both sides of every test pair. max_length is explicit because the
# training run warned that this tokenizer has no predefined maximum length, in
# which case truncation=True alone does not truncate.
test0_encodings = tokenizer(X_test0, truncation=True, padding=True, max_length=512, return_tensors = 'pt')
test1_encodings = tokenizer(X_test1, truncation=True, padding=True, max_length=512, return_tensors = 'pt')

# SarcasimDataset requires labels; the test set has none, so constant placeholder
# labels are supplied. Their count follows the data instead of a hard-coded 200.
test_dataset0 = SarcasimDataset(test0_encodings, [1] * len(X_test0))
test_dataset1 = SarcasimDataset(test1_encodings, [1] * len(X_test1))

In [14]:
# Run the fine-tuned model over each side of the test pairs (text_0 first, then text_1).
preds0, preds1 = trainer.predict(test_dataset0), trainer.predict(test_dataset1)

***** Running Prediction *****
  Num examples = 200
  Batch size = 8


***** Running Prediction *****
  Num examples = 200
  Batch size = 8


In [15]:
# Keep only the raw prediction arrays from the two predict() results.
preds0, preds1 = preds0.predictions, preds1.predictions

In [19]:
# Restrict every row to its first two columns (the scores for labels 0 and 1).
preds0, preds1 = preds0[:, :2], preds1[:, :2]

In [22]:
# Inspect the two remaining scores for the first text_0 example.
preds0[0]

array([-0.27095747,  3.9821095 ], dtype=float32)

In [25]:
# For each test pair, compare the index-1 score (label value 1 of the `sarcastic`
# column used for training) of the two texts:
#   0 -> text_0 scores higher, 1 -> text_1 scores higher, 2 -> neither is higher.
res = [
    0 if row0[1] > row1[1] else (1 if row0[1] < row1[1] else 2)
    for row0, row1 in zip(preds0, preds1)
]

In [26]:
# Tally the decisions as [count of 0, count of 1, count of everything else].
f = [res.count(0), res.count(1)]
f.append(len(res) - sum(f))
print(f)

[109, 91, 0]


In [28]:
# Write one decision per line to the output file. The `with` block closes the
# file even if a write fails part-way through.
with open('/content/b.txt', 'w') as f:
    for val in res:
        f.write(str(val) + "\n")