In [None]:
# Install required packages

!pip install transformers
# !pip install datasets
# !pip install fairseq
!pip install sentencepiece



In [None]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.utils import shuffle

from numpy.lib.function_base import average

from tqdm.notebook import tqdm

from collections import Counter

import os
import re
import json
import copy
import collections
import time
import pickle

from transformers import BertConfig, BertTokenizer, BertweetTokenizer, RobertaTokenizer, AlbertTokenizer, DistilBertTokenizer, XLMRobertaTokenizer, XLNetTokenizer, T5Tokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification, XLMRobertaForSequenceClassification, XLNetForSequenceClassification, T5Model
from transformers import TrainingArguments
from transformers import Trainer
# from fairseq.models.roberta import XLMRModel

from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; every dataset/result path used by the
# later cells lives under /content/drive/MyDrive/.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class SarcasimDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name to a per-example indexable container
            (a list of rows or a 2-D tensor); ``encodings[key][idx]`` is the row
            for example ``idx``.
        labels: Sequence holding one label per example, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as an independent tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call, so tensors are copied with ``clone().detach()`` instead;
        anything else (lists, ints, ...) goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples (one label per example)."""
        return len(self.labels)
    
## Test Dataset
class SarcasimTestDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings only (no labels).

    Args:
        encodings: Mapping from field name to a per-example indexable container
            (a list of rows or a 2-D tensor).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field for example ``idx``."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            # torch.tensor(tensor) warns on every call; copy tensors explicitly.
            item[key] = row.clone().detach() if isinstance(row, torch.Tensor) else torch.tensor(row)
        return item

    def __len__(self):
        """Number of examples.

        ``len(self.encodings)`` would count the fields (the mapping's keys), not
        the examples, so the length of the first field is used instead.
        """
        for val in self.encodings.values():
            return len(val)
        return 0

In [None]:
def compute_metrics(p):
    """Score a (predictions, labels) pair.

    ``p`` unpacks into per-class scores and the gold labels; the predicted class
    is the arg-max over axis 1. Returns a dict with accuracy and the
    'weighted'-averaged F1, recall and precision.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=predicted)
    # Same evaluation order as before: recall, precision, then F1.
    weighted = {
        name: scorer(y_true=labels, y_pred=predicted, average='weighted')
        for name, scorer in (
            ("recall", recall_score),
            ("precision", precision_score),
            ("f1_score", f1_score),
        )
    }

    return {
        "accuracy": accuracy,
        "f1_score": weighted["f1_score"],
        "recall": weighted["recall"],
        'precision': weighted["precision"],
    }

In [None]:
from random import sample, random, shuffle
from math import ceil
import json

class TextMutant():
    """Word-level text augmentation: random removal, synonym replacement, shuffling.

    Args:
        synm_filepath: Path to a JSON file mapping a word to a list of synonyms.
            Only read when ``synonyms`` is not given.
        synonyms: Optional in-memory ``word -> list of synonyms`` mapping to use
            instead of loading ``synm_filepath``.
    """

    def __init__(self, synm_filepath='/content/drive/MyDrive/DataLab/sarcasm/crawler/synm3.json', synonyms=None):
        if synonyms is None:
            with open(synm_filepath) as json_file:
                synonyms = json.load(json_file)
        # Attribute name (spelling included) is kept so existing users of it keep working.
        self.synoynms = synonyms

    def remove_words(self, sentence, mode = "k-random", k = 0.1, prob = 0.1):
        """Return ``sentence`` with some of its whitespace-separated words removed.

        Args:
            sentence: Input text; it is split on whitespace and re-joined with
                single spaces.
            mode: ``"prob"`` drops each word independently with probability
                ``prob``; ``"k-random"`` drops exactly ``round(len(words) * k)``
                randomly chosen words.
            k: Fraction of words to drop in ``"k-random"`` mode (the resulting
                count is clamped to ``[0, len(words)]``).
            prob: Per-word drop probability in ``"prob"`` mode.

        Raises:
            ValueError: If ``mode`` is not one of the two supported values
                (previously this silently returned an empty string).
        """
        word_list = sentence.split()
        if mode == "prob":
            new_words = [word for word in word_list if not random() < prob]
        elif mode == "k-random":
            num = min(max(round(len(word_list) * k), 0), len(word_list))
            # A set makes the membership test below O(1) per word.
            dropped = set(sample(range(len(word_list)), num))
            new_words = [word for i, word in enumerate(word_list) if i not in dropped]
        else:
            raise ValueError("unknown mode %r; expected 'prob' or 'k-random'" % (mode,))
        return " ".join(new_words)

    def shuffle_words(self, sentence, prob = 0.1):
        """With probability ``prob``, return ``sentence`` with its words in random order.

        Otherwise the words are returned in their original order (re-joined with
        single spaces).
        """
        word_list = sentence.split()
        if random() < prob:
            # sample(seq, len(seq)) yields a random permutation. It is used instead
            # of the global name `shuffle`, which this notebook imports from both
            # sklearn.utils and random; only the latter shuffles in place.
            word_list = sample(word_list, len(word_list))
        return " ".join(word_list)

    def replace_words(self, sentence, mode = "k-random", k = 0.1, prob = 0.1):
        """Return ``sentence`` with some words swapped for a random synonym.

        Words without an entry (or with an empty entry) in the synonym table are
        never replaced. Lookups do not modify the synonym table.

        Args:
            sentence: Input text; split on whitespace, re-joined with single spaces.
            mode: ``"prob"`` replaces each word that has synonyms with probability
                ``prob``; ``"k-random"`` replaces up to ``round(len(words) * k)``
                words, visiting the words in random order.
            k: Fraction of words to replace in ``"k-random"`` mode.
            prob: Per-word replacement probability in ``"prob"`` mode.

        Raises:
            ValueError: If ``mode`` is not one of the two supported values
                (previously this silently returned an empty string).
        """
        word_list = sentence.split()
        if mode == "prob":
            new_words = []
            for word in word_list:
                candidates = self.synoynms.get(word, [])
                if random() < prob and len(candidates) > 0:
                    new_words.append(sample(candidates, 1)[0])
                else:
                    new_words.append(word)
        elif mode == "k-random":
            num = round(len(word_list) * k)
            new_words = word_list[:]
            # Visit positions in random order so the replaced words are spread out.
            for i in sample(range(len(word_list)), len(word_list)):
                if num <= 0:
                    break
                candidates = self.synoynms.get(word_list[i], [])
                if len(candidates) > 0:
                    new_words[i] = sample(candidates, 1)[0]
                    num -= 1
        else:
            raise ValueError("unknown mode %r; expected 'prob' or 'k-random'" % (mode,))
        return " ".join(new_words)

    def create_new_sentence(self, sentence, flags,  shuffle_prob = 1, replace_k = 0.5, remove_k = 0.3):
        """Apply the augmentations selected by ``flags`` to ``sentence``.

        ``flags`` is indexed at positions 0, 1 and 2; a ``'1'`` at a position
        enables, in this order, word removal (``remove_k``), synonym replacement
        (``replace_k``) and word shuffling (``shuffle_prob``).
        """
        if flags[0] == '1':
            sentence = self.remove_words(sentence, k = remove_k)
        if flags[1] == '1':
            sentence = self.replace_words(sentence, k = replace_k)
        if flags[2] == '1':
            sentence = self.shuffle_words(sentence, prob=shuffle_prob)
        return sentence

    def create_new_dataset(self, dataset, flags):
        """Return a copy of ``dataset`` whose ``'tweet'`` column has been augmented.

        The input frame is left untouched. The column is replaced as a whole;
        the previous element-wise ``df['tweet'].iloc[i] = ...`` was a chained
        assignment, which pandas does not guarantee to write through.
        """
        dataset_copy = dataset.copy()
        dataset_copy['tweet'] = dataset_copy['tweet'].map(
            lambda sentence: self.create_new_sentence(sentence, flags)
        )
        return dataset_copy

In [None]:
# Build the text augmenter; constructing it with no arguments loads the synonym JSON from Drive.
# NOTE(review): `mutator` is not referenced by any other cell visible in this notebook.
mutator = TextMutant()

In [None]:
# Test split: later cells read its 'text' and 'sarcastic' columns.
dataset_test = pd.read_csv(
    '/content/drive/MyDrive/DataLab/sarcasm/FinalDataset/task_A_En_test.csv'
)
print(dataset_test.shape[0])

1400


In [None]:
# Training data comes from two CSVs; each is reduced to the 'tweet' and
# 'sarcastic' columns and rows without a tweet are dropped.
df = (
    pd.read_csv('/content/drive/MyDrive/DataLab/sarcasm/FinalDataset/train.En.csv')[['tweet', 'sarcastic']]
    .dropna(subset=['tweet'])
)
df2 = (
    pd.read_csv('/content/drive/MyDrive/DataLab/sarcasm/FinalDataset/data5.csv')[['tweet', 'sarcastic']]
    .dropna(subset=['tweet'])
)

# Stack both sources with a fresh 0..n-1 index, then shuffle reproducibly
# (random_state=42) and renumber the rows again.
df = pd.concat([df, df2], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,tweet,sarcastic
0,was It the,1
1,Their website doesn't seem to mention Axa nor ...,0
2,Imagine not liking football 🤣,1
3,just class Match is play to,0
4,guys one of want know I those You,0
...,...,...
6929,He basically @DanielIngolfur communist. was I ...,1
6930,"a while, multiple in that still once Tatiana #...",0
6931,one! #podcasts next forward fun. Was the,0
6932,at to have taste window think outside the it f...,0


In [None]:
# Inputs are the tweet strings, targets the 'sarcastic' column.
# The test CSV names its text column 'text', while the training frame uses 'tweet'.
X_train, y_train = df['tweet'], df['sarcastic']
X_test, y_test = dataset_test['text'], dataset_test['sarcastic']

In [None]:
# Sanity check: inputs and labels must have the same number of rows.
print(len(X_train), len(y_train), sep="\n")

6934
6934


In [None]:
# Convert the pandas Series to plain Python lists for the tokenizer / dataset classes.
# Values that are already lists are left as they are, so this cell can be re-run
# (an unconditional `.tolist()` fails on the second run because lists have no such method).
X_train, X_test, y_train, y_test = (
    values if isinstance(values, list) else values.tolist()
    for values in (X_train, X_test, y_train, y_test)
)

In [None]:
# --- Configuration ---
model_name = 'detecting-sarcasim'
task = 'sentiment'
MODEL = 'xlnet-base-cased'  # pretrained checkpoint to fine-tune

# --- Tokenizer ---
# Only the checkpoint name is passed here: `num_labels` is a model option (set on
# the model below), and a `loss_function_params` keyword is not something the
# tokenizer acts on.
# NOTE(review): a class-weighted loss (e.g. weights [0.75, 0.25]) is NOT applied by
# this cell; doing so would require a Trainer subclass that overrides compute_loss.
tokenizer = XLNetTokenizer.from_pretrained(MODEL)

# Tokenize all training tweets at once, padded to the longest sequence and
# truncated to the model's maximum length.
train_encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')

# Wrap encodings + labels so the Trainer can index them.
train_dataset = SarcasimDataset(train_encodings, y_train)

# --- Training arguments ---
training_args = TrainingArguments(
    output_dir='./res',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs4',
)

# --- Model: binary classifier head on top of XLNet ---
model = XLNetForSequenceClassification.from_pretrained(MODEL, num_labels=2)

# --- Train ---
# No eval_dataset is supplied, so compute_metrics is not exercised during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

loading file https://huggingface.co/xlnet-base-cased/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/df73bc9f8d13bf2ea4dab95624895e45a550a0f0a825e41fc25440bf367ee3c8.d93497120e3a865e2970f26abdf7bf375896f97fde8b874b70909592a6c785c9
loading file https://huggingface.co/xlnet-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/xlnet-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/xlnet-base-cased/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/xlnet-base-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/46f47734f3dcaef7e236b9a3e887f27814e18836a8db7e6a49148000058a1a54.2a683f915238b4f560dab0c724066cf0a7de9a851e96b0fb3a1e7f0881552f53
loading configuration file https://huggingface.co/xlnet-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/06bdb0f5882db

Step,Training Loss
500,0.5804
1000,0.3453


Saving model checkpoint to ./res/checkpoint-500
Configuration saved in ./res/checkpoint-500/config.json
Model weights saved in ./res/checkpoint-500/pytorch_model.bin
  import sys
Saving model checkpoint to ./res/checkpoint-1000
Configuration saved in ./res/checkpoint-1000/config.json
Model weights saved in ./res/checkpoint-1000/pytorch_model.bin
  import sys


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1085, training_loss=0.43560445231776085, metrics={'train_runtime': 2054.341, 'train_samples_per_second': 16.876, 'train_steps_per_second': 0.528, 'total_flos': 2662106059190160.0, 'train_loss': 0.43560445231776085, 'epoch': 5.0})

In [None]:
# Tokenize the test texts with the same tokenizer and options used for the training
# texts, then pair the encodings with the test labels for prediction.
test_encodings = tokenizer(X_test, truncation=True, padding=True,return_tensors = 'pt')
test_dataset = SarcasimDataset(test_encodings, y_test)

In [None]:
# Run the fine-tuned model over the test dataset; the per-class scores are read
# from the result's `.predictions` attribute in the following cells.
preds = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1400
  Batch size = 8
  import sys


In [None]:
# Inspect the raw per-class scores (one row per test example, one column per class).
preds.predictions

array([[ 0.9014605 , -0.5506527 ],
       [ 3.1959937 , -4.0229497 ],
       [ 1.73607   , -1.318895  ],
       ...,
       [ 3.1608338 , -3.446867  ],
       [-0.01974405,  0.5739856 ],
       [ 1.7562745 , -1.2952605 ]], dtype=float32)

In [None]:
# Turn per-class scores into hard labels (arg-max over the first two columns).
# This rebinds `preds` from the predict output to a 1-D label array, so the cell
# cannot be re-run without re-running the predict cell first.
preds = np.argmax(preds.predictions[:, 0:2], axis=-1)

# Tally the predicted labels: [count of 0, count of 1, count of anything else].
f = [
    int(np.sum(preds == 0)),
    int(np.sum(preds == 1)),
    int(np.sum((preds != 0) & (preds != 1))),
]
print(f)

[1052, 348, 0]


In [None]:
# Write one predicted label per line. The context manager closes the file even if
# a write fails part-way through.
with open('/content/drive/MyDrive/DataLab/sarcasm/FinalDataset/result/xlnet-rs-data5', 'w') as f:
    f.writelines(str(pred) + "\n" for pred in preds)




In [None]:
from sklearn.metrics import f1_score

# f = open('/content/drive/MyDrive/DataLab/sarcasm/FinalDataset/result/data4_res', 'r')
# content = f.read()
# predict_label = content.split("\n")
# Gold labels for the test split, re-read from the same CSV that `dataset_test` was loaded from.
actual_label = pd.read_csv('/content/drive/MyDrive/DataLab/sarcasm/FinalDataset/task_A_En_test.csv')['sarcastic'].tolist()

In [None]:
# F1 of the predicted labels against the gold labels. No `average` is passed, so
# sklearn's default applies (binary F1 for the positive label 1) - unlike
# compute_metrics, which requests average='weighted'.
f1_score(actual_label,preds)

0.26277372262773724