In [1]:
import pandas as pd
import numpy as np
import torch
import demoji
import re
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report

from torch.utils.data import Dataset
import torch.nn.functional as F

from transformers import pipeline 
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import BertTokenizer,BertForSequenceClassification
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

### Text preprocessing before tokenization

In [2]:
# 0.54 -- presumably the sarcastic-class F1 obtained with this preprocessing (TODO confirm)
import string

def clean_tags_links(text):
    """Replace user mentions in ``text`` with the generic ``@user`` token.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than two characters becomes ``@user``; all other tokens
    (links included) are kept unchanged. Tokens are re-joined with single
    spaces, so the original spacing between tokens is preserved.
    """
    def _mask_mention(token):
        is_mention = token.startswith('@') and len(token) > 2
        return '@user' if is_mention else token

    return " ".join(_mask_mention(token) for token in text.split(" "))
def pre_process(text):
    """Normalise one raw text before it is tokenized.

    Steps, in order:
      1. literal backslash-n sequences and real newlines become spaces;
      2. emojis are rewritten by ``demoji.replace_with_desc``;
      3. every ``http://`` / ``https://`` URL becomes the ``:http:`` placeholder;
      4. each run of whitespace is collapsed to a single space.

    Returns the normalised string.
    """
    flattened = text.replace('\\n', ' ').replace('\n', ' ')
    described = demoji.replace_with_desc(flattened)
    without_links = re.sub(r'https?://[^\s]+', ':http:', described)
    return re.sub(r'\s+', r' ', without_links)
def clean_text_data(text_data):
    """Apply ``pre_process`` to every item of ``text_data`` in place.

    Best effort: an item whose preprocessing raises is reported on stdout and
    left unchanged, so one bad item (for example, a value that is not a
    string) does not abort the whole run.

    Args:
        text_data: mutable sequence of texts; it is modified in place.

    Returns:
        The same ``text_data`` object, now holding the cleaned texts.
    """
    for i, text in enumerate(text_data):
        try:
            text_data[i] = pre_process(text)
        # Catch Exception instead of using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        except Exception as exc:
            print(f"pre_process failed with {exc!r}; keeping item {i} unchanged: {text!r}")
    return text_data

### Loading the model

In [3]:
# Name/path handed to `from_pretrained`; presumably a local directory holding
# the fine-tuned checkpoint and its tokenizer files -- TODO confirm.
model_name = 'best_model'
# Both the tokenizer and the sequence-classification model are loaded from
# the same `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Load and evaluate the test dataset

In [4]:
# Read the test split from CSV: the `text` column provides the inputs and the
# `sarcastic` column the reference labels.
df_test = pd.read_csv('./test.En.csv')
test_text = df_test['text'].to_list()
test_labels = df_test['sarcastic'].to_list()

In [5]:

# Clean the raw texts, then tokenize all of them in a single call.
test_text = clean_text_data(test_text)
batch = tokenizer(
    test_text,
    padding=True,
    truncation=True,
    # With return_tensors="pt" the encodings come back as PyTorch tensors;
    # without it they are plain Python lists.
    return_tensors="pt",
)

In [6]:
# Inference only: gradients are not needed for the forward pass.
with torch.no_grad():
    # `**batch` unpacks the tokenizer output into keyword arguments (PyTorch).
    outputs = model(**batch)
    predictions = F.softmax(outputs.logits, dim=1)

# Predicted class index per example = position of the highest probability.
predicted_ids = torch.argmax(predictions, dim=1).tolist()

# Compare the predicted indices with the reference labels.
class_report = classification_report(test_labels, predicted_ids)
print(class_report)

# Translate class indices into the label names stored in the model config.
labels = [model.config.id2label[class_id] for class_id in predicted_ids]

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      1200
           1       0.48      0.60      0.54       200

    accuracy                           0.85      1400
   macro avg       0.71      0.75      0.72      1400
weighted avg       0.87      0.85      0.86      1400

