In [2]:
import string
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from collections import Counter
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import re
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def contextual_preprocess(tweet):
    """Reduce a raw tweet to Arabic letters, Arabic-Indic digits and single spaces.

    The substitutions below are applied strictly in order; each one operates
    on the output of the previous one.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The cleaned text. Whitespace runs are collapsed to one space, but a
        single leading or trailing space is NOT trimmed.
    """
    substitutions = (
        # '$'-prefixed tokens such as stock tickers.
        (r'\$\w*', ''),
        # Retweet marker at the very start of the text.
        (r'^RT[\s]+', ''),
        # http/https URLs.
        (r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', ''),
        # Arabic diacritic marks in U+064B..U+0652 are dropped outright.
        (r'[\u064B-\u0652]', ''),
        # Any run of characters outside U+0621..U+064A, U+0660..U+0669 and
        # the plain space becomes a single space.
        (r'[^\u0621-\u064A\u0660-\u0669 ]+', ' '),
        # Collapse whatever whitespace runs are left.
        (r'\s+', ' '),
    )

    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [4]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
def contextual_embeddings(tweets):
  """Embed each tweet with the frozen 'moha/arabert_c19' encoder.

  Every tweet is tokenized on its own (padded/truncated to 512 tokens) and
  passed through the model as a batch of one. The second element of the
  model's tuple output is kept for each tweet.
  NOTE(review): for BERT-style models that element is the pooled output of
  shape (1, hidden_size) -- confirm for this checkpoint.

  Args:
    tweets: Iterable of already-preprocessed tweet strings.

  Returns:
    A list with one CPU tensor per tweet, in input order. An empty input
    yields an empty list without loading the tokenizer or the model.
  """
  tweets = list(tweets)
  if not tweets:
    # Nothing to embed: avoid downloading/loading the model for no work.
    return []

  base_model_name = 'moha/arabert_c19'
  tokenizer = AutoTokenizer.from_pretrained(base_model_name)
  base_model = AutoModel.from_pretrained(base_model_name).to(device)
  base_model.eval()  # disable dropout so embeddings are deterministic
  for param in base_model.parameters():
    param.requires_grad = False

  embeddings = []
  # no_grad (rather than inference_mode) keeps the results ordinary tensors,
  # so they can still be fed to a trainable module later on.
  with torch.no_grad():
    for tweet in tqdm(tweets):
      # Tokenize lazily, one tweet at a time, instead of materializing the
      # padded 512-token tensors for the whole dataset up front.
      encoded = tokenizer(tweet,
                          padding='max_length',
                          max_length=512,
                          truncation=True,
                          return_tensors="pt")
      outputs = base_model(input_ids=encoded['input_ids'].to(device),
                           attention_mask=encoded['attention_mask'].to(device),
                           return_dict=False)
      # Move the result off the accelerator right away to keep its memory flat.
      embeddings.append(outputs[1].detach().cpu())

  return embeddings

In [6]:
# Read the training and development splits from the shared dataset directory.
train_data, valid_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../../Dataset/train.csv', '../../Dataset/dev.csv')
)

In [None]:
# Clean the raw text of both splits with contextual_preprocess.
processed_train_tweets = list(map(contextual_preprocess, train_data['text']))
processed_valid_tweets = list(map(contextual_preprocess, valid_data['text']))

# Encode the cleaned tweets; each call yields one embedding per tweet.
train_contextual_embeddings = contextual_embeddings(processed_train_tweets)
valid_contextual_embeddings = contextual_embeddings(processed_valid_tweets)

In [None]:
# Integer-encode the 'category' column. Ids follow the order in which each
# category first appears in the training split; a category that occurs only
# in the validation split raises KeyError.
labels_list_2 = train_data['category'].unique()
labels_2 = dict(zip(labels_list_2, range(len(labels_list_2))))
train_labels_2 = list(map(labels_2.__getitem__, train_data['category']))
valid_labels_2 = list(map(labels_2.__getitem__, valid_data['category']))

In [None]:
# Integer-encode the 'stance' column. Ids follow the order in which each
# stance first appears in the training split; a stance that occurs only in
# the validation split raises KeyError.
labels_list_1 = train_data['stance'].unique()
labels_1 = dict(zip(labels_list_1, range(len(labels_list_1))))
train_labels_1 = list(map(labels_1.__getitem__, train_data['stance']))
valid_labels_1 = list(map(labels_1.__getitem__, valid_data['stance']))