In [1]:
import string
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from collections import Counter
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import re

#### Preprocessing

In [2]:
def contextual_preprocess(tweet):
    """Reduce a raw tweet to Arabic letters, Arabic-Indic digits and single spaces.

    The substitutions run in order: ``$``-prefixed tokens, a leading ``RT``
    marker, URLs and Arabic diacritics (U+064B to U+0652) are deleted; every
    run of characters outside U+0621-U+064A / U+0660-U+0669 / space becomes a
    single space; finally whitespace runs are collapsed.

    Leading and trailing spaces are NOT stripped, so the result may start or
    end with one space.

    Args:
        tweet: the raw tweet text (``str``).

    Returns:
        The cleaned tweet text.
    """
    # (pattern, replacement) pairs; order matters because later rules assume
    # the earlier ones have already removed tickers, the RT prefix and URLs.
    substitutions = (
        (r'\$\w*', ''),                                   # "$" followed by word characters
        (r'^RT[\s]+', ''),                                # retweet marker at the very start
        (r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', ''),  # http(s) URLs
        (r'[\u064B-\u0652]', ''),                         # Arabic diacritic marks
        (r'[^\u0621-\u064A\u0660-\u0669 ]+', ' '),        # anything that is not Arabic letter/digit/space
        (r'\s+', ' '),                                    # collapse whitespace runs
    )

    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [3]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
def contextual_embeddings(tweets):
    """Embed each tweet with the frozen ``moha/arabert_c19`` encoder.

    The embedding is the final-layer hidden state of the first ([CLS]) token,
    not the model's pooled output. When this checkpoint is loaded,
    transformers reports ``bert.pooler.dense.weight`` / ``.bias`` as newly
    initialized, so the pooled output would go through a randomly initialized
    layer that is different on every load (and therefore different between
    the train and dev calls of this function).

    Args:
        tweets: iterable of tweet strings (typically already cleaned).

    Returns:
        A list with one CPU tensor per tweet, in input order. Each tensor
        keeps a leading batch dimension of 1. An empty input yields ``[]``
        without loading the model.
    """
    tweets = list(tweets)
    if not tweets:
        # Nothing to embed: avoid downloading/loading the checkpoint at all.
        return []

    base_model_name = 'moha/arabert_c19'
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    base_model = AutoModel.from_pretrained(base_model_name).to(device)
    base_model.eval()  # disable dropout so repeated runs give the same vectors

    embeddings = []
    # Inference only: no_grad makes sure no autograd state is kept around.
    with torch.no_grad():
        for tweet in tqdm(tweets):
            # One tweet per forward pass, so padding is unnecessary; padding
            # every tweet to 512 tokens only added masked positions to
            # compute. Truncation still caps the input at 512 tokens.
            # Encoding inside the loop also avoids holding every token tensor
            # in memory before inference starts.
            encoded = tokenizer(tweet,
                                truncation=True,
                                max_length=512,
                                return_tensors="pt")
            # With return_dict=False, element 0 is the per-token hidden states.
            sequence_output = base_model(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
                return_dict=False)[0]
            # Position 0 of the sequence is the [CLS] token.
            embeddings.append(sequence_output[:, 0].detach().cpu())

    return embeddings

#### Apply the Contextual Embeddings

In [6]:
# --- Path helpers -----------------------------------------------------------
def DIR(x):
    """Return the path of the cached contextual-feature array named ``x``."""
    return f'../../Dataset/SavedFeatures/CONTEXT/{x}.npy'


def D_DIR(x):
    """Return the path of the dataset-level ``.npy`` array named ``x``."""
    return f'../../Dataset/{x}.npy'


# Tags recording which preprocessing / feature approach this notebook uses.
Preprocessing = {"approach": "contextual"}
Features = {"approach": "contextual"}

# Target arrays (presumably the labels for two tasks; confirm against the
# dataset). Lower-case names are loaded from 'y1'/'y2', upper-case from the
# '*_test' files.
# NOTE(review): allow_pickle=True can execute arbitrary code when loading an
# untrusted .npy file; only use it on files produced by this project.
y1, y2 = (np.load(D_DIR(name), allow_pickle=True) for name in ('y1', 'y2'))
Y1, Y2 = (np.load(D_DIR(name), allow_pickle=True) for name in ('Y1_test', 'Y2_test'))

# Flip to True to reuse feature arrays cached on disk instead of recomputing.
saved = False
if not saved:
    # Recompute: read the raw tweets, clean them, then embed them.
    train_frame = pd.read_csv('../../Dataset/train.csv')
    dev_frame = pd.read_csv('../../Dataset/dev.csv')
    train_tweets = [contextual_preprocess(tweet) for tweet in train_frame['text']]
    dev_tweets = [contextual_preprocess(tweet) for tweet in dev_frame['text']]
    x = contextual_embeddings(train_tweets)
    X = contextual_embeddings(dev_tweets)
else:
    x = np.load(DIR('x'), allow_pickle=True)
    X = np.load(DIR('X_test'), allow_pickle=True)
   

Downloading tokenizer_config.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/743k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/516M [00:00<?, ?B/s]

Some weights of the model checkpoint at moha/arabert_c19 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at moha/arabert_c19 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You shoul

KeyboardInterrupt: 