In [None]:
!pip install transformers
import os
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from google.colab import drive
import textwrap
import progressbar
import keras
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import json

In [None]:
# Mount Google Drive so that the dataset, checkpoint and output paths under
# /content/Drive/ used by the cells below can be resolved.
drive.mount("/content/Drive/")

In [None]:
# Read the full dataset once, then carve it into the train / test / dev frames
# according to the value stored in its `split` column.
df = pd.read_csv('/content/Drive/My Drive/LNLP/dataset.csv') # path to multi dataset
train_set, test_set, validation_set = (
    df.query(split_filter)
    for split_filter in (" split=='train' ", " split=='test' ", " split=='dev' ")
)

In [None]:
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

# Registry mapping a short model-family key to its
# (sequence-classification model class, tokenizer class, config class) triple.
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)}

# Pick the model family and unpack its classes from the registry.
# NOTE(review): the model-loading cell below instantiates XLNetTokenizer and
# XLNetForSequenceClassification directly, so changing model_type here does
# not by itself change which tokenizer/model gets loaded.
model_type = 'xlnet' ###--> CHANGE WHAT MODEL YOU WANT HERE!!! <--###
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]
model_name = 'xlnet-base-cased'

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Directory on Drive from which both the tokenizer and the model are loaded.
output_dir = "/content/Drive/My Drive/mini_XLNet/"
tokenizer = XLNetTokenizer.from_pretrained(output_dir)
# output_hidden_states=True is required: get_output_for_one_vec builds its
# embedding vector from the per-layer hidden states returned by the model.
model = XLNetForSequenceClassification.from_pretrained(output_dir, output_hidden_states=True)
# Move the weights to the selected device (as the last expression of the cell
# this also displays the model).
model.to(device)

In [None]:
def att_masking(input_ids):
  """Build attention masks for a batch of token-id sequences.

  Args:
    input_ids: iterable of sequences of integer token ids.

  Returns:
    A list containing one list per input sequence, with 1 at every position
    whose id is greater than 0 and 0 elsewhere (0 is the value that
    grouped_input_ids pads with).
  """
  return [[int(token_id > 0) for token_id in sent] for sent in input_ids]

def grouped_input_ids(all_toks, chunk_len=510, stride=410, max_len=512):
  """Split a token list into overlapping windows and encode each window.

  Every window holds at most `chunk_len` tokens and starts `stride` tokens
  after the previous one, so consecutive windows overlap by
  `chunk_len - stride` tokens. The separator and classification tokens of the
  module-level `tokenizer` are appended (in that order) to each window before
  it is converted to ids, then every window is left-padded to `max_len`.

  Args:
    all_toks: list of token strings for one document.
    chunk_len: maximum number of document tokens per window (default 510).
    stride: offset between the starts of consecutive windows (default 410).
    max_len: padded length of every encoded window (default 512).

  Returns:
    A tuple `(e_sents, att_masks)`: `e_sents` is the array produced by
    `pad_sequences` (one row of `max_len` ids per window) and `att_masks` is
    the matching list of 0/1 masks from `att_masking`.

  Raises:
    ValueError: if `stride` is not positive or a window plus its two special
      tokens cannot fit in `max_len`.
  """
  if stride <= 0:
    raise ValueError("stride must be a positive integer")
  if chunk_len + 2 > max_len:
    raise ValueError("chunk_len + 2 special tokens must fit within max_len")

  # Slicing clamps at the end of the list, so the last windows may be shorter
  # than chunk_len.
  windows = [all_toks[start:start + chunk_len]
             for start in range(0, len(all_toks), stride)]

  # The classification token goes last, which is the position that
  # get_output_for_one_vec reads its vector from.
  sep_tok = tokenizer.sep_token
  cls_tok = tokenizer.cls_token
  e_sents = [tokenizer.convert_tokens_to_ids(window + [sep_tok, cls_tok])
             for window in windows]

  # Left-pad with id 0; att_masking then flags every id > 0 as a real token.
  # NOTE(review): this also masks any genuine token whose vocabulary id is 0 -
  # confirm that 0 is an acceptable padding id for the tokenizer in use.
  e_sents = pad_sequences(e_sents, maxlen=max_len, value=0, dtype="long", padding="pre")
  att_masks = att_masking(e_sents)
  return e_sents, att_masks

def get_output_for_one_vec(input_id, att_mask):
  """Embed one encoded window with the module-level `model`.

  The window is run through the model in eval mode without gradients, and the
  hidden state at the last sequence position (where grouped_input_ids places
  the classification token) is taken from hidden-state entries 12, 11, 10 and
  9 and concatenated in that order.

  Args:
    input_id: one sequence of token ids (a row returned by grouped_input_ids).
    att_mask: the 0/1 attention mask for that sequence.

  Returns:
    A 1-D numpy array holding the four concatenated hidden-state vectors.
  """
  # Add a batch dimension of size 1 and move the inputs to the model's device.
  input_ids = torch.tensor(input_id).unsqueeze(0).to(device)
  att_masks = torch.tensor(att_mask).unsqueeze(0).to(device)

  model.eval()
  with torch.no_grad():
    outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

  # Do not tuple-unpack the result: recent transformers releases return a
  # ModelOutput (iterating it yields field names, not tensors) and may include
  # extra entries such as `mems`. Read the hidden states by name and fall back
  # to the last tuple element for releases that return plain tuples.
  encoded_layers = getattr(outputs, "hidden_states", None)
  if encoded_layers is None:
    encoded_layers = outputs[-1]

  # [layer][0][-1]: given layer, the single batch item, last sequence position.
  vec = torch.cat([encoded_layers[layer][0][-1] for layer in (12, 11, 10, 9)], dim=0)
  return vec.detach().cpu().numpy()

def generate_np_files_for_emb(dataf, tokenizer, max_tokens=10000):
  """Compute per-window embedding vectors for every document in `dataf`.

  Each text is tokenized, truncated to its trailing `max_tokens` tokens,
  split into windows by grouped_input_ids and every window is embedded with
  get_output_for_one_vec.

  Args:
    dataf: DataFrame with a 'text' column.
    tokenizer: tokenizer used to tokenize the text. grouped_input_ids converts
      tokens to ids with the module-level `tokenizer`, so the two should be
      the same object.
    max_tokens: keep only the last `max_tokens` tokens of longer documents
      (default 10000).

  Returns:
    A numpy array with one entry per document. When every document yields the
    same number of windows this is a regular stacked array; otherwise it is a
    1-D object array whose elements are the per-document arrays.
  """
  all_docs = []
  texts = dataf['text']
  for i in progressbar.progressbar(range(len(texts))):
    toks = tokenizer.tokenize(texts.iloc[i], add_prefix_space=True)
    # Keep the tail of very long documents.
    if len(toks) > max_tokens:
      toks = toks[len(toks) - max_tokens:]

    splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)

    vecs = [get_output_for_one_vec(ids, mask)
            for ids, mask in zip(splitted_input_ids, splitted_att_masks)]
    all_docs.append(np.asarray(vecs))

  # Documents usually produce different numbers of windows. np.asarray on such
  # a ragged list raises ValueError on NumPy >= 1.24, so build the object
  # array explicitly; identical shapes still stack into a regular array.
  if len({doc.shape for doc in all_docs}) <= 1:
    return np.asarray(all_docs)

  ragged_docs = np.empty(len(all_docs), dtype=object)
  for doc_index, doc in enumerate(all_docs):
    ragged_docs[doc_index] = doc
  return ragged_docs

In [None]:
# Embed every document of the dev split and store the result on Drive.
vecs_dev = generate_np_files_for_emb(validation_set, tokenizer)
np.save("/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_dev.npy", vecs_dev)

In [None]:
# Cut the training frame into three consecutive row ranges so that each part
# can be embedded and saved by its own cell.
train_set_1, train_set_2, train_set_3 = (
    train_set.iloc[:10000],
    train_set.iloc[10000:20000],
    train_set.iloc[20000:],
)

In [None]:
# Embed every document of the test split and store the result on Drive.
vecs_test = generate_np_files_for_emb(test_set, tokenizer)
np.save("/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_test.npy", vecs_test)

In [None]:
# Embed the first training part (train_set_1) and store the result on Drive.
vecs_train_1 = generate_np_files_for_emb(train_set_1, tokenizer)
np.save("/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_train_1.npy", vecs_train_1)

In [None]:
# Embed the second training part (train_set_2) and store the result on Drive.
vecs_train_2 = generate_np_files_for_emb(train_set_2, tokenizer)
np.save("/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_train_2.npy", vecs_train_2)

In [None]:
# Embed the third training part (train_set_3) and store the result on Drive.
vecs_train_3 = generate_np_files_for_emb(train_set_3, tokenizer)
np.save("/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_train_3.npy", vecs_train_3)