# Load imports

In [None]:
# Install the Hugging Face packages imported in the next cell (shell command; versions are not pinned).
!pip install pytorch-transformers transformers

In [None]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from pytorch_transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

# Ask TensorFlow for the GPU device name and stop the notebook early when the
# expected first GPU is not reported.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model, data and result paths used
# in this notebook all live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Displayed as the cell output. Asking for the name of GPU 0 raises when CUDA is
# unavailable, which would defeat the CPU fallback above, so guard the query.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla P100-PCIE-16GB'

# Load fine-tuned model & Data

In [None]:
# The tokenizer uses the public 'xlnet-base-cased' vocabulary; the classifier
# weights are loaded from a checkpoint path on the mounted Drive.
# NOTE(review): do_lower_case=True is passed although the vocabulary is the cased
# one - presumably this mirrors the fine-tuning setup; confirm before changing.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
model_new = XLNetForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/config/XLnet_w_142000.h5")

In [None]:
import numpy as np
import pandas as pd

# Tab-separated file without a header row, converted to a NumPy array.
# Column 0 holds the id that the ranking is grouped by.
data = pd.read_csv("/content/drive/MyDrive/all_dev3.csv", sep="\t", header=None).to_numpy()
unqiue_ids = np.unique(data[:, 0])

In [None]:
# Check which queries are already ranked so that only the remaining ones are processed.
from pathlib import Path
pathlist = Path("/content/drive/MyDrive/Result_XLNET_dev_142000").rglob('*.csv')
# The id is the part of the file name before the first dot. Read it from the
# file name itself rather than from a fixed component index of the absolute
# path, which silently picks a directory name once the folder depth changes
# (rglob also descends into sub-folders).
done_list = [int(path.name.split(".")[0]) for path in pathlist]
a = set(unqiue_ids)
b = set(done_list)
# Remaining ids in ascending order.
unqiue_ids = sorted(a - b)
print(unqiue_ids)
print(len(unqiue_ids))

[2, 1215, 1288, 1576, 2235, 2798, 2962, 4696, 4947, 5925, 6217, 6791, 7968, 8701, 8714, 8798, 8854, 9083, 9454, 9926, 10157, 10205, 10264, 10276, 10312, 11006, 11050, 11133, 11913, 12741, 12903, 13397, 14151, 14947, 14963, 15039, 15063, 15382, 15441, 15607, 16559, 16860, 17110, 17430, 17586, 17635, 17848, 17884, 18101, 18759, 18840, 19457, 19552, 19940, 20356, 20432, 20520, 20597, 20671, 21185, 21603, 21741, 21765, 21793, 21861, 21948, 22231, 22479, 22670, 22882, 23223, 23285, 24115, 24441, 24807, 24979, 25025, 25036, 25294, 25344, 25534, 25603, 26079, 26207, 26334, 26485, 26664, 26847, 27618, 27743, 27932, 28216, 28352, 28442, 29089, 29097, 29169, 29416, 29612, 29921, 30039, 30188, 30860, 30956, 31222, 31432, 31595, 32176, 32642, 34015, 34039, 35996, 36025, 36133, 36214, 36388, 36473, 36703, 36965, 37685, 37952, 38098, 38608, 38946, 39360, 39577, 39908, 40056, 40337, 41969, 42361, 42555, 42568, 43649, 43781, 44072, 44686, 45125, 45590, 45757, 45924, 46040, 46081, 46095, 46579, 46711, 

# Create Input

In [None]:
SEQ_LEN = 200
def change_data_pred(df, max_len=SEQ_LEN, batch_size=1):
  """Turn a query/passage DataFrame into a sequential DataLoader for scoring.

  Every row becomes one example: the query tokens followed by the passage
  tokens, truncated or zero-padded to exactly ``max_len`` positions.

  Args:
    df: DataFrame with ``query`` and ``passage`` columns (read through
      ``load_data_pred``).
    max_len: Fixed length of each id, segment and attention sequence.
    batch_size: Number of examples per batch. Callers that index one
      prediction per row rely on the default of 1.

  Returns:
    A ``DataLoader`` that yields ``(input_ids, segment_ids, attention_mask)``
    tensors in the original row order.
  """
  queries, passages = load_data_pred(df)

  tokenized_queries = [tokenizer.tokenize(text) for text in queries]
  tokenized_passages = [tokenizer.tokenize(text) for text in passages]

  # Create segment ids: 0 for query tokens, 1 for passage tokens, 0 for padding.
  # The lengths are taken from the current pair (not from the number of rows),
  # and the result is cut / padded to max_len so that all rows have the same
  # length and line up with the padded input ids.
  # NOTE(review): the earlier condition compared the length limit with the row
  # count, so inputs with 100+ rows had padding labelled as segment 1. If the
  # checkpoint was fine-tuned with that preprocessing, confirm scores are unaffected.
  segment_masks = []
  tokenized_texts = []
  for query_tokens, passage_tokens in zip(tokenized_queries, tokenized_passages):
    segments = [0] * len(query_tokens) + [1] * len(passage_tokens)
    segments = segments[:max_len]
    segments += [0] * (max_len - len(segments))
    segment_masks.append(segments)
    tokenized_texts.append(query_tokens + passage_tokens)

  input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
  input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

  # Create attention masks: 1.0 for every id greater than 0, 0.0 otherwise
  # (0 is the value pad_sequences pads with by default).
  attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

  prediction_inputs = torch.tensor(input_ids)
  prediction_masks = torch.tensor(attention_masks)
  prediction_segments = torch.tensor(segment_masks)
  prediction_data = TensorDataset(prediction_inputs, prediction_segments, prediction_masks)
  # Sequential sampling keeps predictions aligned with the DataFrame rows.
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
  return prediction_dataloader

In [None]:
def load_data_pred(pandas_dataframe):
  """Build the query and passage strings that are handed to the tokenizer.

  Args:
    pandas_dataframe: DataFrame with ``query`` and ``passage`` columns. It is
      only read; the string conversion happens on derived Series, so the
      caller's frame keeps its original values and dtypes.

  Returns:
    ``(queries, passages)``: two lists in row order. Every query has
    ``" [SEP] "`` appended and every passage has ``" [SEP] [CLS]"`` appended.
  """
  queries = pandas_dataframe["query"].astype(str).to_list()
  passages = pandas_dataframe["passage"].astype(str).to_list()
  # NOTE(review): XLNet's own special tokens are "<sep>" / "<cls>"; the literal
  # markers below are kept unchanged because the fine-tuned checkpoint was
  # presumably trained with them - confirm before altering.
  queries = [query + " [SEP] " for query in queries]
  passages = [passage + " [SEP] [CLS]" for passage in passages]
  return queries, passages

In [None]:
def take_third(elem):
    """Sort key: return the item stored at index 2 of ``elem``."""
    third = elem[2]
    return third

# Create Ranking

In [None]:
import csv
import datetime
import torch.nn.functional as F

# Inference only: switch to eval mode and move the model to the selected device.
model_new.eval()
model_new.to(device)

# Score the passages of each remaining query (first 105 ids) and write one CSV per query.
for query_id in unqiue_ids[:105]:
    started_at = datetime.datetime.now()

    # All rows of the data array whose first column equals this query id.
    query_rows = data[data[:, 0] == query_id]
    query_df = pd.DataFrame(query_rows, columns = ['q_id','p_id','query', 'passage'])
    prediction_dataloader = change_data_pred(query_df)

    # Collect the logits of every batch as NumPy arrays, in row order.
    predictions = []
    for batch in prediction_dataloader:
        b_input_ids, b_seg_mask, b_input_mask = (t.to(device) for t in batch)
        # No gradients are needed for prediction.
        with torch.no_grad():
            logits = model_new(b_input_ids, token_type_ids=b_seg_mask, attention_mask=b_input_mask)[0]
        predictions.append(logits.detach().cpu().numpy())

    # Score = exp(logits) normalised by their sum, first entry of the first row.
    # Assumes one example per batch so that predictions[i] belongs to row i.
    scored_rows = []
    for i, row in enumerate(query_rows):
        exp_logits = np.exp(predictions[i])
        scored_rows.append([row[0], row[1], np.float32((exp_logits / exp_logits.sum())[0][0])])
    # Highest score first.
    scored_rows.sort(key=take_third, reverse=True)

    with open("/content/drive/MyDrive/Result_XLNET_dev_142000/" + str(query_id) + ".csv", 'w', newline='') as file:
        csv.writer(file, delimiter=',').writerows(scored_rows)
    print("ID: ", query_id, " Time: ", datetime.datetime.now() - started_at)

ID:  2  Time:  0:00:23.568243
ID:  1215  Time:  0:00:21.360681
ID:  1288  Time:  0:00:23.263053
ID:  1576  Time:  0:00:21.080295
ID:  2235  Time:  0:00:23.437359
ID:  2798  Time:  0:00:21.165129
ID:  2962  Time:  0:00:23.617015
ID:  4696  Time:  0:00:21.430287
ID:  4947  Time:  0:00:23.561391
ID:  5925  Time:  0:00:21.309607
ID:  6217  Time:  0:00:23.492304
ID:  6791  Time:  0:00:21.230791
ID:  7968  Time:  0:00:23.362040
ID:  8701  Time:  0:00:21.386571
ID:  8714  Time:  0:00:24.293412
ID:  8798  Time:  0:00:21.386561
ID:  8854  Time:  0:00:23.753273
ID:  9083  Time:  0:00:21.175363
ID:  9454  Time:  0:00:23.533185
ID:  9926  Time:  0:00:21.045585
ID:  10157  Time:  0:00:23.482944
ID:  10205  Time:  0:00:20.969958
ID:  10264  Time:  0:00:23.315260
ID:  10276  Time:  0:00:21.114624
ID:  10312  Time:  0:00:23.210323
ID:  11006  Time:  0:00:21.268262
ID:  11050  Time:  0:00:23.218066
ID:  11133  Time:  0:00:21.332393
ID:  11913  Time:  0:00:23.273484
ID:  12741  Time:  0:00:21.039873
ID:

KeyboardInterrupt: ignored