In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import *
import logging
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
 
# Shared NLP helpers referenced by the Preprocessing class.
lemmatizer = WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` from the nltk corpus reader to the
# English stop-word collection. It is stored as a set so each
# `word not in stopwords` test is a hash lookup rather than a list scan.
stopwords = set(stopwords.words('english'))
# Configure the root logger to report ERROR and above only.
logging.basicConfig(level=logging.ERROR)

In [None]:
# Setting up the device for GPU usage

from torch import cuda

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# train = pd.read_csv('../input/iremajorprojectdata/train.csv')
# val = pd.read_csv('../input/iremajorprojectdata/val.csv')

SEED = 42  # fixed seed so the train/validation split is reproducible

# Load the full-context citation data and keep only rows that have a
# non-null `full_context` value; renumber the surviving rows 0..n-1.
data = pd.read_csv('../input/full-context/full_context.csv')
data = data[data['full_context'].notnull()].reset_index(drop=True)
print(data.shape)

# Hold out 10% of the rows for validation. Each split gets a fresh 0..n-1
# index so rows can be addressed by their position. Plain reassignment is
# used instead of inplace=True to avoid mutating a slice of `data`.
train, val = train_test_split(data, test_size=0.1, random_state=SEED)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [None]:
train.shape, val.shape

In [None]:
train.head()

In [None]:
val.head()

In [None]:
train['citation_influence_label'].unique(), val['citation_influence_label'].unique()

In [None]:
train.describe()

In [None]:
val.describe()

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512  # max_length handed to the tokenizer for every example
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the validation DataLoader
# EPOCHS = 1
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer
# Tokenizer for the 'bert-base-uncased' checkpoint (same name BertClass loads).
# NOTE(review): `truncation` is normally an encode-time argument; confirm that
# passing it to from_pretrained has any effect.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)
# Alternative tokenizers kept from earlier experiments:
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
# tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', truncation=True, do_lower_case=True)
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
class CitationData(Dataset):
    """Torch dataset that tokenizes citation contexts and pairs them with labels.

    Args:
        dataframe: frame providing a ``citation_context`` text column and a
            ``citation_influence_label`` column.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe.citation_context
        self.targets = dataframe.citation_influence_label
        self.max_len = max_len

    def __len__(self):
        """Number of examples (rows of the text column)."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the ``index``-th row (by position) and return it as tensors.

        Returns:
            dict with 'ids', 'mask', 'token_type_ids' (torch.long) and
            'targets' (torch.float).
        """
        # Positional lookup: a label lookup (self.text[index]) raises KeyError
        # whenever the frame's index is not exactly 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length=True
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
class Preprocessing():
    """Text-cleaning helpers for citation contexts.

    Relies on the notebook-level names ``word_tokenize``, ``stopwords`` and
    ``lemmatizer`` for the token-level steps.
    """

    def __init__(self):
        """No state is kept on the instance."""

    def remove_brackets(self, text):
        """Strip every ``(...)`` group (parentheses included) from ``text``."""
        return re.sub(r'\([^)]*\)', '', text)

    def tokenize_text(self, text):
        """Split ``text`` into tokens with ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, text):
        """Return the tokens of ``text`` that are not in ``stopwords``."""
        return list(filter(lambda token: token not in stopwords, text))

    def lemmatize(self, text):
        """Return each token of ``text`` passed through ``lemmatizer.lemmatize``."""
        lemmas = []
        for token in text:
            lemmas.append(lemmatizer.lemmatize(token))
        return lemmas

    def preprocess(self, text):
        """Replace the '#AUTHOR_TAG' marker with a space, then drop bracketed spans.

        Tokenizing, stop-word removal and lemmatizing are available as
        separate methods but are not applied here.
        """
        without_tag = text.replace('#AUTHOR_TAG', ' ')
        return self.remove_brackets(without_tag)

In [None]:
# preprocessor = Preprocessing()

# train['citation_context'] = [preprocessor.preprocess(sent) for sent in train['citation_context']]
# val['citation_context'] = [preprocessor.preprocess(sent) for sent in val['citation_context']]

In [None]:
# train_data, test_data = train_test_split(new_df, test_size=0.15)
# train_data = train_data.reset_index(drop=True)
# test_data = test_data.reset_index(drop=True)


# print("FULL Dataset: {}".format(new_df.shape))
# print("Train Dataset: {}".format(train.shape))
# print("Validation Dataset: {}".format(val.shape))

# Wrap the two splits in tokenizing datasets.
# NOTE: `testing_set` is built from the validation split (`val`).
training_set = CitationData(train, tokenizer, MAX_LEN)
testing_set = CitationData(val, tokenizer, MAX_LEN)

In [None]:
# Training batches are reshuffled by the DataLoader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling; a fixed order keeps the
# per-step validation logs comparable between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class BertClass(torch.nn.Module):
    """'bert-base-uncased' encoder followed by a small classification head.

    The head takes the encoder output at sequence position 0 and applies
    Linear(768, 768) -> ReLU -> Dropout(0.1) -> Linear(768, 2), returning the
    raw two-class scores.
    """

    def __init__(self):
        super().__init__()
        # Encoder backbone. Alternatives kept from earlier experiments:
        #   DistilBertModel.from_pretrained("distilbert-base-uncased")
        #   RobertaModel.from_pretrained("roberta-base")
        #   AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
        #   AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 2)
        self.relu = torch.nn.ReLU()

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output for a batch of encoded inputs."""
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoder_out[0][:, 0]
        hidden = self.dropout(self.relu(self.pre_classifier(first_token)))
        return self.classifier(hidden)

#     Unused alternative pooling, kept for reference:
#     def mean_pooling(self, model_output, attention_mask):
#         token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#         sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
#         sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
#         return sum_embeddings / sum_mask

In [None]:
# Build the classifier and move its parameters to the selected device.
model = BertClass()
model.to(device)

In [None]:
# Replace `model` with a previously saved, fully pickled model object.
# map_location remaps tensors that were saved on a GPU so the checkpoint also
# loads on a CPU-only host; .to(device) then places it on the active device.
# SECURITY: torch.load unpickles arbitrary objects -- only load checkpoints
# that come from a trusted source.
model = torch.load('../input/iremodel/pytorch_bert.bin', map_location=device)
model.to(device)
model.eval()

In [None]:
# Loss and optimizer shared by the training and validation loops
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    """Count the positions where ``preds`` equals ``targets``.

    Despite the name, the result is a raw count of matches, not a ratio.
    """
    matches = preds == targets
    return matches.sum().item()

In [None]:
def train(epoch, training_loader):
    """Run one optimisation pass over ``training_loader``.

    Works on the notebook-level ``model``, ``loss_function``, ``optimizer``
    and ``device``. Prints the running loss / accuracy every 500 steps and
    the totals for the pass at the end.

    Args:
        epoch: epoch number; only used in the printed summary.
        training_loader: iterable of batches, each a dict holding the
            'ids', 'mask', 'token_type_ids' and 'targets' tensors.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can read its
    # length and show real progress.
    for step, batch in enumerate(tqdm(training_loader)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest score; detached from autograd.
        predictions = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accuracy(predictions, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 500 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 500 steps: {loss_step}")
            print(f"Training Accuracy per 500 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_examples == 0:
        # Nothing was iterated: avoid dividing by zero in the summary below.
        print(f"No training batches were seen in epoch {epoch}")
        return

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Number of full passes over the training loader.
EPOCHS = 2
# Each call runs one pass and prints its loss / accuracy summary.
for epoch in range(EPOCHS):
    train(epoch, training_loader)

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating its weights.

    Uses the notebook-level ``loss_function`` and ``device``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: iterable of batches, each a dict holding the
            'ids', 'mask', 'token_type_ids' and 'targets' tensors.

    Returns:
        Accuracy in percent (correct * 100 / examples); 0.0 when the loader
        yields no examples.
    """
    log_every = 5000  # steps between progress prints
    model.eval()
    total_loss = 0
    n_correct = 0
    n_steps = 0
    n_examples = 0
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length.
        for step, batch in enumerate(tqdm(testing_loader)):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            total_loss += loss_function(outputs, targets).item()
            # Predicted class = index of the largest score.
            predictions = outputs.argmax(dim=1)
            n_correct += calcuate_accuracy(predictions, targets)

            n_steps += 1
            n_examples += targets.size(0)

            if step % log_every == 0:
                loss_step = total_loss / n_steps
                accu_step = (n_correct * 100) / n_examples
                # The message now reports the interval actually used.
                print(f"Validation Loss per {log_every} steps: {loss_step}")
                print(f"Validation Accuracy per {log_every} steps: {accu_step}")

    if n_examples == 0:
        # Nothing was iterated: avoid dividing by zero below.
        print("Validation loader produced no examples")
        return 0.0

    epoch_loss = total_loss / n_steps
    epoch_accu = (n_correct * 100) / n_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [None]:
# Score the model on the validation loader; `valid` returns a percentage.
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

In [None]:
class CitationData(Dataset):
    """Label-free variant of the citation dataset, used for inference.

    NOTE: this redefines (and therefore shadows) the ``CitationData`` class
    declared earlier in the notebook; items produced here carry no 'targets'.

    Args:
        dataframe: frame providing a ``citation_context`` text column.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.citation_context
        self.max_len = max_len

    def __len__(self):
        """Number of examples (rows of the text column)."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the ``index``-th row (by position) and return it as tensors.

        Returns:
            dict with 'ids', 'mask' and 'token_type_ids' (all torch.long).
        """
        # Positional lookup: a label lookup (self.text[index]) raises KeyError
        # whenever the frame's index is not exactly 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length=True
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [None]:
# test_data_f = pd.read_csv('../input/ire-major-project/SDP_test.csv')
test_data_f = pd.read_csv('../input/iremajorprojectdata/val.csv')

In [None]:
test_data_f.head(2)

In [None]:
# test_data_f['citation_context'] = [preprocessor.preprocess(sent) for sent in test_data_f['citation_context']]

In [None]:
# Inference dataset: only the text column is handed over, no labels.
data_to_test = CitationData(test_data_f[['citation_context']], tokenizer, MAX_LEN)

# shuffle=False keeps batches in the row order of test_data_f.
test_params = dict(batch_size=4, shuffle=False, num_workers=0)

testing_loader_f = DataLoader(data_to_test, **test_params)

In [None]:
def test(model, testing_loader):
    res = []
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#             targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            big_val, big_idx = torch.max(outputs, dim=1)
            res.extend(big_idx.tolist())
    
    return res

In [None]:
res = test(model, testing_loader_f)

In [None]:
test_data_f.head()

In [None]:
test_data_f['predictions'] = res

In [None]:
test_data_f.shape

In [None]:
# test_data_f[test_data_f['citation_influence_label']==test_data_f['predictions']].shape

In [None]:
# avg_length = sum([len(sent.split()) for sent in test_data_f['citation_context']])/450

# avg_length

In [None]:
# correct = test_data_f[test_data_f['citation_influence_label']==test_data_f['predictions']]
# avg_length = sum([len(sent.split()) for sent in correct['citation_context']])/305

# avg_length

In [None]:
# incorrect = test_data_f[test_data_f['citation_influence_label']!=test_data_f['predictions']]
# avg_length = sum([len(sent.split()) for sent in incorrect['citation_context']])/incorrect.shape[0]

# avg_length

In [None]:
# max_incorrect_length = max([len(sent.split()) for sent in incorrect['citation_context']])
# min_incorrect_length = min([len(sent.split()) for sent in incorrect['citation_context']])

# max_correct_length = max([len(sent.split()) for sent in correct['citation_context']])
# min_correct_length = min([len(sent.split()) for sent in correct['citation_context']])

# max_incorrect_length, min_incorrect_length, max_correct_length, min_correct_length

In [None]:
# correct.shape, incorrect.shape

In [None]:
# incorrect.head()

In [None]:
# incorrect.reset_index(inplace=True)
# correct.reset_index(inplace=True)

In [None]:
# incorrect['citation_context'][13], incorrect.citation_influence_label[13], incorrect.predictions[13]

In [None]:
# incorrect.citation_context[8], incorrect.citation_influence_label[8], incorrect.predictions[8]

In [None]:
# incorrect.citation_context[45], incorrect.citation_influence_label[45], incorrect.predictions[45]

In [None]:
# incorrect.citation_context[76], incorrect.citation_influence_label[76], incorrect.predictions[76]

In [None]:
# incorrect.citation_context[28], incorrect.citation_influence_label[28], incorrect.predictions[28]

In [None]:
# incorrect.citation_context[10], incorrect.citation_influence_label[10], incorrect.predictions[10]

In [None]:
# incorrect.citation_context[33], incorrect.citation_influence_label[33], incorrect.predictions[33]

In [None]:
# incorrect.citation_context[85], incorrect.citation_influence_label[85], incorrect.predictions[85]

In [None]:
# correct.citation_context[8], correct.citation_influence_label[8], correct.predictions[8]

In [None]:
# correct.citation_context[65], correct.citation_influence_label[65], correct.predictions[65]

In [None]:
# correct.citation_context[145], correct.citation_influence_label[145], correct.predictions[145]

In [None]:
# Two-column submission frame: the id column of test_data_f next to the
# predicted labels collected in `res` (same row order).
submission = pd.DataFrame({
    'unique_id': test_data_f['unique_id'],
    'citation_influence_label': res,
})

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
# train_data.to_csv('train.csv', index=False)
# test_data.to_csv('val.csv', index=False)

In [None]:
# File the pickled model object is written to.
# NOTE(review): the name says 'scibert' while the tokenizer / backbone set up
# earlier in the notebook is 'bert-base-uncased' -- confirm the intended name.
output_model_file = 'pytorch_scibert.bin'
# Location handed to tokenizer.save_vocabulary (current directory).
output_vocab_file = './'

# torch.save is given the module object itself, so the whole model is pickled.
model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')