In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import *
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
train = pd.read_csv('../input/smai-project-data/train_data.csv')
val = pd.read_csv('../input/smai-project-data/val_data.csv')


In [4]:
train['merged'] = train['question1'] + " " + train['question2']
val['merged'] = val['question1'] + " " + val['question2']


In [5]:
train.shape, val.shape

((283003, 9), (80858, 9))

In [6]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_preprocessed,question2_preprocessed,merged
0,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,how do i play pok mon go in korea ?,how do i play pok mon go in china ?,How do I play Pokémon GO in Korea? How do I pl...
1,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,What are some of the best side dishes for crab...
2,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,which is more advisable and better material fo...,what is the best server setup for buddypress ?,Which is more advisable and better material fo...
3,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,how do i improve logical programming skills ?,how can i improve my logical skills for progra...,How do I improve logical programming skills? H...
4,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,how close we are to see 3rd world war ?,how close is a world war iii ?,How close we are to see 3rd world war? How clo...


In [7]:
val.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_preprocessed,question2_preprocessed,merged
0,81995,139138,139139,When were emojis created? How did they become ...,Who created the emojis and when where they cre...,1,when were emojis created ? how did they become...,who created the emojis and when where they cre...,When were emojis created? How did they become ...
1,289624,233615,68923,How much time did you take to learn guitar?,How much time it will take for the beginners t...,1,how much time did you take to learn guitar ?,how much time it will take for the beginners t...,How much time did you take to learn guitar? Ho...
2,66887,62546,115846,How long does it take to learn how to develop ...,How much time does it take to learn and do And...,0,how long does it take to learn how to develop ...,how much time does it take to learn and do and...,How long does it take to learn how to develop ...
3,236648,347622,347623,What is the kinetic energy of a 150kg object t...,Is kinetic energy gained when it is moving at ...,0,what is the kinetic energy of a 150kg object t...,is kinetic energy gained when it is moving at ...,What is the kinetic energy of a 150kg object t...
4,289076,410137,410138,What causes this reproducible optical illusion?,What causes an optical illusion?,0,what causes this reproducible optical illusion ?,what causes an optical illusion ?,What causes this reproducible optical illusion...


In [8]:
train['is_duplicate'].unique(), val['is_duplicate'].unique()

(array([0, 1]), array([1, 0]))

In [9]:
train.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,283003.0,283003.0,283003.0,283003.0
mean,202192.331979,217054.933467,220872.439928,0.369667
std,116731.031422,157731.997689,160025.552542,0.482715
min,0.0,1.0,2.0,0.0
25%,101051.5,74268.0,74497.0,0.0
50%,202121.0,191808.0,196616.0,0.0
75%,303346.5,346170.0,354658.0,1.0
max,404289.0,537932.0,537933.0,1.0


In [10]:
val.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,80858.0,80858.0,80858.0,80858.0
mean,202180.180131,218050.184744,221341.987831,0.366272
std,116628.431722,157931.573085,159683.076997,0.481788
min,3.0,4.0,8.0,0.0
25%,101138.0,75069.75,75543.25,0.0
50%,202441.0,193820.5,198467.5,0.0
75%,303186.75,348423.5,355237.75,1.0
max,404283.0,537924.0,537925.0,1.0


In [11]:
new_df = train[['merged', 'is_duplicate']]

In [12]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
class CitationData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe.merged
        self.targets = dataframe.is_duplicate
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [14]:
# train_data, test_data = train_test_split(new_df, test_size=0.15)
# train_data = train_data.reset_index(drop=True)
# test_data = test_data.reset_index(drop=True)


# print("FULL Dataset: {}".format(new_df.shape))
print("Train Dataset: {}".format(train.shape))
print("Validation Dataset: {}".format(val.shape))

training_set = CitationData(train, tokenizer, MAX_LEN)
testing_set = CitationData(val, tokenizer, MAX_LEN)

Train Dataset: (283003, 9)
Validation Dataset: (80858, 9)


In [15]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [16]:
class BertClass(torch.nn.Module):
    def __init__(self):
        super(BertClass, self).__init__()
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 2)
        self.relu = torch.nn.ReLU()

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = self.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [17]:
model = BertClass()
model.to(device)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [18]:
model = torch.load('../input/smai-transformers-model/pytorch_bert.bin')

In [19]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [20]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [21]:
def train(epoch, training_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%500==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 500 steps: {loss_step}")
            print(f"Training Accuracy per 500 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [22]:
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch, training_loader)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Loss per 500 steps: 0.10669399797916412
Training Accuracy per 500 steps: 100.0


500it [03:48,  2.21it/s]

Training Loss per 500 steps: 0.2138806384567082
Training Accuracy per 500 steps: 90.7435129740519


1000it [07:36,  2.20it/s]

Training Loss per 500 steps: 0.20612926865959508
Training Accuracy per 500 steps: 91.15884115884116


1500it [11:24,  2.19it/s]

Training Loss per 500 steps: 0.21079475811274995
Training Accuracy per 500 steps: 91.1975349766822


2000it [15:12,  2.19it/s]

Training Loss per 500 steps: 0.21112901068180623
Training Accuracy per 500 steps: 91.16691654172914


2500it [19:01,  2.19it/s]

Training Loss per 500 steps: 0.2095717782005634
Training Accuracy per 500 steps: 91.27848860455818


3000it [22:49,  2.20it/s]

Training Loss per 500 steps: 0.20987160926803752
Training Accuracy per 500 steps: 91.30706431189603


3500it [26:37,  2.21it/s]

Training Loss per 500 steps: 0.2090726416859357
Training Accuracy per 500 steps: 91.36675235646958


4000it [30:26,  2.19it/s]

Training Loss per 500 steps: 0.2099885510541026
Training Accuracy per 500 steps: 91.28655336165959


4500it [34:15,  2.21it/s]

Training Loss per 500 steps: 0.20988168975188223
Training Accuracy per 500 steps: 91.31304154632303


5000it [38:03,  2.19it/s]

Training Loss per 500 steps: 0.2094663741188379
Training Accuracy per 500 steps: 91.32673465306938


5500it [41:53,  2.19it/s]

Training Loss per 500 steps: 0.20904572074060457
Training Accuracy per 500 steps: 91.34702781312488


6000it [45:41,  2.19it/s]

Training Loss per 500 steps: 0.20862015515470117
Training Accuracy per 500 steps: 91.38893517747042


6500it [49:30,  2.18it/s]

Training Loss per 500 steps: 0.2097169401766293
Training Accuracy per 500 steps: 91.34556222119674


7000it [53:19,  2.20it/s]

Training Loss per 500 steps: 0.2095202169827604
Training Accuracy per 500 steps: 91.37444650764176


7500it [57:08,  2.21it/s]

Training Loss per 500 steps: 0.20911685214928122
Training Accuracy per 500 steps: 91.38114918010932


8000it [1:00:57,  2.20it/s]

Training Loss per 500 steps: 0.20880661313455925
Training Accuracy per 500 steps: 91.42450943632046


8500it [1:04:46,  2.17it/s]

Training Loss per 500 steps: 0.20942846214153585
Training Accuracy per 500 steps: 91.40542289142454


9000it [1:08:35,  2.19it/s]

Training Loss per 500 steps: 0.20968630131255664
Training Accuracy per 500 steps: 91.4037329185646


9500it [1:12:24,  2.17it/s]

Training Loss per 500 steps: 0.20998020375577123
Training Accuracy per 500 steps: 91.37196084622671


10000it [1:16:13,  2.15it/s]

Training Loss per 500 steps: 0.2098003729179066
Training Accuracy per 500 steps: 91.38211178882112


10500it [1:20:02,  2.20it/s]

Training Loss per 500 steps: 0.21005395278037287
Training Accuracy per 500 steps: 91.36034663365393


11000it [1:23:52,  2.19it/s]

Training Loss per 500 steps: 0.2101711785868214
Training Accuracy per 500 steps: 91.34965003181529


11500it [1:27:41,  2.20it/s]

Training Loss per 500 steps: 0.209724743501263
Training Accuracy per 500 steps: 91.37140248674028


12000it [1:31:31,  2.20it/s]

Training Loss per 500 steps: 0.2097267963540714
Training Accuracy per 500 steps: 91.37363553037247


12500it [1:35:21,  2.20it/s]

Training Loss per 500 steps: 0.209525374819822
Training Accuracy per 500 steps: 91.38368930485561


13000it [1:39:10,  2.19it/s]

Training Loss per 500 steps: 0.20957929986224444
Training Accuracy per 500 steps: 91.37566341050689


13500it [1:42:59,  2.19it/s]

Training Loss per 500 steps: 0.20948862192387765
Training Accuracy per 500 steps: 91.38304570031849


14000it [1:46:49,  2.18it/s]

Training Loss per 500 steps: 0.20944497288565442
Training Accuracy per 500 steps: 91.37472323405471


14500it [1:50:38,  2.19it/s]

Training Loss per 500 steps: 0.20965499010553332
Training Accuracy per 500 steps: 91.36783670091718


15000it [1:54:28,  2.19it/s]

Training Loss per 500 steps: 0.2098587673063916
Training Accuracy per 500 steps: 91.35724285047664


15500it [1:58:17,  2.20it/s]

Training Loss per 500 steps: 0.209897418333285
Training Accuracy per 500 steps: 91.34975162892717


16000it [2:02:06,  2.08it/s]

Training Loss per 500 steps: 0.20990831298658033
Training Accuracy per 500 steps: 91.34350978063871


16500it [2:05:56,  2.18it/s]

Training Loss per 500 steps: 0.20944280907769064
Training Accuracy per 500 steps: 91.3785528149809


17000it [2:09:45,  2.20it/s]

Training Loss per 500 steps: 0.20952115929583848
Training Accuracy per 500 steps: 91.3637433092171


17500it [2:13:34,  2.21it/s]

Training Loss per 500 steps: 0.20976182615091069
Training Accuracy per 500 steps: 91.34120907376722


18000it [2:17:23,  2.20it/s]

Training Loss per 500 steps: 0.20970479193322802
Training Accuracy per 500 steps: 91.34631409366146


18500it [2:21:12,  2.19it/s]

Training Loss per 500 steps: 0.20987331009117038
Training Accuracy per 500 steps: 91.33154964596508


19000it [2:25:01,  2.18it/s]

Training Loss per 500 steps: 0.21002231665819876
Training Accuracy per 500 steps: 91.32085153412979


19500it [2:28:50,  2.20it/s]

Training Loss per 500 steps: 0.20989025598308972
Training Accuracy per 500 steps: 91.31582995743808


20000it [2:32:39,  2.15it/s]

Training Loss per 500 steps: 0.21003102635870913
Training Accuracy per 500 steps: 91.31668416579171


20500it [2:36:28,  2.21it/s]

Training Loss per 500 steps: 0.21013037108117782
Training Accuracy per 500 steps: 91.3181064338325


21000it [2:40:17,  2.19it/s]

Training Loss per 500 steps: 0.21012060915856354
Training Accuracy per 500 steps: 91.31231846102567


21500it [2:44:06,  2.20it/s]

Training Loss per 500 steps: 0.2101696596848431
Training Accuracy per 500 steps: 91.3108692618948


22000it [2:47:56,  2.18it/s]

Training Loss per 500 steps: 0.21021251747313752
Training Accuracy per 500 steps: 91.30494068451434


22500it [2:51:45,  2.21it/s]

Training Loss per 500 steps: 0.2103165521800552
Training Accuracy per 500 steps: 91.29483134082929


23000it [2:55:34,  2.19it/s]

Training Loss per 500 steps: 0.21006388042413932
Training Accuracy per 500 steps: 91.30146515368897


23500it [2:59:23,  2.14it/s]

Training Loss per 500 steps: 0.21015018185201306
Training Accuracy per 500 steps: 91.29771073571338


24000it [3:03:13,  2.20it/s]

Training Loss per 500 steps: 0.21002036252022765
Training Accuracy per 500 steps: 91.30348735469356


24500it [3:07:02,  2.19it/s]

Training Loss per 500 steps: 0.2101973878707434
Training Accuracy per 500 steps: 91.29270233868006


25000it [3:10:51,  2.19it/s]

Training Loss per 500 steps: 0.20994437440648275
Training Accuracy per 500 steps: 91.29434822607095


25500it [3:14:41,  2.14it/s]

Training Loss per 500 steps: 0.20990679048905939
Training Accuracy per 500 steps: 91.29494921767774


26000it [3:18:30,  2.20it/s]

Training Loss per 500 steps: 0.2102799113601153
Training Accuracy per 500 steps: 91.27533556401677


26500it [3:22:19,  2.21it/s]

Training Loss per 500 steps: 0.2100142878275691
Training Accuracy per 500 steps: 91.2937247651032


27000it [3:26:08,  2.21it/s]

Training Loss per 500 steps: 0.2101141394124558
Training Accuracy per 500 steps: 91.29198918558572


27500it [3:29:58,  2.20it/s]

Training Loss per 500 steps: 0.21029878793458123
Training Accuracy per 500 steps: 91.2730446165594


28000it [3:33:47,  2.12it/s]

Training Loss per 500 steps: 0.21017119547650503
Training Accuracy per 500 steps: 91.27531159601443


28500it [3:37:37,  2.16it/s]

Training Loss per 500 steps: 0.20993812025643696
Training Accuracy per 500 steps: 91.28276200835059


29000it [3:41:27,  2.14it/s]

Training Loss per 500 steps: 0.20975454311434527
Training Accuracy per 500 steps: 91.2959897934554


29500it [3:45:18,  2.20it/s]

Training Loss per 500 steps: 0.20959930848363545
Training Accuracy per 500 steps: 91.30029490525745


30000it [3:49:08,  2.19it/s]

Training Loss per 500 steps: 0.20974930272889164
Training Accuracy per 500 steps: 91.29404019866004


30500it [3:52:58,  2.19it/s]

Training Loss per 500 steps: 0.20983591964084375
Training Accuracy per 500 steps: 91.29167896134553


31000it [3:56:48,  2.19it/s]

Training Loss per 500 steps: 0.20974113610266068
Training Accuracy per 500 steps: 91.29302280571595


31500it [4:00:39,  2.10it/s]

Training Loss per 500 steps: 0.2098412559222832
Training Accuracy per 500 steps: 91.29392717691502


32000it [4:04:30,  2.15it/s]

Training Loss per 500 steps: 0.20989459731600288
Training Accuracy per 500 steps: 91.29128777225712


32500it [4:08:20,  2.20it/s]

Training Loss per 500 steps: 0.20988799114738502
Training Accuracy per 500 steps: 91.2864219562475


33000it [4:12:10,  2.19it/s]

Training Loss per 500 steps: 0.20981951119604497
Training Accuracy per 500 steps: 91.29193054755916


33500it [4:16:01,  2.19it/s]

Training Loss per 500 steps: 0.20984882773914285
Training Accuracy per 500 steps: 91.29242410674308


34000it [4:19:51,  2.19it/s]

Training Loss per 500 steps: 0.20977131838001759
Training Accuracy per 500 steps: 91.29731478485927


34500it [4:23:41,  2.13it/s]

Training Loss per 500 steps: 0.2096695534582203
Training Accuracy per 500 steps: 91.30278832497609


35000it [4:27:31,  2.18it/s]

Training Loss per 500 steps: 0.20973490815087512
Training Accuracy per 500 steps: 91.30274849290021


35376it [4:30:24,  2.18it/s]

The Total Accuracy for Epoch 0: 91.30044557831542
Training Loss Epoch: 0.20980390078058372
Training Accuracy Epoch: 91.30044557831542





In [23]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [24]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

2it [00:00, 11.32it/s]

Validation Loss per 100 steps: 0.03703940659761429
Validation Accuracy per 100 steps: 100.0


5002it [06:48, 12.09it/s]

Validation Loss per 100 steps: 0.2432929694773432
Validation Accuracy per 100 steps: 89.65706858628275


10002it [13:37, 11.80it/s]

Validation Loss per 100 steps: 0.24237487163936092
Validation Accuracy per 100 steps: 89.66353364663533


15003it [20:25, 11.88it/s]

Validation Loss per 100 steps: 0.241392637118825
Validation Accuracy per 100 steps: 89.75901606559563


20003it [27:14, 12.31it/s]

Validation Loss per 100 steps: 0.23962147290163993
Validation Accuracy per 100 steps: 89.82300884955752


20215it [27:31, 12.24it/s]

Validation Loss Epoch: 0.23983465061651796
Validation Accuracy Epoch: 89.80929530782359
Accuracy on test data = 89.81%





In [25]:
class CitationData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.merged
        self.targets = self.data.is_duplicate
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [26]:
test_data_f = pd.read_csv('../input/smai-project-data/test_data.csv')
test_data_f['merged'] = test_data_f['question1'] + " " + test_data_f['question2']

In [27]:
test_data_f.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_preprocessed,question2_preprocessed,merged
0,204673,93885,307635,"If there is a God, where is He!","Why is god a ""He""?",0,"if there is a god , where is he !",why is god a `` he '' ?,"If there is a God, where is He! Why is god a ""..."
1,17716,2093,15628,Do you believe that everything happens for a r...,Does everything happen for a reason?,1,do you believe that everything happens for a r...,does everything happen for a reason ?,Do you believe that everything happens for a r...


In [28]:
data_to_test = CitationData(test_data_f[['merged', 'is_duplicate']], tokenizer, MAX_LEN)

test_params = {'batch_size': 8,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader_f = DataLoader(data_to_test, **test_params)

In [29]:
def test(model, testing_loader):
    res = []
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#             targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            big_val, big_idx = torch.max(outputs, dim=1)
            res.extend(big_idx.tolist())
    
    return res

In [30]:
res = test(model, testing_loader_f)

5054it [13:22,  6.30it/s]


In [31]:
acc = valid(model, testing_loader_f)
print("Accuracy on test data = %0.2f%%" % acc)

2it [00:00,  5.91it/s]

Validation Loss per 100 steps: 0.055543020367622375
Validation Accuracy per 100 steps: 100.0


5002it [13:15,  6.33it/s]

Validation Loss per 100 steps: 0.2407352560750285
Validation Accuracy per 100 steps: 89.67956408718257


5054it [13:23,  6.29it/s]

Validation Loss Epoch: 0.24062864086756555
Validation Accuracy Epoch: 89.68562170719038
Accuracy on test data = 89.69%





In [32]:
# submission = pd.DataFrame()
# submission['unique_id'] = test_data_f['unique_id']
# submission['citation_influence_label'] = res

In [33]:
# submission.to_csv('submission.csv', index=False)

In [34]:
# train_data.to_csv('train.csv', index=False)
# test_data.to_csv('val.csv', index=False)

In [35]:
output_model_file = 'pytorch_bert.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved
