### importing libraries

In [None]:

from torch.utils.data import Dataset

import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
# from transformers import DistilBertTokenizer, DistilBertModel,DistilBertForSequenceClassification
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report


### Defining a custom dataset that returns the token ids, mask, is_fake label, and id for each item in the train set, and the token ids, mask, and id for each item in the test set.
### The tokenizer used is bert-base-uncased; each text is truncated to 512 tokens if it is longer and padded to 512 if it is shorter, since 512 is the maximum sequence length BERT accepts.

In [None]:
class FeatureDataset(Dataset):
    """Tokenized, padded view of a CSV file for BERT fine-tuning.

    The CSV must provide a ``text`` column and an ``id`` column; when
    ``is_train`` is true it must also provide an ``is_fake`` column.

    Args:
        file_name: Path of the CSV file to load.
        is_train: Whether the file carries ``is_fake`` labels.
        max_len: Fixed length every sequence is truncated/padded to,
            including the ``[CLS]`` and ``[SEP]`` markers.

    Attributes:
        body_text: ``(n_rows, max_len)`` long tensor of token ids.
        masks: ``(n_rows, max_len)`` float tensor, 1.0 on real tokens and
            0.0 on padding.
        is_fake: ``(n_rows,)`` float tensor of labels (training only).
        ID: array holding the ``id`` column.

    Raises:
        ValueError: If ``max_len`` leaves no room for the two markers.
    """

    def __init__(self, file_name, is_train, max_len=512):
        if max_len < 2:
            raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

        self.is_train = is_train

        file_out = pd.read_csv(file_name)
        # Empty cells are read as NaN (a float), which the tokenizer cannot
        # process; treat them as empty strings instead of crashing.
        body_text = file_out.loc[:, 'text'].fillna('').astype(str)

        if is_train:
            is_fake = file_out.loc[:, 'is_fake'].values
            self.is_fake = torch.tensor(is_fake).float()

        self.ID = file_out.loc[:, 'id'].values

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        # BERT inputs are framed as "[CLS] tokens [SEP]"; reserve two slots
        # for the markers so the framed sequence never exceeds max_len.
        text_budget = max_len - 2
        tokens = [
            ['[CLS]'] + tokenizer.tokenize(text)[:text_budget] + ['[SEP]']
            for text in body_text
        ]
        tokens_ids = [tokenizer.convert_tokens_to_ids(seq) for seq in tokens]
        tokens_ids = pad_sequences(
            tokens_ids, maxlen=max_len, truncating="post", padding="post", dtype="int"
        )

        # Force int64 ids regardless of the platform's default integer width.
        self.body_text = torch.as_tensor(tokens_ids, dtype=torch.long)
        # Padding uses id 0, so every positive id marks a real token.
        self.masks = (self.body_text > 0).float()

    def __len__(self):
        """Return the number of rows loaded from the CSV file."""
        return len(self.body_text)

    def __getitem__(self, idx):
        """Return ``(token_ids, mask, is_fake, id)`` for training data,
        or ``(token_ids, mask, id)`` otherwise."""
        if self.is_train:
            return self.body_text[idx], self.masks[idx], self.is_fake[idx], self.ID[idx]
        return self.body_text[idx], self.masks[idx], self.ID[idx]
        

In [None]:
# Tokenize both CSV files up front; labels are only read from the training file.
train_set = FeatureDataset(file_name='train.csv', is_train=True)
test_set = FeatureDataset(file_name='test.csv', is_train=False)

### The model is BERT base uncased with a linear classifier on top of it, followed by a sigmoid activation function

In [None]:
class BertBinaryClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear layer and a sigmoid.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
    """

    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # 768 is the width of the pooled output fed to the classifier head.
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the probability of the positive class for each sequence.

        Args:
            tokens: Batch of token ids.
            masks: Optional attention mask matching ``tokens`` (1 on real
                tokens, 0 on padding). When omitted, the encoder falls back
                to its own default handling.

        Returns:
            Tensor with one probability per input sequence, shape ``(batch, 1)``.
        """
        # Forward the mask so padding positions are excluded from attention;
        # without it every sequence is treated as fully populated.
        _, pooled_output = self.bert(
            tokens, attention_mask=masks, output_all_encoded_layers=False
        )
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

### The training data is split into train (80%) and validation (20%) subsets, and the batch size is set to 1

In [None]:
# Release cached GPU memory left over from earlier cells.
torch.cuda.empty_cache()
# NOTE(review): this only binds a Python variable; it does not set the
# CUDA_VISIBLE_DEVICES environment variable, so it has no effect on GPU choice.
CUDA_VISIBLE_DEVICES=2

# Batching and hold-out configuration.
batch_size = 1
validation_split = .2
shuffle_dataset = True
random_seed= 42

# The test set is iterated in file order.
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

# Carve the validation subset out of the (optionally shuffled) index list.
dataset_size = len(train_set)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    # Fixed seed keeps the train/validation partition reproducible.
    np.random.seed(random_seed)
    np.random.shuffle(indices)
val_indices = indices[:split]
train_indices = indices[split:]

# Both loaders read from train_set; the samplers restrict each to its indices.
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
valid_sampler = torch.utils.data.SubsetRandomSampler(val_indices)

train_dataloader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    sampler=train_sampler,
)
validation_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    sampler=valid_sampler,
)

# Report the number of batches in each loader.
for loader in (train_dataloader, validation_loader, test_dataloader):
    print(len(loader))


### The model is trained for 10 epochs, and the best model according to accuracy on the validation set is saved. The loss is binary cross-entropy and the optimizer is Adam with learning rate 3e-6

In [None]:
# Fine-tune the classifier and keep the checkpoint with the best validation
# accuracy in 'bert_clf.pt'.
EPOCHS = 10
# Fall back to the CPU when no GPU is available instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.to(device)
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
# The loss is stateless, so build it once rather than on every step.
loss_func = nn.BCELoss()
accuracy = 0
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        # The ids are not needed for training, so they stay on the CPU.
        token_ids, masks, labels, _ = batch_data
        token_ids, masks, labels = token_ids.to(device), masks.to(device), labels.to(device)
        probas = bert_clf(token_ids, masks)
        # The model emits (batch, 1) while labels are (batch,); BCELoss
        # requires both to have the same shape.
        batch_loss = loss_func(probas, labels.view_as(probas))
        train_loss += batch_loss.item()
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        # Report progress against the number of training batches only
        # (train_set also contains the held-out validation rows).
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_dataloader), train_loss / (step_num + 1)))

    # Save the model only if the accuracy on the validation set improved.
    bert_clf.eval()
    correct = 0
    total = 0
    # Gradients are not needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch_data in validation_loader:
            token_ids, masks, true_label, _ = batch_data
            probs = bert_clf(token_ids.to(device), masks.to(device))
            # Probabilities of at least 0.5 are classified as fake (1).
            pred_label = (probs.view(-1) >= 0.5).float().cpu()
            correct += (pred_label == true_label.view(-1).float()).sum().item()
            total += true_label.numel()
    epoch_accuracy = correct / total if total else 0.0

    if epoch_accuracy > accuracy:
        # Update the best accuracy and save the model.
        accuracy = epoch_accuracy
        print('accuracy was improved')
        print('accuracy= {}'.format(accuracy))
        torch.save(bert_clf.state_dict(), 'bert_clf.pt')
    else:
        print('accuracy was not improved')


### loading the saved model

In [None]:
# Build a fresh classifier, restore the weights saved in 'bert_clf.pt',
# and move the model to the GPU.
bert_clf = BertBinaryClassifier()
best_state = torch.load('bert_clf.pt')
bert_clf.load_state_dict(best_state)
bert_clf.cuda()

### making predictions on the test set

In [None]:

# Label every test item with the restored model and write the submission file.
bert_clf.eval()
# Run on whichever device the model already lives on.
device = next(bert_clf.parameters()).device
batch_ids = []
batch_labels = []
# Inference does not need gradients; disabling them saves memory.
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, IDs = batch_data
        probs = bert_clf(token_ids.to(device), masks.to(device))
        # Flatten so the same code works for any batch size.
        batch_ids.append(IDs.numpy().reshape(-1))
        # Probabilities of at least 0.5 are classified as fake (1).
        batch_labels.append((probs.view(-1) >= 0.5).cpu().numpy())
        print(step_num)

# Build the frame once from integer arrays instead of writing row by row
# into a float-initialised frame.
if batch_ids:
    all_ids = np.concatenate(batch_ids).astype('int64')
    all_labels = np.concatenate(batch_labels).astype('int32')
else:
    all_ids = np.zeros(0, dtype='int64')
    all_labels = np.zeros(0, dtype='int32')

pred = pd.DataFrame({'id': all_ids, 'is_fake': all_labels})
pred.to_csv('first_prediction.csv', index = False)