In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import re
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
max_length = 4096
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
device

device(type='cuda', index=0)

In [4]:
train_data = pd.read_csv("../dataset/train.csv")
test_data = pd.read_csv("../dataset/test.csv")

In [5]:
train_data = train_data.fillna("")
test_data = test_data.fillna("")

In [6]:
idx = int(len(train_data) * 0.8)
val_data = train_data.iloc[:idx, :]
train_data_data = train_data.iloc[idx:, :]

# Data exploration

In [4]:
train_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
profile = ProfileReport(train_data)
profile.to_file("my_report.html")

Summarize dataset: 100%|██████████| 16/16 [00:23<00:00,  1.49s/it, Completed]                
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.63it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 143.42it/s]


In [5]:
train_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

# Inefficinet way of processing data

In [5]:
def clean_text(text: str):
	text = text.lower()

	text = text.encode("ascii", "ignore").decode() # remove non-ascii characters(emojis, chinese, japanese, etc)
	text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
	text = re.sub(r"http\S+", "",text)
	punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
	for p in punctuations:
		text = text.replace(p,'')

	return text

In [6]:
train_data["text"] = train_data['text'].apply(lambda text: clean_text(text))
test_data["text"] = test_data['text'].apply(lambda text: clean_text(text))

In [7]:
train_data = train_data.iloc[:, -2:]
test_text = test_data["text"]

In [8]:
train_text = train_data["text"]
train_labels = train_data["label"]

In [11]:
def tokenize(text) -> torch.tensor:
	encoded = tokenizer.encode_plus(text, max_length=max_length, truncation=True, add_special_tokens=True, return_token_type_ids=False, padding='max_length',
    return_attention_mask=True,
    return_tensors='pt')

	return encoded['input_ids'][0], encoded['attention_mask'][0]

In [12]:
train_tokens_list = []
train_attn_masks_list = []
for text in list(train_text):
	tokens, attn_masks = tokenize(text)
	train_tokens_list.append(tokens.numpy())
	train_attn_masks_list.append(attn_masks.numpy())

In [13]:
test_tokens_list = []
test_attn_masks_list = []
for text in list(test_text):
	tokens, attn_masks = tokenize(text)
	test_tokens_list.append(tokens.numpy())
	test_attn_masks_list.append(attn_masks.numpy())

In [14]:
train_tokenized_text_df = pd.DataFrame(train_tokens_list)
test_tokenized_text_df = pd.DataFrame(test_tokens_list)

KeyboardInterrupt: 

In [None]:
X_train, X_val, y_train, y_val, train_mask, val_mask = train_test_split(train_tokenized_text_df, train_labels, train_attn_masks_list, test_size=0.2, random_state=42)

# Efficient

In [7]:
from typing import Any


class FakeNewsDataset(Dataset):
	def __init__(self, dataset: pd.DataFrame, transform=None, has_labels=True):
		self.fake_news_dataset = dataset
		self.transform = transform
		self.has_labels = has_labels
	
	def __len__(self):
		return len(self.fake_news_dataset)
	
	def __getitem__(self, idx):
		text = self.fake_news_dataset.loc[idx, "text"]
		if self.transform:
			text = self.transform(text)

		if self.has_labels:
			label = self.fake_news_dataset.loc[idx, "label"]
			return text, label
		
		return text
	
class CleanText(object):
	def __call__(self, text):
		text = text.lower()

		text = text.encode("ascii", "ignore").decode() # remove non-ascii characters(emojis, chinese, japanese, etc)
		text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
		text = re.sub(r"http\S+", "",text)
		punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
		for p in punctuations:
			text = text.replace(p,'')

		return text

class TokenizeTransform(object):
	def __call__(self, text):
		encoded = tokenizer.encode_plus(text, max_length=max_length, truncation=True, add_special_tokens=True, return_token_type_ids=False, padding='max_length',
		return_attention_mask=True,
		return_tensors='pt')

		return encoded['input_ids'][0], encoded['attention_mask'][0]

In [8]:
transforms = transforms.Compose([
	CleanText(),
	TokenizeTransform()
])

batch_size = 8

train_dataset = FakeNewsDataset(train_data, transform=transforms)
val_dataset = FakeNewsDataset(val_data, transform=transforms)
test_dataset = FakeNewsDataset(test_data, transform=transforms, has_labels=False)

train_dataset_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataset_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
next(iter(train_dataset)) # loader output

((tensor([   0, 3138, 4410,  ...,    1,    1,    1]),
  tensor([1, 1, 1,  ..., 0, 0, 0])),
 1)

In [12]:
for i, batch in enumerate(train_dataset_loader):
	print(batch)
	break

[[tensor([[    0,   605, 40886,  ...,     1,     1,     1],
        [    0, 27659,   536,  ...,     1,     1,     1],
        [    0,  8396,   662,  ...,     1,     1,     1],
        ...,
        [    0, 32019,  2963,  ...,     1,     1,     1],
        [    0, 17341,    70,  ...,     1,     1,     1],
        [    0,  2137,     5,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])], tensor([0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 1, 0])]


In [9]:
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=2, output_attentions = False, output_hidden_states = False)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model = model.to(device)

In [27]:
model.__class__.__module__

'transformers.models.longformer.modeling_longformer'

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5, eps = 1e-8)

In [12]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [13]:
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

epochs = 3

for epoch in range(epochs):	
	print("Epoch {}".format(epoch + 1))
	print("Training")

	train_epoch_loss = 0
	model.train()

	for i, batch in enumerate(train_dataset_loader):
		b_inputs = batch[0][0].to(device)
		b_attn_masks = batch[0][1].to(device)
		b_labels = batch[1].to(device)

		optimizer.zero_grad()

		loss, logits = model(b_inputs, token_type_ids=None, attention_mask=b_attn_masks, labels=b_labels)
		train_epoch_loss += loss.item()

		loss.backward()
		optimizer.step()

	avg_epoch_loss = train_epoch_loss / len(train_dataset_loader)
	print("train loss {}".format(avg_epoch_loss))

	print("Validation")
	model.eval()

	val_epoch_loss = 0
	val_epoch_accuracy = 0

	for i, batch in enumerate(val_dataset_loader):
		b_inputs = batch[0][1].to(device)
		b_attn_masks = batch[0][1].to(device)
		b_labels = batch[1].to(device)

		with torch.no_grad():
			loss, logits = model(b_inputs, token_type_ids=None, attention_mask=b_attn_masks, labels=b_labels)
		
		val_epoch_loss += loss.item()

		logits = logits.detach().cpu().numpy()
		labels = b_labels.to("cpu").numpy()

		val_epoch_accuracy += flat_accuracy(logits, b_labels)

		avg_val_epoch_loss = val_epoch_loss / len(val_dataset_loader)
		avg_val_epoch_accuracy = val_epoch_accuracy / len(val_dataset_loader)

		print("validation loss {}".format(avg_epoch_loss))
		print("validation accuracy {}".format(avg_val_epoch_accuracy))



Epoch 1
Training


OutOfMemoryError: CUDA out of memory. Tried to allocate 772.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 8.01 GiB is allocated by PyTorch, and 2.00 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF