In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Shell out to `unzip` to extract the dataset archive.
# NOTE(review): no -d target is given, so files land in the current working directory — presumably /content; confirm.
!unzip /content/medium-articles-popularity.zip

In [None]:
# Install the `transformers` package into the runtime (imported below as `ppb`).
# NOTE(review): the version is not pinned, so a later re-run may pick up a different release.
!pip install transformers

In [3]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

In [4]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) with the same value.
random.seed(42)
np.random.seed(42)
torch.random.manual_seed(42)
torch.cuda.random.manual_seed(42)
torch.cuda.random.manual_seed_all(42)

In [None]:
# Choose the DistilBERT model/tokenizer classes and the checkpoint name, then
# instantiate both from that pretrained checkpoint.
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [7]:
# Read the train and test CSVs.
data=pd.read_csv('/content/articles_train.csv')
data_test=pd.read_csv('/content/articles_test.csv')
# Shuffle the training rows and rebuild a 0..n-1 index; ArticlesDataset looks rows up with data.loc[i, ...].
# NOTE(review): sample() gets no random_state; presumably it relies on the NumPy seed set earlier — confirm.
data = data.sample(frac=1).reset_index(drop=True)

In [8]:
# Mean of the 'claps' column in the training frame (this column is later used as the regression label).
data['claps'].mean()

1449.1184771033013

In [None]:
# Show the first three rows of the (shuffled) training frame.
data.head(3)

In [None]:
# Show the first three rows of the test frame.
data_test.head(3)

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
def make_data(data,b=1):
  """Standardise ``reading_time`` and drop columns that are not model inputs.

  The frame is modified in place and also returned, so both
  ``data = make_data(data)`` and a bare ``make_data(data)`` work.

  Args:
    data: DataFrame with at least ``reading_time`` and ``link`` columns
      (and ``id`` when ``b`` is truthy).
    b: Truthy for the training frame: the module-level ``scaler`` is fitted
      on this frame and the ``id`` column is dropped. Falsy for any other
      frame: the already-fitted ``scaler`` is reused and ``id`` is kept.

  Returns:
    The same DataFrame object, with ``reading_time`` replaced by its
    standardised values and ``link`` removed.
  """
  # Multi-dimensional indexing on a Series (``series[:, None]``) is rejected by
  # current pandas, so the (n, 1) feature matrix is built from the NumPy array.
  reading_time = data['reading_time'].to_numpy().reshape(-1, 1)
  if b:
    scaler.fit(reading_time)
    data.drop(['id'],axis=1,inplace=True)
  # transform() returns shape (n, 1); flatten it back to one value per row.
  data['reading_time']=scaler.transform(reading_time).ravel()
  data.drop(['link'],axis=1,inplace=True)
  return data

In [None]:
# Training frame: default b=1, so the scaler is fitted here and 'id' is dropped.
data=make_data(data)
# Test frame: b=0 reuses the scaler fitted above and keeps the 'id' column.
data_test=make_data(data_test,0)

In [None]:
# Display the 'text' field of the row labelled 0 in the training frame.
data.loc[0,'text']

In [None]:
from torch.utils.data import Dataset, random_split

class ArticlesDataset(Dataset):
    """Map-style dataset of tokenised articles.

    Each item is a dict with:
      * ``"tokenized"``: token ids of author + title + text. The first id of
        the title and text encodings is dropped (``[1:]``) before
        concatenation, and the text encoding is truncated to 250 ids.
      * ``"read_time"``: the row's ``reading_time`` value.
      * ``"label"``: the matching entry of ``labels``.

    Rows are addressed by position, so the frame's index does not have to be
    0..n-1 (label-based lookups failed on any other index).

    Args:
        data: DataFrame with ``author``, ``title``, ``text`` and
            ``reading_time`` columns.
        tokenizer: Object exposing ``encode(text, max_length=..., truncation=...)``
            and returning a list of token ids.
        labels: One value per row of ``data`` (Series, array or list).
    """
    def __init__(self, data,tokenizer, labels):
        # Plain arrays give positional indexing regardless of the pandas index.
        self.labels = np.asarray(labels)
        self.read_time = np.asarray(data['reading_time'])
        self.tokenized = [
            tokenizer.encode(author)
            + tokenizer.encode(title)[1:]
            + tokenizer.encode(text, max_length=250, truncation=True)[1:]
            for author, title, text in zip(data['author'], data['title'], data['text'])
        ]

    def __getitem__(self, idx):
        return {"tokenized": self.tokenized[idx],"read_time":self.read_time[idx], "label": self.labels[idx]}

    def __len__(self):
        return len(self.labels)

# Tokenise the whole training frame once, using 'claps' as the label.
dataset = ArticlesDataset(data,tokenizer, data['claps'])
# 80/20 train/validation split; the second length is the remainder so the two always sum to len(dataset).
train_data, valid_data = random_split(dataset, [int(0.8*len(dataset)), len(dataset)-int(0.8*len(dataset))])
# The test set carries the article 'id' in the label slot instead of a clap count.
test_data = ArticlesDataset(data_test,tokenizer, data_test['id'])
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

In [15]:
from torch.utils.data import Sampler

class ArticlesSampler(Sampler):
    """Batch sampler that groups examples of similar token length.

    Examples are ordered by the length of their token list (shortest first)
    and cut into consecutive batches, so each batch needs little padding.
    The yielded indices are positions within ``subset``.

    Args:
        subset: With ``b`` truthy, an object exposing ``indices`` and
            ``dataset.tokenized`` (as produced by ``random_split``).
            With ``b`` falsy, an object exposing ``tokenized`` directly.
        b: Selects which of the two layouts above ``subset`` has.
        batch_size: Maximum number of indices per yielded batch.
    """
    def __init__(self, subset,b=1, batch_size=8):
        self.batch_size = batch_size
        self.subset = subset
        if b:
            self.indices = subset.indices
            all_tokens = subset.dataset.tokenized
            # Plain list selection: np.array() on ragged token lists raises
            # on recent NumPy releases.
            self.tokenized = [all_tokens[i] for i in self.indices]
        else:
            self.tokenized = list(subset.tokenized)
        # Only the lengths are needed to order the examples.
        self.lengths = [len(tokens) for tokens in self.tokenized]

    def __iter__(self):
        order = np.argsort(self.lengths).tolist()
        for start in range(0, len(order), self.batch_size):
            yield order[start:start + self.batch_size]

    def __len__(self):
        # Number of batches produced by __iter__ (the last one may be short).
        return (len(self.lengths) + self.batch_size - 1) // self.batch_size

In [None]:
from torch.utils.data import DataLoader

def get_padded(values):
    """Right-pad every sequence in ``values`` with zeros to a common length.

    Args:
        values: List of lists of token ids.

    Returns:
        NumPy array with one row per input sequence, each as long as the
        longest sequence in ``values``.
    """
    width = max(map(len, values), default=0)

    rows = []
    for seq in values:
        rows.append(seq + [0] * (width - len(seq)))

    return np.array(rows)

def collate_fn(batch):
    """Merge a list of dataset items into one dict of tensors.

    Args:
        batch: List of dicts with ``'tokenized'``, ``'read_time'`` and
            ``'label'`` entries.

    Returns:
        Dict with ``'text'`` (zero-padded token ids), ``'labels'``,
        ``'read_time'`` and ``'attention_mask'`` (1 where the padded id is
        non-zero, 0 elsewhere).
    """
    token_ids = get_padded([sample['tokenized'] for sample in batch])
    reading_times = [sample['read_time'] for sample in batch]
    targets = [sample['label'] for sample in batch]

    # Padding uses id 0, so every non-zero position is a real token.
    mask = np.where(token_ids != 0, 1, 0)

    return {
        'text': torch.tensor(token_ids),
        'labels': torch.FloatTensor(targets),
        'read_time': torch.tensor(reading_times),
        'attention_mask': torch.tensor(mask),
    }

# Every loader batches through ArticlesSampler and merges items with collate_fn.
train_loader = DataLoader(train_data, batch_sampler=ArticlesSampler(train_data), collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_sampler=ArticlesSampler(valid_data), collate_fn=collate_fn)
# b=0: test_data is a plain ArticlesDataset (no .indices/.dataset), unlike the random_split results above.
test_loader = DataLoader(test_data, batch_sampler=ArticlesSampler(test_data,0), collate_fn=collate_fn)

In [17]:
from torch import nn

class BertRegressor(nn.Module):
    """Linear regression head on top of a pretrained encoder.

    The hidden state at sequence position 0 is passed through ReLU and
    dropout, concatenated with the scalar reading-time feature, projected to
    a single value and multiplied by ``output_scale``.

    Args:
        pretrained_model: Encoder module. Calling it as
            ``pretrained_model(text, attention_mask=...)`` must return a
            sequence whose first element is indexable as ``[:, 0, :]``.
        dropout: Dropout probability applied to the encoder feature.
        hidden_size: Width of the encoder's hidden state (768 matches the
            original hard-coded ``769 = 768 + 1`` input width).
        output_scale: Constant the linear output is multiplied by.
            NOTE(review): 1550 is presumably chosen to bring outputs to the
            scale of the clap counts — confirm.
    """
    def __init__(self, pretrained_model, dropout=0.1, hidden_size=768, output_scale=1550):
        super().__init__()

        self.bert = pretrained_model
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()
        self.output_scale = output_scale
        # +1 input for the reading-time feature concatenated in forward().
        self.fc1=nn.Linear(hidden_size + 1,1)

    def forward(self, text, attention_mask,time):
        """Return predictions of shape (batch, 1).

        Args:
            text: Token-id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.
            time: 1-D tensor with one reading-time value per example.
        """
        # Use the encoder owned by this module, not the notebook-global `model`.
        first_token_state = self.bert(text, attention_mask=attention_mask)[0][:, 0, :]
        x = self.dropout(self.relu(first_token_state))
        # Reading time becomes column 0 of the feature matrix.
        x = torch.cat((time[:, None], x), dim=1)
        return self.fc1(x) * self.output_scale

In [None]:
 # Ask PyTorch to release cached GPU memory that is currently unused.
 torch.cuda.empty_cache()

In [18]:
import torch.optim as optim
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Wrap the pretrained encoder in the regression head; dtype=float casts the weights to float64.
bert_clf = BertRegressor(model).to(device,dtype=float)

# Adam over every parameter of bert_clf.
optimizer = optim.Adam(bert_clf.parameters(), lr=1e-5)
# Multiply the learning rate by 0.1 every 20 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=20, gamma=0.1)
criterion = nn.MSELoss()

In [19]:
def warming(model, iterator,num, epochs=30, lr=1e-3):
  """Run a warm-up training phase with its own Adam optimizer and MSE loss.

  Args:
    model: Module called as ``model(text, attention_mask, read_time)``.
    iterator: Iterable of batch dicts with ``'text'``, ``'attention_mask'``,
      ``'read_time'`` and ``'labels'`` tensors; it is iterated once per epoch.
    num: Unused; kept so existing calls keep working.
      NOTE(review): the original body never read this argument — confirm
      what it was meant to control before wiring it in.
    epochs: Number of passes over ``iterator`` (previously hard-coded to 30).
    lr: Learning rate of the warm-up optimizer (previously hard-coded to 1e-3).

  Returns:
    None. ``model`` is updated in place and left in training mode.
  """
  # Follow the model's own device instead of a notebook-global variable.
  first_param = next(model.parameters(), None)
  target = first_param.device if first_param is not None else torch.device('cpu')

  optimizer = optim.Adam(model.parameters(), lr=lr)
  criterion = nn.MSELoss()
  model.train()
  for _ in range(epochs):
    for batch in iterator:
      # Move tensors into locals so the caller's batch dict is left untouched.
      text = batch['text'].to(target)
      attention_mask = batch['attention_mask'].to(target)
      read_time = batch['read_time'].to(target)
      # Labels become a float64 column matching the (batch, 1) model output.
      labels = batch['labels'].to(target, dtype=float)[:, None]

      output = model(text, attention_mask, read_time)

      optimizer.zero_grad()
      loss = criterion(output, labels)
      loss.backward()
      optimizer.step()

In [None]:
# Run the warm-up phase on the training loader.
# NOTE(review): the third argument (15) is accepted by warming() but never read inside it.
warming(bert_clf,train_loader,15)

In [20]:
def train(model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Module called as ``model(text, attention_mask, read_time)``.
        iterator: Iterable of batch dicts with ``'text'``,
            ``'attention_mask'``, ``'read_time'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable taking ``(output, labels)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        train_history: Unused; kept for backward compatibility.
        valid_history: Unused; kept for backward compatibility.

    Returns:
        Mean of the per-batch losses as a float, or NaN when ``iterator``
        yields no batches (the original raised on the unbound loop index).
    """
    model.train()

    # Follow the model's own device instead of a notebook-global variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    n_batches = 0
    for batch in iterator:
        optimizer.zero_grad()

        # Move tensors into locals so the caller's batch dict is left untouched.
        text = batch['text'].to(target)
        attention_mask = batch['attention_mask'].to(target)
        read_time = batch['read_time'].to(target)
        # Labels become a float64 column matching the (batch, 1) model output.
        labels = batch['labels'].to(target, dtype=float)[:, None]

        output = model(text, attention_mask, read_time)
        loss = criterion(output, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches

def evaluate(model, iterator, criterion,mse_loss=nn.MSELoss()):
    """Compute the mean per-batch loss of ``model`` over ``iterator`` without gradients.

    Args:
        model: Module called as ``model(text, attention_mask, read_time)``.
        iterator: Iterable of batch dicts with ``'text'``,
            ``'attention_mask'``, ``'read_time'`` and ``'labels'`` tensors.
        criterion: Loss callable taking ``(output, labels)``.
        mse_loss: Unused; kept for backward compatibility. The original
            computed a second MSE with it on every batch and discarded it.

    Returns:
        Mean of the per-batch losses as a float, or NaN when ``iterator``
        yields no batches (the original raised on the unbound loop index).
    """
    model.eval()

    # Follow the model's own device instead of a notebook-global variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        for batch in iterator:
            # Move tensors into locals so the caller's batch dict is left untouched.
            text = batch['text'].to(target)
            attention_mask = batch['attention_mask'].to(target)
            read_time = batch['read_time'].to(target)
            # Labels become a float64 column matching the (batch, 1) model output.
            labels = batch['labels'].to(target, dtype=float)[:, None]

            output = model(text, attention_mask, read_time)
            total_loss += criterion(output, labels).item()
            n_batches += 1

    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - minutes * 60)
    return minutes, seconds

In [21]:
# Make NumPy print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [23]:
# Per-epoch mean losses, appended at the end of every epoch below.
train_history = []
valid_history = []

N_EPOCHS = 50
# Maximum gradient norm handed to train().
CLIP = 2

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(bert_clf, train_loader, optimizer, criterion, CLIP, train_history, valid_history)
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()
    valid_loss = evaluate(bert_clf, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(bert_clf.state_dict(), 'best-val-model.pt')
    
    train_history.append(train_loss)
    valid_history.append(valid_loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')
In [None]:
# Rebuild the regressor and restore a saved checkpoint.
# NOTE(review): best_model wraps the same `model` object as bert_clf, so loading these weights also
# overwrites the encoder weights seen through bert_clf.
# NOTE(review): the training loop saves to 'best-val-model.pt'; this path points at a different file — confirm.
best_model = BertRegressor(model).to(device,dtype=float)
# map_location remaps tensors saved on another device onto `device`, so a checkpoint written
# on a GPU runtime also loads on a CPU-only one.
best_model.load_state_dict(torch.load('/content/drive/MyDrive/best-val-model_0.pt', map_location=device))