In [1]:
import numpy as np
import torch
import torch.nn as nn

In [2]:
import json
import pandas as pd

# Load the training reviews; lines=True reads one JSON record per line.
TRAIN_PATH = './train.json/train.json'
df = pd.read_json(TRAIN_PATH, lines=True)

In [3]:
# Number of rows whose summary is missing.
df['summary'].isna().sum()

35

In [4]:
# Keep only rows that have a summary. The explicit .copy() makes `df` an
# independent frame, so the column assignment below does not write into a
# slice of the loaded frame (which raises SettingWithCopyWarning).
df = df[df['summary'].notna()].copy()
# Prefix every summary with its category so both end up in the model input text.
df['summary'] = df['category'] + ', ' + df['summary']

In [6]:
# Model inputs: the category-prefixed summary strings.
summary = df['summary'].to_numpy()
# Targets: the 'overall' rating shifted down by one.
rating = df['overall'].to_numpy() - 1

In [7]:
from transformers import BertTokenizer
# Tokenizer matching the 'bert-base-uncased' checkpoint; input is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [8]:
# Longest tokenized summary, special tokens included (0 when there are no summaries).
token_counts = [
    len(tokenizer.encode(text, add_special_tokens=True))
    for text in summary
]
max_len = max(token_counts, default=0)

print('Max sentence length: ', max_len)

Max sentence length:  79


In [None]:
# Tokenize every training summary into fixed-length id / attention-mask tensors.
input_ids = []
attention_masks = []

for sent in summary:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = max_len,
                        # `pad_to_max_length=True` is deprecated; this is the
                        # replacement that pads every sequence to `max_length`.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Each encoded item has a leading batch dimension of 1; stack them along it.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Float targets for the MSE objective. as_tensor accepts both the original
# array and an already-converted tensor, so the cell can be re-run safely.
rating = torch.as_tensor(rating, dtype=torch.float)

In [11]:
from torch.utils.data import TensorDataset, random_split

# Bundle inputs, masks and targets, then hold out 10% of the rows for validation.
dataset = TensorDataset(input_ids, attention_masks, rating)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator makes the split identical on every run
# without touching the global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

In [12]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in dataset order.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained BERT with a single-output classification head (num_labels=1);
# attention maps and hidden states are not returned.
bert_options = dict(
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **bert_options)
model.to(device)
print('')

In [15]:
# AdamW over every model parameter.
learning_rate = 2e-5
adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

In [16]:
# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

# optimizer_grouped_parameters = [
#     {
#         'params':
#         [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#         'weight_decay':
#         0.01
#     },
#     {
#         'params':
#         [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#         'weight_decay':
#         0.0
#     }
# ]
# optimizer = AdamW(optimizer_grouped_parameters,
#                      lr=2e-05,
#                      eps = 1e-8)

In [17]:
from transformers import get_linear_schedule_with_warmup

epochs = 2
# Total optimizer updates: one per training batch, for every epoch.
total_steps = epochs * len(train_dataloader)
# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [18]:
import time
import datetime

def format_time(elapsed):
    """Format a duration in seconds as the string form of a timedelta.

    The value is rounded to a whole number of seconds first, so the
    result looks like ``'0:01:05'`` (or ``'1 day, 0:00:00'`` for long spans).
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [19]:
# Release cached GPU memory that PyTorch is holding but not using.
torch.cuda.empty_cache()
# Mean-squared-error criterion.
# NOTE(review): not called by any active code in the visible cells — confirm it is still needed.
loss_function = nn.MSELoss()

In [None]:
import numpy as np

# Fine-tune for `epochs` passes over the training set, reporting the average
# training loss and the average validation loss after each pass.
total_t0 = time.time()

for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # Take the loss by position: this works whether the model returns a
        # plain tuple or a ModelOutput. Tuple-unpacking a ModelOutput would
        # yield its keys instead of its values.
        loss = outputs[0]

        total_train_loss += loss.item()
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    total_eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        total_eval_loss += outputs[0].item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [43]:
# Load the test reviews (one JSON record per line).
df1 = pd.read_json('./test.json/test.json', lines=True)
# Fall back to the full review text where the summary is missing. Assigning
# the result back replaces the chained `inplace=True` call, which acts on a
# column selection and is not guaranteed to update df1 under Copy-on-Write.
df1['summary'] = df1['summary'].fillna(df1['reviewText'])

In [44]:
# Row count of the test frame.
df1.shape[0]

10000

In [45]:
# Pairs file as a frame.
# NOTE(review): df2 is not referenced again in the visible cells — confirm it is needed.
PAIRS_PATH = './rating_pairs.csv'
df2 = pd.read_csv(PAIRS_PATH)

In [46]:
# Prefix each test summary with its category, mirroring the training text format.
category_prefix = df1['category'].add(', ')
df1['summary'] = category_prefix.add(df1['summary'])

In [47]:
# Show the category-prefixed test summaries.
df1.loc[:, 'summary']

0       Pop, Fantastic mix of "old school" with a crea...
1            Pop, Digitally Extracted Stereo (DES) Rules!
2                          Pop, Excellent unplugged album
3                                Pop, another masterpiece
4                     Alternative Rock, True Classic Rock
                              ...                        
9995    Alternative Rock, 3 1/2 stars for a really goo...
9996                        Alternative Rock, Three Stars
9997                       Jazz, Glad to have another gem
9998                        Alternative Rock, Three Stars
9999     Pop, Kelly did a very good job in making that CD
Name: summary, Length: 10000, dtype: object

In [None]:
# Tokenize the test summaries exactly like the training ones. `max_len` comes
# from the training set; longer test inputs are truncated to it.
summary_test = df1['summary'].values

input_ids = []
attention_masks = []
for sent in summary_test:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = max_len,
                        # `pad_to_max_length=True` is deprecated; this is the
                        # replacement that pads every sequence to `max_length`.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Each encoded item has a leading batch dimension of 1; stack them along it.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [37]:
# Persist the current model weights to disk.
model_state = model.state_dict()
torch.save(model_state, "weight_1_best.pth")

In [49]:
# Smoke test: predict the score of the first test example.
model.eval()
# Inference needs no gradients; no_grad avoids building an autograd graph.
with torch.no_grad():
    a = model(
        input_ids[0].to(device).unsqueeze(0),  # add the batch dimension
        token_type_ids=None,
        attention_mask=attention_masks[0].to(device).unsqueeze(0),
    )[0]
a.cpu().numpy()[0][0]

3.9563386

In [50]:
# Predict a score for every test example. Examples are fed in mini-batches
# with autograd disabled, instead of one forward pass per example with a
# gradient graph attached to each output.
model.eval()
test_label = []
with torch.no_grad():
    for start in range(0, len(input_ids), batch_size):
        batch_ids = input_ids[start:start + batch_size].to(device)
        batch_mask = attention_masks[start:start + batch_size].to(device)
        logits = model(batch_ids, token_type_ids=None, attention_mask=batch_mask)[0]
        # The head emits one value per example; collect column 0.
        test_label.extend(logits.cpu().numpy()[:, 0])

In [51]:
# Flatten the predictions and undo the -1 shift applied to the training targets.
flat_predictions = np.asarray(test_label).ravel()
test_label = flat_predictions + 1

In [53]:
# Clamp every prediction into the range [1, 5], updating the array in place.
test_label[:] = np.clip(test_label, 1, 5)

In [54]:
# Sanity check: smallest predicted value in test_label.
min(test_label)

1.0

In [59]:
# Copy rating_pairs.csv to rating_predictions.csv, appending one predicted
# rating to each non-header line. `with` closes both files, so every line is
# flushed to disk even if an error interrupts the loop.
# NOTE(review): assumes test_label is in the same order as the pair lines — confirm.
index = 0
with open('rating_pairs.csv') as pairs, open('rating_predictions.csv', 'w') as predictions:
    for l in pairs:
        if l.startswith('userID'):
            # header line is copied through unchanged
            predictions.write(l)
            continue
        u, p = l.strip().split('-')
        predictions.write(u + '-' + p + ',' + str(test_label[index]) + '\n')
        index += 1

In [60]:
# Number of prediction rows written (the counter is incremented once per non-header line).
index

10000