In [1]:
import transformers
from transformers import AutoModel, AutoTokenizer,AutoConfig, BertModel, BertTokenizer,BertConfig, AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


import random,os 
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import warnings
warnings.filterwarnings('ignore')

In [2]:
def seed_torch(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic kernels.

    Args:
        seed: Value convertible to ``int``; the same seed is used everywhere.
    """
    seed = int(seed)
    random.seed(seed)
    # Only affects child processes: the running interpreter reads
    # PYTHONHASHSEED once at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different (non-deterministic) kernels between runs,
    # so it has to be off for the flag above to be meaningful.
    torch.backends.cudnn.benchmark = False


RANDOM_SEED = 42
seed_torch(RANDOM_SEED)

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Load the competition CSVs (training data, test data, sample submission)
# from the Kaggle input directory and print their shapes as a sanity check.
train=pd.read_csv('../input/comp3027j-assignment2-bdic2022/train.csv')
test=pd.read_csv('../input/comp3027j-assignment2-bdic2022/test.csv')
sub=pd.read_csv('../input/comp3027j-assignment2-bdic2022/sample_submission.csv')
print("train.shape：{},test.shape：{},sub.shape：{}".format(train.shape,test.shape,sub.shape))

train.shape：(200000, 4),test.shape：(100000, 3),sub.shape：(100000, 2)


In [5]:
# Build one model input per row: "<title>[SEP] <body>", lower-cased.
# Missing titles/bodies are treated as empty strings; without this a NaN in
# either column turns the concatenation into NaN and lower-casing raises.
train['text'] = train['title'].fillna('') + '[SEP] ' + train['body'].fillna('')
# NOTE(review): lower-casing also rewrites the separator to "[sep]", which the
# tokenizer may no longer recognise as its special separator token -- confirm.
# The format is left unchanged so train and test inputs stay identical.
train['text'] = train['text'].str.lower()
# Features and target used for the train/validation split.
X, y = train['text'], train['category']

In [6]:
# Number of training rows per category id (most frequent first).
y.value_counts()

4     29404
26    26094
9     19605
2     15210
1     10364
0      9658
7      8140
8      7497
13     7103
14     5441
29     4993
12     4652
17     4489
27     4068
25     3767
30     3685
24     3404
11     2955
10     2771
22     2580
31     2422
16     2389
28     2369
3      2296
19     2244
5      2160
20     2053
21     2041
15     1759
6      1541
18     1460
23     1386
Name: category, dtype: int64

In [7]:
# Stratified 80/20 split of the labelled data. Despite the name, X_test/y_test
# serve as the validation set: val_data_loader is built from them, while the
# competition test set lives in the separate `test` frame.
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=0.2, random_state=RANDOM_SEED)

In [8]:
# Sizes of the training and validation splits.
X_train.shape,X_test.shape

((160000,), (40000,))

In [9]:
# Summary statistics of the per-row word count, where words are split on
# single spaces (this is not the tokenizer's token count).
train['text'].apply(lambda x:len(x.split(' '))).describe()

count    200000.000000
mean        241.759190
std         466.160979
min           3.000000
25%          28.000000
50%          43.000000
75%         302.000000
max       37465.000000
Name: text, dtype: float64

In [10]:
# Shape of the subset of rows whose text has fewer than 400 space-separated words.
train[train['text'].apply(lambda x:len(x.split(' ')))<400].shape

(160438, 5)

In [11]:
# Checkpoint name shared by the tokenizer and by every AutoModel.from_pretrained
# call in this notebook.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# Tokenizer matching the checkpoint above; reused for train, validation and test data.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [12]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one labelled text per item.

    Args:
        texts: Indexable, sized collection of texts; each item is passed
            through ``str`` before tokenization.
        labels: Indexable, sized collection of integer class ids aligned
            with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self,texts,labels,tokenizer,max_len):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts=texts
        self.labels=labels
        self.tokenizer=tokenizer
        self.max_len=max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self,item):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row."""
        text=str(self.texts[item])
        label=self.labels[item]

        encoding=self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit truncation: passing only `max_length` relied on an
            # implicit default and emitted a warning for every item.
            truncation=True,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # token_type_ids are not part of the returned item.
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch axis; flatten it away so the
        # DataLoader can stack items into batches.
        return {
            'input_ids':encoding['input_ids'].flatten(),
            'attention_mask':encoding['attention_mask'].flatten(),
            'labels':torch.tensor(label,dtype=torch.long)
        }

In [13]:
def create_data_loader(X,y,tokenizer,max_len,batch_size,shuffle=False,num_workers=0):
    """Wrap texts and labels in a ``NewsDataset`` and return a ``DataLoader``.

    Args:
        X: Object exposing ``.values`` that yields the texts (a pandas Series
            in this notebook).
        y: Object exposing ``.values`` that yields the class ids, aligned with ``X``.
        tokenizer: Tokenizer forwarded to ``NewsDataset``.
        max_len: Padded/truncated sequence length forwarded to ``NewsDataset``.
        batch_size: Number of items per batch.
        shuffle: Whether the loader reshuffles the data every epoch. Defaults
            to ``False``, which keeps the previous fixed-order behaviour.
        num_workers: Number of loader worker processes. Defaults to ``0``
            (load in the main process), as before.

    Returns:
        A ``DataLoader`` over the tokenized dataset.
    """
    # `.values` strips the pandas index so the dataset is indexed by position.
    ds=NewsDataset(
        texts=X.values,
        labels=y.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [14]:
# Items per batch, and the fixed length every text is padded/truncated to.
BATCH_SIZE = 4
MAX_LEN = 300
# NOTE(review): the training loader is built without shuffling, so batches are
# visited in the same order every epoch.
train_data_loader = create_data_loader(X_train,y_train, tokenizer, MAX_LEN, BATCH_SIZE)
# The held-out split (X_test / y_test) is used as the validation set.
val_data_loader = create_data_loader(X_test,y_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [15]:
# Pull a single batch to sanity-check the keys and tensors the loader yields.
next(iter(train_data_loader))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[  101,  1037,  4435,  ...,     0,     0,     0],
         [  101,  2057, 24335,  ..., 17167,  8244,   102],
         [  101,  4639,  2143,  ...,     0,     0,     0],
         [  101,  2053,  1011,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([27, 26,  7, 14])}

In [16]:
# Load the pretrained encoder for the checkpoint named above.
# NOTE(review): `bert_model` is not referenced again in the visible notebook
# (NewsClassifier loads its own copy), so this load looks redundant -- confirm
# before removing.
bert_model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
class NewsClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, n_classes):
        super().__init__()
        encoder = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        # The classifier consumes the encoder's pooled output, hence hidden_size inputs.
        self.out = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of encoded sequences."""
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        _, pooled_output = encoder_outputs
        regularised = self.drop(pooled_output)
        logits = self.out(regularised)
        return logits

In [18]:
# Number of passes over the training data.
EPOCHS = 2
# Size of the classifier's output layer; the category ids listed by
# y.value_counts() above run from 0 to 31.
N_CLASSES = 32

model = NewsClassifier(N_CLASSES)
model = model.to(device)
# Optimise every parameter. The previous `'fc1' not in name` filter matched no
# parameter of this model, and would silently leave layers untrained if the
# backbone were swapped for one that does contain an `fc1` module.
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW; it is kept here because the two differ in their defaults
# (e.g. `correct_bias`) -- confirm before switching.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# One scheduler step is taken per batch, so the schedule spans all batches of all epochs.
total_steps = len(train_data_loader) * EPOCHS
print(f'total_steps:{total_steps}')

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


total_steps:80000


In [19]:
def train_epoch(model, data_loader, loss_fn, optimizer,
                device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores of shape ``(batch, n_classes)``.
        data_loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor equal to
        correct predictions divided by ``n_examples``; ``mean_loss`` is the mean
        of the per-batch losses (``nan`` if the loader yields no batches).
    """
    model = model.train()
    losses = []
    # Tensor accumulator so that `.double()` below also works for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        # Clear gradients *before* the backward pass: gradients left behind by
        # an interrupted previous run would otherwise be added to this step.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

In [20]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: correct predictions divided by ``n_examples``
        as a float64 tensor, and the mean of the per-batch losses.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    # Inference only: gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            gold = batch["labels"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            # Predicted class = index of the largest score in each row.
            predicted = logits.max(dim=1).indices

            batch_losses.append(loss_fn(logits, gold).item())
            n_correct += (predicted == gold).sum()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [21]:
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
# Best validation accuracy seen so far; drives checkpointing below.
best_accuracy = 0
for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training split.
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(X_train),
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the held-out split after the epoch.
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=val_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(X_test),
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    for _metric_name, _metric_value in (
        ('train_acc', train_acc),
        ('train_loss', train_loss),
        ('val_acc', val_acc),
        ('val_loss', val_loss),
    ):
        history[_metric_name].append(_metric_value)

    # Keep only the weights of the epoch with the highest validation accuracy.
    improved = val_acc > best_accuracy
    if improved:
        torch.save(model.state_dict(), 'best_albert_model_state.bin')
        best_accuracy = val_acc

Epoch 1/2
----------
Train loss 0.941874593997981 accuracy 0.74041875
Val   loss 0.8077190481196507 accuracy 0.7769750000000001

Epoch 2/2
----------
Train loss 0.6253044415330307 accuracy 0.8375187500000001
Val   loss 0.9113312979148672 accuracy 0.788375



In [22]:
# Build the test inputs exactly like the training inputs:
# "<title>[SEP] <body>", lower-cased.
# Missing titles/bodies are treated as empty strings; without this a NaN in
# either column turns the concatenation into NaN and lower-casing raises.
test['text'] = test['title'].fillna('') + '[SEP] ' + test['body'].fillna('')
# NOTE(review): lower-casing also rewrites the separator to "[sep]", which the
# tokenizer may no longer recognise as its special separator token -- confirm.
# The format is left unchanged so it keeps matching the training inputs.
test['text'] = test['text'].str.lower()

In [23]:
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one unlabelled text per item.

    Args:
        texts: Indexable, sized collection of texts; each item is passed
            through ``str`` before tokenization.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self,texts,tokenizer,max_len):
        self.texts=texts
        self.tokenizer=tokenizer
        self.max_len=max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self,item):
        """Return ``input_ids`` and ``attention_mask`` for one row."""
        text=str(self.texts[item])

        encoding=self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit truncation: passing only `max_length` relied on an
            # implicit default and emitted a warning for every item.
            truncation=True,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # token_type_ids are not part of the returned item.
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch axis; flatten it away so the
        # DataLoader can stack items into batches.
        return {
            'input_ids':encoding['input_ids'].flatten(),
            'attention_mask':encoding['attention_mask'].flatten(),
        }

    
# Batches of tokenized test texts, in the original row order (no shuffling).
# `.values` hands the dataset a positional array, as create_data_loader does for
# the training data; indexing the Series itself is a label lookup, which breaks
# or mis-aligns rows as soon as the frame's index is not 0..n-1.
test_data_loader = DataLoader(
        TestDataset(test['text'].values, tokenizer, MAX_LEN),
        batch_size=BATCH_SIZE,
    )

In [24]:
# Pull a single test batch to sanity-check the keys and tensors (no labels here).
next(iter(test_data_loader))

{'input_ids': tensor([[  101, 21025,  2546,  ...,     0,     0,     0],
         [  101,  2322,  4109,  ...,     0,     0,     0],
         [  101,  2317,  2160,  ...,     0,     0,     0],
         [  101,  9815, 16976,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [25]:
# The training loop checkpoints the weights of the epoch with the best
# validation accuracy; restore them so inference does not silently use the
# last epoch's weights instead. Skipped when no checkpoint was written.
BEST_MODEL_PATH = 'best_albert_model_state.bin'
if os.path.exists(BEST_MODEL_PATH):
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

model = model.eval()

# Per-batch results, already moved to the CPU so device memory is not held
# for the whole test set.
predictions = []
prediction_probs = []


with torch.no_grad():
    for d in test_data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Predicted class = index of the largest score in each row.
        _, preds = torch.max(outputs, dim=1)

        probs = F.softmax(outputs, dim=1)

        predictions.append(preds.cpu())
        prediction_probs.append(probs.cpu())

# One concatenation per result instead of stacking one tensor per example:
# yields the same (n,) predictions and (n, n_classes) probabilities.
predictions = torch.cat(predictions)
prediction_probs = torch.cat(prediction_probs)


In [26]:
from tqdm import tqdm
def pred(model, data_loader=None):
    """Run ``model`` over a loader and return its predictions.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores of shape ``(batch, n_classes)``.
        data_loader: Iterable of dict batches with ``"input_ids"`` and
            ``"attention_mask"`` tensors. Defaults to the module-level
            ``test_data_loader``.

    Returns:
        ``(predictions, prediction_probs)`` as CPU tensors: the predicted class
        index per example, and the softmax probabilities per example.
        (Previously the results were computed and then discarded.)
    """
    if data_loader is None:
        data_loader = test_data_loader

    model = model.eval()

    predictions = []
    prediction_probs = []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Predicted class = index of the largest score in each row.
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            # Move each batch to the CPU right away and concatenate once at the end.
            predictions.append(preds.cpu())
            prediction_probs.append(probs.cpu())

    predictions = torch.cat(predictions)
    prediction_probs = torch.cat(prediction_probs)
    return predictions, prediction_probs


In [27]:
# `probs` is the module-level variable left over from the last batch of the
# inference loop above, so this shows (batch size, number of classes).
probs.cpu().numpy().shape

(4, 32)

In [28]:
# Assign predictions positionally. Wrapping them in a pd.Series would align on
# the index and silently produce NaN rows if `sub` were not indexed 0..n-1.
# NOTE(review): this assumes sample_submission.csv lists ids in the same order
# as test.csv -- confirm.
category_pred = predictions.numpy()
if len(category_pred) != len(sub):
    raise ValueError(
        f"expected {len(sub)} predictions for the submission, got {len(category_pred)}"
    )
sub['category'] = category_pred
sub[['id','category']].to_csv('submission.csv',index=False)