# Text Classification

### Import required libraries

In [33]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import re

### Read Dataset

In [34]:
with open('review_corpus_en.ndjson') as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)
df['text'] = df['title'] + ' ' + df['body']

### Pre-process Text

In [35]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['title'] = df['title'].apply(preprocess_text)

### Map Labels

In [36]:
label_mapping = {'pos': 2, 'neg': 0, 'mixed': 1}
df['rating'] = df['rating'].map(label_mapping)

### Train Test Split

In [37]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [38]:
train_df['rating'].unique()

array([2, 0, 1])

### BERT Tokenizer

In [39]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts, labels, max_len=128):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_data = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_data['input_ids'])
        attention_masks.append(encoded_data['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values)

    return input_ids, attention_masks, labels

train_inputs, train_masks, train_labels = tokenize_data(train_df['text'], train_df['rating'])
test_inputs, test_masks, test_labels = tokenize_data(test_df['text'], test_df['rating'])


### Create Dataset

In [40]:
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False
)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train Model

In [41]:
epochs = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    model.train()

    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch_inputs, batch_masks, batch_labels = tuple(t.to(device) for t in batch)

        model.zero_grad()

        outputs = model(
            batch_inputs,
            token_type_ids=None,
            attention_mask=batch_masks,
            labels=batch_labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Training loss: {avg_train_loss}')

model.save_pretrained('fine_tuned_bert')
tokenizer.save_pretrained('fine_tuned_bert')

Epoch 1/6
Training loss: 0.8760951910416285
Epoch 2/6
Training loss: 0.5403571060299873
Epoch 3/6
Training loss: 0.3672856810192267
Epoch 4/6
Training loss: 0.20229494579136373
Epoch 5/6
Training loss: 0.12741058899089694
Epoch 6/6
Training loss: 0.08605067290831357


('fine_tuned_bert/tokenizer_config.json',
 'fine_tuned_bert/special_tokens_map.json',
 'fine_tuned_bert/vocab.txt',
 'fine_tuned_bert/added_tokens.json')

### Evaluate Model

In [42]:
from sklearn.metrics import classification_report
import numpy as np

model.eval()

predictions, true_labels = [], []

for batch in test_dataloader:
    batch_inputs, batch_masks, batch_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(
            batch_inputs,
            token_type_ids=None,
            attention_mask=batch_masks
        )

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

flat_predictions = [item for sublist in predictions for item in sublist]
flat_true_labels = [item for sublist in true_labels for item in sublist]

flat_predictions = np.argmax(flat_predictions, axis=1)

print(classification_report(flat_true_labels, flat_predictions, target_names=['neg', 'mixed', 'pos']))

              precision    recall  f1-score   support

         neg       0.76      0.73      0.74       197
       mixed       0.61      0.64      0.63       186
         pos       0.86      0.85      0.85       217

    accuracy                           0.75       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.75      0.75      0.75       600



### Model Loss

In [44]:
outputs = model(
            batch_inputs,
            token_type_ids=None,
            attention_mask=batch_masks,
            labels=batch_labels
        )

loss = outputs.loss
print(loss)

tensor(1.5041, device='cuda:0', grad_fn=<NllLossBackward0>)
