<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/news_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification

## Imports

In [1]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install datasets

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import datasets

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List
import string

import seaborn
# Global plot styling: seaborn's theme defaults with the 'summer' palette.
seaborn.set(palette='summer')

In [3]:
# `word_tokenize` needs the Punkt tokenizer data. Recent NLTK releases look it
# up under the name 'punkt_tab', older ones under 'punkt', so fetch both.
# 'punkt' goes last so the cell still displays that download's result.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

## Data preparation

In [4]:
# Load the AG News dataset; it provides a 'train' and a 'test' split.
dataset = load_dataset('ag_news')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

> Create a dictionary

> Create class WordDataset

> Split the data and load it into DataLoaders

### Create a dict of words

In [5]:
# Count how often each token occurs in the training texts. Every text is
# lower-cased and stripped of ASCII punctuation before it is tokenised.
_punctuation_table = str.maketrans('', '', string.punctuation)
words = Counter()
for text in tqdm(dataset['train']['text']):
  normalized = text.lower().translate(_punctuation_table)
  words.update(word_tokenize(normalized))

  0%|          | 0/120000 [00:00<?, ?it/s]

In [6]:
# Special tokens: unknown word, sequence start, sequence end and padding.
# The order matters: WordDataset unpacks the ids in exactly this order.
tags = ['<unk>', '<bos>', '<eos>', '<pad>']
vocab = set(tags)
# A word enters the vocabulary only if it occurs more than this many times.
counter_threshold = 25

In [7]:
# Keep only the words that occur more than `counter_threshold` times.
vocab.update(word for word, count in words.items() if count > counter_threshold)

In [9]:
# Report how many entries (special tokens included) the vocabulary holds.
print(f"Vocab size: {len(vocab)}")

Vocab size: 11842


In [10]:
# Enumerate the vocabulary in sorted order. A set of strings has no stable
# iteration order across interpreter sessions (string hashing is randomised),
# so enumerating the set directly would assign different ids on every run.
word2idx = {word: i for i, word in enumerate(sorted(vocab))}
# Inverse lookup: token id -> word.
idx2word = {i: word for word, i in word2idx.items()}

### Create WordDataset class

In [15]:
class WordDataset:
  """Map-style dataset that turns raw text examples into lists of token ids.

  Every element of ``sent`` must be indexable by ``'text'`` and ``'label'``.
  The text is lower-cased, stripped of ASCII punctuation, tokenised with
  ``word_tokenize`` and mapped through the module-level ``word2idx``. Words
  missing from the vocabulary get the ``<unk>`` id, and the sequence is
  wrapped in the begin-/end-of-sequence ids.
  """

  # Built once for the class instead of on every item access.
  _punctuation_table = str.maketrans('', '', string.punctuation)

  def __init__(self, sent):
    """Store the underlying examples and look up the special-token ids."""
    self.data = sent
    # `tags` lists the special tokens in the order unk, bos, eos, pad.
    self.unk_id, self.bos_id, self.eos_id, self.pad_id = [word2idx[tag] for tag in tags]

  def __getitem__(self, idx: int) -> dict:
    """Return ``{'text': [token ids], 'label': label}`` for example ``idx``."""
    example = self.data[idx]
    processed_txt = example['text'].lower().translate(self._punctuation_table)

    tok_sent = [self.bos_id]
    tok_sent += [word2idx.get(word, self.unk_id) for word in word_tokenize(processed_txt)]
    tok_sent += [self.eos_id]

    return {
        'text': tok_sent,
        'label': example['label']
    }

  def __len__(self) -> int:
    """Number of examples in the underlying data."""
    return len(self.data)


def collate_fn_with_padding(input_batch: List[dict], pad_id=None, max_len=256) -> dict:
  """Pad or truncate a batch of tokenised samples to one common length.

  Args:
    input_batch: samples shaped like the output of ``WordDataset.__getitem__``:
      dicts with a ``'text'`` list of token ids and an integer ``'label'``.
    pad_id: id appended to sequences shorter than the batch length. ``None``
      (the default) resolves to ``word2idx["<pad>"]`` at call time, so the
      function always follows the current vocabulary.
    max_len: upper bound on the padded sequence length; longer sequences
      are truncated.

  Returns:
    Dict with ``'input_idx'``, an int64 tensor of shape
    ``(batch, min(longest sequence, max_len))``, and ``'label'``, an int64
    tensor of shape ``(batch,)``. Both are created on the global ``device``.
  """
  if pad_id is None:
    pad_id = word2idx["<pad>"]

  seq_lens = [len(x['text']) for x in input_batch]
  max_seq_len = min(max(seq_lens), max_len)

  padded = []
  for sample in input_batch:
    # Slicing copies the list, so the caller's sample is left untouched.
    tokens = sample['text'][:max_seq_len]
    tokens = tokens + [pad_id] * (max_seq_len - len(tokens))
    padded.append(tokens)

  seqs = torch.tensor(padded, dtype=torch.long, device=device)
  labels = torch.tensor([x['label'] for x in input_batch], dtype=torch.long, device=device)

  return {'input_idx': seqs,
          'label': labels}

### Split and load data

In [None]:
train_dataset = WordDataset(dataset['train'])

# Evaluate on a fixed random subset of the test split. `replace=False` keeps
# the 5000 indices distinct; the default samples with replacement and would
# put duplicate examples into the evaluation set.
np.random.seed(42)
idx = np.random.choice(np.arange(len(dataset['test'])), 5000, replace=False)
eval_dataset = WordDataset(dataset['test'].select(idx))

batch_size = 32
# Training batches are reshuffled on every pass; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=collate_fn_with_padding, batch_size=batch_size)

eval_dataloader = DataLoader(eval_dataset, shuffle=False, collate_fn=collate_fn_with_padding, batch_size=batch_size)