# Text Classification on Ag News

In [1]:
import numpy as np
import pandas as pd
import string
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import torch
from sentence_transformers import SentenceTransformer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
#!python -m spacy download en_core_web_md

import gensim

import warnings
warnings.filterwarnings('ignore')



In [15]:
# Read in data
# Data from https://www.kaggle.com/amananandrai/ag-news-classification-dataset
train_df = pd.read_csv('./data/agnews/train.csv')

# Combine title and description of article to use as input documents for model
train_df['full_text'] = train_df.apply(lambda x: ' '.join([x['Title'],x['Description']]),axis=1)

test_df = pd.read_csv('./data/agnews/test.csv')
test_df['full_text'] = test_df.apply(lambda x: ' '.join([x['Title'],x['Description']]),axis=1)

ag_news_label = {1: "World",
                 2: "Sports",
                 3: "Business",
                 4: "Sci/Tec"}

In [16]:
# View a couple of the documents
for i in range(5):
    print(train_df.iloc[i]['full_text'])
    print()

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.

Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.

Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.

Oil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world

In [18]:
# Define tokenizer using spacy tokenizer.  Remove stopwords & punctuation and lemmatize
#nlp = spacy.load('en_core_web_md')
parser = English()
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

def spacy_tokenizer(sentence,parser,punctuations,stopwords):
    #mytokens = nlp(sentence)
    mytokens = parser(sentence)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [19]:
# Tokenize training set text using spacy tokenizer
tqdm.pandas()
train_df['processed_text'] = train_df['full_text'].progress_apply(lambda x: spacy_tokenizer(x,parser,punctuations,stopwords))

# Tokenize test set text using spacy tokenizer
tqdm.pandas()
test_df['processed_text'] = test_df['full_text'].progress_apply(lambda x: spacy_tokenizer(x,parser,punctuations,stopwords))

100%|██████████| 120000/120000 [01:07<00:00, 1768.50it/s]
100%|██████████| 7600/7600 [00:04<00:00, 1552.99it/s]


## Modeling using TFIDF features
In the example below we build a model using TFIDF to generate our features, with no embeddings.  We will use a simple softmax regression as our classification model.

In [99]:
# Create features using Tfidf
tfidf=TfidfVectorizer()
X_train=tfidf.fit_transform(train_df['processed_text'])
X_test=tfidf.transform(test_df['processed_text'])

In [100]:
# Train a classification model using logistic regression classifier
y_train = train_df['Class Index']
logreg_model = LogisticRegression(solver='saga')
logreg_model.fit(X_train,y_train)
preds = logreg_model.predict(X_train)
acc = sum(preds==y_train)/len(y_train)
print('Accuracy on the training set is {:.3f}'.format(acc))

Accuracy on the training set is 0.944


In [101]:
# Evaluate accuracy on the test set
y_test = test_df['Class Index']
test_preds = logreg_model.predict(X_test)
test_acc = sum(test_preds==y_test)/len(y_test)
print('Accuracy on the test set is {:.3f}'.format(test_acc))

Accuracy on the test set is 0.917


## Model using Doc2Vec embeddings
The Doc2Vec algorithm was introduced in 2014 by Le and Mikolov to overcome the issues associated with simple averaging of Word2Vec vectors to form a representation of a document as an average of the words in the document.  

The key innovation was the addition of another floating vector which contributes to all training predictions and is updated like other word vectors but represents the document rather than an individual word. Gensim’s Doc2Vec class implements this algorithm.

See https://arxiv.org/pdf/1405.4053v2.pdf

In [100]:
# Function to generate gensim tokens and tags from a text corpus
import smart_open
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

# Function to generate gensim tokens and tags from a list of text docs 
def read_corpus_from_list(lst, tokens_only=False):
        for i, line in enumerate(lst):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus_from_list(train_df['processed_text'].tolist()))

Now that we have our corpus, we first need to build the vocabulary.  Essentially, the vocabulary is a list (accessible via `model.wv.index_to_key`) of all of the unique words extracted from the training corpus. Additional attributes for each word are available using the `model.wv.get_vecattr()` method.

After our vocabulary is build, we train our embedding model using the training corpus.

In [111]:
# Build vocabulary and train embedding model on the training corpus
# dbow_words=0 uses pre-trained word embeddings and only trains the document embeddings
doc2vec_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
doc2vec_model.build_vocab(train_corpus)
doc2vec_model.train(train_corpus, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

Now that our embedding model is trained, we can use it to get the embeddings for the training and test sets using `model.infer_vector()` for each document in the set.  We can then use the vectors as representations of the documents to do things such as evaluate similarity using cosine similarity.

In [112]:
# Use the embedding model to get the embedding vectors for the training set
X_train_doc2vec = list(read_corpus_from_list(train_df['processed_text'].tolist(),tokens_only=True))
X_train_doc2vec = [doc2vec_model.infer_vector(toks) for toks in X_train_doc2vec]

Finally, we will used our embeddings as features to train a softmax regression model to classify the documents.

In [113]:
# Train a classification model using logistic regression classifier
y_train = train_df['Class Index']
logreg_model = LogisticRegression(solver='saga')
logreg_model.fit(X_train_doc2vec,y_train)
preds = logreg_model.predict(X_train_doc2vec)
acc = sum(preds==y_train)/len(y_train)
print('Accuracy on the training set is {:.3f}'.format(acc))

Accuracy on the training set is 0.670


In [114]:
# Evaluate performance on the test set
X_test_doc2vec = list(read_corpus_from_list(test_df['full_text'].tolist(),tokens_only=True))
X_test_doc2vec = [doc2vec_model.infer_vector(toks) for toks in X_test_doc2vec]

y_test = test_df['Class Index']
preds = logreg_model.predict(X_test_doc2vec)
acc = sum(preds==y_test)/len(y_test)
print('Accuracy on the training set is {:.3f}'.format(acc))

Accuracy on the training set is 0.740


## Model using PyTorch embeddings
Let's now use PyTorch embeddings learned from scratch (no pre-training) to represent our documents as features for a classification model.  We will create embeddings for each word in our document and then use the mean embedding for all words in a document as the embedding for the document.  The document embedding will be used as the features to feed into a single-layer classifier which performs softmax regression with cross entropy loss to classify the documents.  Prior to creating embeddings we will use our spacy tokenizer to pre-process the text.

Reference the PyTorch embeddings tutorial: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [37]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch import nn

In [65]:
# Set up iterators for training and test data
train_iter = [(label,text) for label,text in zip(train_df['Class Index'].to_list(),train_df['processed_text'].to_list())]
test_iter = [(label,text) for label,text in zip(test_df['Class Index'].to_list(),test_df['processed_text'].to_list())]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create PyTorch train and test datasets from iterators
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)

# Split training data to get a validation set
split_train_dataset, split_valid_dataset = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

In [66]:
# Get tokens from pre-processed text
def yield_tokens(data_iter,tokenizer):
    for _, text in data_iter:
        yield tokenizer(text)

# Build vocabulary from tokens of training set
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(yield_tokens(train_iter,tokenizer), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [67]:
# Create the dataloader using custom collate_batch function to get single collated tensor for batch and offsets
# in form needed by nn.EmbeddingBag

def collate_batch(batch,tokenizer,vocab):
    # Pipelines for processing text and labels
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1
    
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)    


BATCH_SIZE = 64 # batch size for training
train_dataloader = DataLoader(split_train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))
valid_dataloader = DataLoader(split_valid_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))

In [68]:
# Define the model
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        # Embedding layer
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, mode="mean",sparse=True)
        # Fully connected final layer to convert embeddings to output predictions
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [69]:
def train(dataloader, model, criterion, optimizer, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model.forward(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
    return

def evaluate(dataloader, model):
    nn_model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model.forward(text, offsets)
            #loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [70]:
# Hyperparameters
EPOCHS = 5 # epoch
LR = 5  # learning rate

# Instantiate the model
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
embed_size = 64
nn_model = TextClassificationModel(vocab_size, embed_size, num_class).to(device)
  
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

# Train model
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, nn_model, criterion, optimizer, epoch)
    accu_val = evaluate(valid_dataloader, nn_model)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 1782 batches | accuracy    0.750
| epoch   1 |  1000/ 1782 batches | accuracy    0.874
| epoch   1 |  1500/ 1782 batches | accuracy    0.890
-----------------------------------------------------------
| end of epoch   1 | time:  9.09s | valid accuracy    0.892 
-----------------------------------------------------------
| epoch   2 |   500/ 1782 batches | accuracy    0.910
| epoch   2 |  1000/ 1782 batches | accuracy    0.911
| epoch   2 |  1500/ 1782 batches | accuracy    0.910
-----------------------------------------------------------
| end of epoch   2 | time:  8.91s | valid accuracy    0.905 
-----------------------------------------------------------
| epoch   3 |   500/ 1782 batches | accuracy    0.928
| epoch   3 |  1000/ 1782 batches | accuracy    0.925
| epoch   3 |  1500/ 1782 batches | accuracy    0.925
-----------------------------------------------------------
| end of epoch   3 | time:  8.87s | valid accuracy    0.908 
-------------------------------

In [71]:
# Evaluate performance on the test dataset
accu_test = evaluate(test_dataloader, nn_model)
print('test accuracy {:8.3f}'.format(accu_test))

test accuracy    0.900


## Model using Sentence Transformer
Sentence Transformer was developed in 2019 and uses Siamese-BERT to develop semantically meaningful sentence embeddings which can be compared using cosine similarity.  See details in paper: https://arxiv.org/abs/1908.10084

You can use a pretrained embedding model (see list at https://www.sbert.net/docs/pretrained_models.html) or can train your own on a corpus.

In [17]:
#Encode all documents using pre-trained model
senttrans_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
X_train_sentransformer = train_df['full_text'].values.tolist()
X_train_sentransformer = [senttrans_model.encode(doc) for doc in X_train_sentransformer]

KeyboardInterrupt: 

In [8]:
# Train a classification model using logistic regression classifier
y_train = train_df['Class Index']
logreg_model = LogisticRegression(solver='saga')
logreg_model.fit(X_train_sentransformer,y_train)
preds = logreg_model.predict(X_train_sentransformer)
acc = sum(preds==y_train)/len(y_train)
print('Accuracy on the training set is {:.3f}'.format(acc))

Accuracy on the training set is 0.888
