In [2]:
import numpy as np
import re
import csv
import time
import nltk
import string
from collections import defaultdict
import tqdm.notebook as tqdm
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from gensim import utils, models
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### 0. Some utility functions that will be used lately



In [37]:
# untility functions for text cleaning

stop_words = stopwords.words('english')

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_punc(text):
    table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    return text.translate(table)

def remove_digits(text): 
    pattern = '[0-9]'
    text = re.sub(pattern, '', text)
    return text

def html_unescape(text):
    return html.unescape(text)

def reduce_length(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

#tokenize sentence and correct the spelling
def token_n_spellcheck(text):
    words = word_tokenize(text)
    reduced_text  = [reduce_length(word) for word in words]
    stemmer = SnowballStemmer("english")
    stem_text = [stemmer.stem(word) for word in reduced_text if word not in stop_words]

    return stem_text

# the pipeline function for text cleaning
def text_clean(text):
    text = text.lower()
    text = remove_URL(text)
    text = remove_html(text)
    text = remove_digits(text)
    text = remove_punc(text)
    words = token_n_spellcheck(text)
    return words


In [None]:
# utility functions for train, test, and calculating metrics

def train_LR(X, y):
    clf = LogisticRegression(multi_class='ovr')
    clf.fit(X, y)
    return clf

def predict(X, clf):
    return clf.predict(X)

def train_n_test(X, y):
    X_train, X_left, y_train, y_left = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val, X_test = X_left[:int(X_left.shape[0]/2), :], X_left[(int(X_left.shape[0]/2)+1):, :]
    y_val, y_test = y_left[:int(y_left.shape[0]/2)], y_left[(int(y_left.shape[0]/2)+1):]
    clf = train_LR(X_train, y_train)
    y_train_pre = predict(X_train, clf)
    print ('Training set acc: {}'.format(np.sum(y_train_pre == y_train)/len(y_train)))
    y_pre = predict(X_test, clf)
    return y_pre, y_test

def cal_acc(y_pre, y_gt):
    return (np.sum(y_pre == y_gt)/len(y_pre))

def cal_macro_f1(y_pre, y_gt):
    confusion_mat = np.zeros((3,3))
    for pre, gt in zip(y_pre, y_gt):
        confusion_mat[int(pre), int(gt)] += 1
    f1_scores = list()
    for i in range(3):
        precision = confusion_mat[i,i] / np.sum(confusion_mat[i, :])
        recall = confusion_mat[i,i] / np.sum(confusion_mat[:, i])
        f1_scores.append(2 * precision * recall / (precision + recall))
    return np.mean(f1_scores)

def cal_micro_f1(y_pre, y_gt):
    tp = np.sum(y_pre == y_gt)
    errs = len(y_pre) - tp
    return tp/(tp + errs)

def print_evaluation(y_pre, y_gt):
    eval_metrics = ['acc', 'macro_f1', 'micro_f1']
    for metric in eval_metrics:
        res = eval('cal_{}'.format(metric))(y_pre, y_gt)
        print ('Test set {}: {}'.format(metric, res))
            

In [41]:
# load NYT data and apply the text_clean function 

text_data = list()

with open('nyt.csv', 'r') as f:
    reader = csv.reader(f)
    # skip the header
    next(reader, None)
    for row in reader:
        text_data.append(row)

docs = list()
labels = list()
words = set()

for text, label in tqdm.tqdm(text_data):
    doc_words = text_clean(text)
    docs.append(doc_words)
    words.update(set(doc_words))
    labels.append(label)

labels = list(set(labels))

HBox(children=(FloatProgress(value=0.0, max=11519.0), HTML(value='')))




In [None]:
print ('Labels: ', labels)
print ('Total dictionary size: {}'.format(len(words)))

word2ind, ind2word = dict(), dict()

for ind, word in enumerate(words):
    word2ind[word] = ind
    ind2word[ind] = word

Labels:  ['business', 'politics', 'sports']
Total dictionary size: 43489


### 1. Bag of Words

(a) Binary-valued vector representation

In [None]:
def generate_matrix_binary(docs, text_data, word2ind):
    X, y = np.zeros([len(docs), len(word2ind)]), np.zeros(len(docs))
    for i, doc_words in enumerate(docs):
        label = text_data[i][1]
        if label == 'sports':
            y[i] = 0
        elif label == 'business':
            y[i] = 1
        elif label == 'politics':
            y[i] = 2
        for word in doc_words:
            X[i][word2ind[word]] = 1
    return csr_matrix(X), y

X, y = generate_matrix_binary(docs, text_data, word2ind)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)



Training set acc: 1.0
Test set acc: 0.9826238053866203
Test set macro_f1: 0.951558633150673
Test set micro_f1: 0.9826238053866203


(b) Frequency-valued vector representation

In [None]:
def generate_matrix_freq(docs, text_data, word2ind):
    X, y = np.zeros([len(docs), len(word2ind)]), np.zeros(len(docs))
    for i, doc_words in enumerate(docs):
        label = text_data[i][1]
        if label == 'sports':
            y[i] = 0
        elif label == 'business':
            y[i] = 1
        elif label == 'politics':
            y[i] = 2
        for word in doc_words:
            X[i][word2ind[word]] += 1
    return csr_matrix(X), y

X, y = generate_matrix_freq(docs, text_data, word2ind)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 1.0
Test set acc: 0.9904430929626412
Test set macro_f1: 0.9713993887522795
Test set micro_f1: 0.9904430929626412


(c) TFIDF-valued vector representation

Raw TF * Raw IDF

In [None]:
def generate_matrix_tfidf(docs, text_data, word2ind):
    df_cnt = defaultdict(float)
    for doc_words in docs:
        words = set(doc_words)
        for w in words:
            df_cnt[w] += 1
    X, y = np.zeros([len(docs), len(word2ind)]), np.zeros(len(docs))
    tf_cnt = defaultdict(int)
    for i, doc_words in enumerate(docs):
        label = text_data[i][1]
        if label == 'sports':
            y[i] = 0
        elif label == 'business':
            y[i] = 1
        elif label == 'politics':
            y[i] = 2
        for word in doc_words:
            tf_cnt[word] += 1
        for word in doc_words:
            # using the raw tf-idf
            X[i, word2ind[word]] = tf_cnt.get(word, 0) / (df_cnt[word])
    return csr_matrix(X), y

X, y = generate_matrix_tfidf(docs, text_data, word2ind)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 0.9992403689636462
Test set acc: 0.9748045178105995
Test set macro_f1: 0.9380544093462754
Test set micro_f1: 0.9748045178105995


Raw TF * Smoothed IDF

In [None]:
def generate_matrix_tfidf(docs, text_data, word2ind):
    df_cnt = defaultdict(float)
    for doc_words in docs:
        words = set(doc_words)
        for w in words:
            df_cnt[w] += 1
    X, y = np.zeros([len(docs), len(word2ind)]), np.zeros(len(docs))
    tf_cnt = defaultdict(int)
    for i, doc_words in enumerate(docs):
        label = text_data[i][1]
        if label == 'sports':
            y[i] = 0
        elif label == 'business':
            y[i] = 1
        elif label == 'politics':
            y[i] = 2
        for word in doc_words:
            tf_cnt[word] += 1
        for word in doc_words:
            # replacing the raw idf with the smoothed idf here
            X[i, word2ind[word]] = tf_cnt.get(word, 0) * np.log((len(docs) + 1) / (df_cnt[word] + 0.5))
    return csr_matrix(X), y

X, y = generate_matrix_tfidf(docs, text_data, word2ind)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 1.0
Test set acc: 0.9704604691572546
Test set macro_f1: 0.9244609260386428
Test set micro_f1: 0.9704604691572546


Log TF * Smoothed IDF

In [None]:
def generate_matrix_tfidf(docs, text_data, word2ind):
    df_cnt = defaultdict(float)
    for doc_words in docs:
        words = set(doc_words)
        for w in words:
            df_cnt[w] += 1
    X, y = np.zeros([len(docs), len(word2ind)]), np.zeros(len(docs))
    tf_cnt = defaultdict(int)
    for i, doc_words in enumerate(docs):
        label = text_data[i][1]
        if label == 'sports':
            y[i] = 0
        elif label == 'business':
            y[i] = 1
        elif label == 'politics':
            y[i] = 2
        for word in doc_words:
            tf_cnt[word] += 1
        for word in doc_words:
            # further replacing the raw tf with the log tf
            X[i, word2ind[word]] = np.log(tf_cnt.get(word, 1)) * np.log((len(docs) + 1) / (df_cnt[word] + 0.5))
    return csr_matrix(X), y

X, y = generate_matrix_tfidf(docs, text_data, word2ind)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 1.0
Test set acc: 0.9834926151172894
Test set macro_f1: 0.9556515083909671
Test set micro_f1: 0.9834926151172894


### 2. Word2Vec

In [39]:
# the function for generating the embeddings of a single document
def doc2vec(doc, embed_dict):
    res = np.zeros(100)
    valid = len(doc)
    for w in doc:
        if w in embed_dict:
            res += embed_dict[w]
        else:
            valid -= 1
    res /= valid
    return res

# the function for generating X,y based on a given embedding dictionary
def generate_doc_embeddings(docs, text_data, embedding_dict):
    X, y = np.zeros([len(docs), 100]), np.zeros(len(docs))
    for i, doc_words in enumerate(docs):
        label = text_data[i][1]
        if label == 'sports':
            y[i] = 0
        elif label == 'business':
            y[i] = 1
        elif label == 'politics':
            y[i] = 2
        X[i, :] = doc2vec(doc_words, embedding_dict)
    return X, y


(a) (i) Pre-trained Glove embeddings:

In [None]:
def embedding_from_pretrain():
    embedding_dict = {}
    with open('glove.6B.100d.txt', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0].replace('<','').replace('>','')
            if word.isalpha() is False:
                continue
            vec = np.array(values[1:], dtype='float32')
            embedding_dict[word] = vec
    return embedding_dict

embedding_dict = embedding_from_pretrain()
X, y = generate_doc_embeddings(docs, text_data, embedding_dict)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)



Training set acc: 0.9817688551275094
Test set acc: 0.9748045178105995
Test set macro_f1: 0.9336304638041465
Test set micro_f1: 0.9748045178105995


(a)(ii) Train Word2Vec on AGNews

In [None]:
# Train on AGNews data using gensim.utility.simple_preprocess to tokenize

ag_data = list()
with open('ag.csv', 'r') as f:
    reader = csv.reader(f)
    # skip the header
    next(reader, None)
    for line in reader:
        ag_data.append(utils.simple_preprocess(line[0]))
        
ag_model = models.Word2Vec(ag_data, size=100, window=5, min_count=3, workers=4)
X, y = generate_doc_embeddings(docs, text_data, ag_model.wv)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)



Training set acc: 0.977319587628866
Test set acc: 0.9704604691572546
Test set macro_f1: 0.927292419840161
Test set micro_f1: 0.9704604691572546


In [42]:
# Train on AGNews data using my own text_clean function to tokenize

ag_data = list()
with open('ag.csv', 'r') as f:
    reader = csv.reader(f)
    # skip the header
    next(reader, None)
    for line in tqdm.tqdm(reader):
        # using my own text cleaning function to tokenize the ag data for training
        ag_data.append(text_clean(line[0]))
        
ag_model = models.Word2Vec(ag_data, size=100, window=5, min_count=3, workers=4)
X, y = generate_doc_embeddings(docs, text_data, ag_model.wv)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 0.9804666304937601
Test set acc: 0.9782797567332754
Test set macro_f1: 0.9421286536357574
Test set micro_f1: 0.9782797567332754


(a)(iii) Train Word2Vec on NYT

In [None]:
# Train on NYT data using gensim.utility.simple_preprocess to tokenize

nyt_data = list()
with open('nyt.csv', 'r') as f:
    reader = csv.reader(f)
    # skip the header
    next(reader, None)
    for line in reader:
        # using simple_preprocess function to tokenize
        nyt_data.append(utils.simple_preprocess(line[0]))
        
nyt_model = models.Word2Vec(nyt_data, size=100, window=5, min_count=3, workers=4)
X, y = generate_doc_embeddings(docs, text_data, nyt_model.wv)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 0.9817688551275094
Test set acc: 0.9800173761946134
Test set macro_f1: 0.9416040042790644
Test set micro_f1: 0.9800173761946134


In [None]:
# Train on NYT data using my own text_clean function to tokenize

nyt_model = models.Word2Vec(docs, size=100, window=5, min_count=3, workers=4)
X, y = generate_doc_embeddings(docs, text_data, nyt_model.wv)

y_pre, y_test = train_n_test(X, y)
print_evaluation(y_pre, y_test)

Training set acc: 0.9850244167118828
Test set acc: 0.9817549956559514
Test set macro_f1: 0.9501484403818173
Test set micro_f1: 0.9817549956559514


From the results above, I found that using my customized text_clean function will lead to a better performance than using the simple_preprocess function from gensim


(b) Cons of averaging word vectos
- The order of words is not preserved, which can be important
  - We can apply a RNN model like LSTM, or a transformer which employs position encodings to preserve the order information.
- Some words can be more important than others to the document category, but now they are getting the same weights.
  - Maybe we can derive weights based on the tf-idf values (which to some extend reflect the importance of a word in a document), and then use a weighted average to generate document embeddings.
  

### 3. BERT

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW
import torch
from torch.utils.data import Dataset, DataLoader

In [6]:
nyt_text, nyt_labels = list(), list()

with open('nyt.csv', 'r') as f:
    reader = csv.reader(f)
    # skip the header
    next(reader, None)
    for row in reader:
        nyt_text.append(row[0])
        if row[1] == 'sports':
            nyt_labels.append(0)
        elif row[1] == 'business':
            nyt_labels.append(1)
        elif row[1] == 'politics':
            nyt_labels.append(2)
            

inds = np.arange(len(nyt_text))
train_inds, left_inds, _, _ = train_test_split(np.array(inds), np.arange(len(nyt_text)), test_size=0.2, random_state=42)
val_inds, test_inds = left_inds[:int(left_inds.shape[0]/2)], left_inds[(int(left_inds.shape[0]/2)+1):]

X_train = [nyt_text[ind] for ind in train_inds]
y_train = [nyt_labels[ind] for ind in train_inds]
X_val = [nyt_text[ind] for ind in val_inds]
y_val = [nyt_labels[ind] for ind in val_inds]
X_test = [nyt_text[ind] for ind in test_inds]
y_test = [nyt_labels[ind] for ind in test_inds]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_emb = tokenizer(X_train, max_length=64, truncation=True, padding='max_length')
X_val_emb = tokenizer(X_val, max_length=64, truncation=True, padding='max_length')
X_test_emb = tokenizer(X_test, max_length=64, truncation=True, padding='max_length')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
class NYTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NYTDataset(X_train_emb, y_train)
val_dataset = NYTDataset(X_val_emb, y_val)
test_dataset = NYTDataset(X_test_emb, y_test)

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in tqdm.tqdm(range(3)):
    for batch in tqdm.tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

model.save_pretrained('finetuned_models')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=576.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=576.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=576.0), HTML(value='')))





In [30]:
test_loader = DataLoader(test_dataset, batch_size=4)
y_pre = list()
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    pre = np.argmax(outputs[0].data.cpu().numpy(), axis=1)
    y_pre.extend(pre)

print_evaluation(np.array(y_pre), np.array(y_test))

Test set acc: 0.9713292788879235
Test set macro_f1: 0.9250162818573228
Test set micro_f1: 0.9713292788879235
