In [1]:
import numpy as np
import spacy
import re
from fractions import Fraction

# Load the English pipeline by its full package name. The 'en' shortcut link
# only exists in spaCy v2 and was removed in v3, while the package name loads
# under both.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default for
# `spacy download en`) - confirm for this environment.
nlp = spacy.load('en_core_web_sm')

# Google News Trained Word Vectors

#### Load in the model

In [2]:
import gensim

In [3]:
# Google News word2vec vectors, stored in the binary word2vec format.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
word2vec_path = '/Users/davischum/data/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

#### Load in the text

In [4]:
from sklearn.datasets import load_files

In [5]:
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
data_path = '/Users/davischum/data/bbc/'

# Decode as UTF-8 and silently drop undecodable bytes; the fixed seed keeps the
# shuffled document order reproducible.
load_options = dict(encoding='utf-8', decode_error='ignore', random_state=42)
news = load_files(data_path, **load_options)

#### Tokenize and remove punctuation and digits

In [6]:
def tokenize(text_data, tokenizer=None):
    """Tokenize each text and strip punctuation and digits from it.

    Every text is split into tokens, the token texts are re-joined with single
    spaces, and then newlines, digits and the characters ``. , ( ) ' " - : ; & # /``
    are deleted from the joined string. Whitespace left behind by removed
    tokens is not collapsed.

    Parameters
    ----------
    text_data : iterable of str
        Raw texts to clean.
    tokenizer : callable, optional
        Maps a string to an iterable of objects exposing a ``.text``
        attribute. Defaults to the notebook-level spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    if tokenizer is None:
        tokenizer = nlp

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences such as '\.', which recent Python versions warn about.
    # Compiled once instead of being re-parsed for every article.
    strip_pattern = re.compile(r'[\n.,()\'"\-:;&#/0-9]')

    clean_text = []
    for text_chunk in text_data:
        joined = ' '.join(token.text for token in tokenizer(text_chunk))
        clean_text.append(strip_pattern.sub('', joined))
    return clean_text

In [7]:
# Replace the raw articles with their cleaned versions.
# NOTE(review): this overwrites `news.data` in place, so the raw text is no
# longer available afterwards and re-running the cell re-processes the
# already-cleaned text.
news.data = tokenize(news.data)

#### Find the unique words in the corpus

In [8]:
def find_unique_words(text:list):
    """Return the set of distinct words found across all word lists.

    Parameters
    ----------
    text : iterable of iterables of str
        One collection of words per document.

    Returns
    -------
    set
        Every word that occurs in at least one document; empty if ``text``
        is empty.
    """
    word_set = set()
    for word_list in text:
        # In-place update: `word_set | set(...)` would re-copy the whole
        # accumulated set on every iteration.
        word_set.update(word_list)
    return word_set

In [9]:
# Split each cleaned article on whitespace, then collect the corpus vocabulary.
tokenized_articles = [txt.split() for txt in news.data]
unq_words = find_unique_words(tokenized_articles)

#### See how many match up with the model vocabulary

In [10]:
# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases expose
# it as `vocab`. Iterating either mapping yields the words themselves, so no
# `.keys()` call is needed.
vocab = set(model.key_to_index if hasattr(model, 'key_to_index') else model.vocab)

In [11]:
# Corpus words that also have a pre-trained vector.
matched_vocab = unq_words.intersection(vocab)

In [25]:
# Report the raw counts: Fraction reduces to lowest terms, so its output need
# not show the real vocabulary sizes, and it raises on an empty vocabulary.
n_matched, n_unique = len(matched_vocab), len(unq_words)
coverage = n_matched / n_unique if n_unique else 0.0
print(f'{n_matched}/{n_unique} ({coverage:.2%})')

5197/5358


#### Convert words to embeddings

In [26]:
# Look every matched word up once so articles can be embedded from a plain dict.
embed_dict = dict((word, model.get_vector(word)) for word in matched_vocab)

In [27]:
# One array of word vectors per article; words without a pre-trained vector
# are skipped, so an article's row count is its number of matched words.
text_embeddings = [
    np.array([embed_dict[word] for word in article.split() if word in matched_vocab])
    for article in news.data
]

In [28]:
# Articles have different word counts, so pack them into an explicit 1-D
# object array first: recent NumPy versions refuse to build a ragged array
# implicitly from a list of differently-shaped arrays.
embeddings_to_save = np.empty(len(text_embeddings), dtype=object)
for idx, article_embedding in enumerate(text_embeddings):
    embeddings_to_save[idx] = article_embedding
np.save('./bbc_data/tmp/bbc_text_embed_converted.npy', embeddings_to_save)

#### Find the shortest article and truncate every article to its word count

In [29]:
# The file holds an object array (one variable-length array per article), which
# NumPy stores via pickle; NumPy >= 1.16.3 only loads it with allow_pickle=True.
# Unpickling is acceptable here only because this notebook wrote the file
# itself - do not point this at untrusted files.
text_embeddings = np.load('./bbc_data/tmp/bbc_text_embed_converted.npy', allow_pickle=True)

In [30]:
# Number of word vectors in the shortest article.
article_lengths = np.array([article.shape[0] for article in text_embeddings])
min_length = article_lengths.min()

In [31]:
# Keep only the first `min_length` word vectors of every article so that all
# articles end up with the same number of rows.
# NOTE(review): this rebinds `text_embeddings`, discarding the full-length
# embeddings loaded above.
text_embeddings = [article[:min_length] for article in text_embeddings]

## Model time

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

We can average the word embeddings to give a notion of meaning to each article

In [33]:
def create_average_vectors(embedded_text):
    """Calculate the average of the word embeddings for each text in a corpus.

    Parameters
    ----------
    embedded_text : iterable of array-like
        One 2-D collection per text, with one row per word vector.

    Returns
    -------
    numpy.ndarray
        One row per text, holding the element-wise mean of that text's word
        vectors. Texts are expected to contain at least one word vector.
    """
    avg_embed_text = []
    # Iterate over the argument: the previous version ignored it and read the
    # notebook-level `text_embeddings` variable instead.
    for article in embedded_text:
        article = np.asarray(article)
        avg_embed_text.append(article.sum(axis=0) / len(article))
    return np.array(avg_embed_text)

In [34]:
# Collapse each article's word vectors into a single averaged vector.
avg_embed_text = create_average_vectors(text_embeddings)

In [35]:
# Split the averaged article vectors and their labels into train and test sets;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    avg_embed_text,
    news.target,
    random_state=42,
)

#### Naive Bayes

Naive Bayes requires non-negative values, so we shift every feature by the absolute value of the global minimum (taken over the train and test sets together) to ensure that all values are greater than or equal to zero.

In [36]:
def make_non_negative(train_data, test_data):
    """Shift both data sets by one common offset so no value is negative.

    The offset is the absolute value of the smallest entry found in either
    set. When that smallest entry is already >= 0 the data is returned
    unshifted; the previous version added ``abs(min)`` even then and moved
    valid data for no reason.

    Parameters
    ----------
    train_data, test_data : array-like
        Numeric, non-empty data sets.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_shifted, test_shifted)`` as new arrays; the inputs are left
        untouched.

    Notes
    -----
    The minimum is taken over both sets together, so the offset applied to
    the training data depends on the test data.
    """
    train_arr = np.asarray(train_data)
    test_arr = np.asarray(test_data)
    # Compare the two minima directly instead of concatenating (and thereby
    # copying) both arrays just to find a single number.
    global_min = np.minimum(train_arr.min(), test_arr.min())
    shift = -global_min if global_min < 0 else 0
    return train_arr + shift, test_arr + shift

In [37]:
# Non-negative copies of the features, used only for the Naive Bayes model.
X_train_nn, X_test_nn = make_non_negative(X_train, X_test)

In [38]:
# Fit on the shifted (non-negative) training features and predict the test set.
nb = MultinomialNB().fit(X_train_nn, y_train)
nb_preds = nb.predict(X_test_nn)

# Accuracy = share of test articles whose predicted label equals the true one.
nb_accuracy = (nb_preds == y_test).mean()
f'Naive Bayes Accuracy: {nb_accuracy*100:.2f}%'

'Naive Bayes Accuracy: 73.25%'

#### Logistic Regression

In [39]:
# Value passed as `C` to the logistic-regression classifier.
LR_C = 1000
lr = LogisticRegression(C=LR_C)

In [40]:
# Train on the original averaged embeddings; unlike the Naive Bayes run above,
# the non-negative shifted copies are not used here.
lr.fit(X_train, y_train)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# Predicted class labels for the held-out test articles.
lr_preds = lr.predict(X_test)

In [42]:
# Accuracy = share of test articles whose predicted label equals the true one.
lr_accuracy = (lr_preds == y_test).mean()
f'Logistic Regression Accuracy: {lr_accuracy*100:.2f}%'

'Logistic Regression Accuracy: 97.13%'