In [1]:
import os

# Get list of authors: every sub-directory of the training directory is one author.
train_directory = './C50/C50train'
test_directory = './C50/C50test'
# Keep directories only. This skips stray files such as macOS ".DS_Store"
# without failing when no such file exists (set.remove would raise KeyError).
authors = sorted(
    entry
    for entry in os.listdir(train_directory)
    if os.path.isdir(os.path.join(train_directory, entry))
)
print(len(authors))

50


In [2]:
# train and test are dictionaries
# keys are the authors (one per directory found in train_directory)
# values are lists with the raw text of each document written by that author
train = {}
test = {}
for author in authors:
    for corpus, directory in ((train, train_directory), (test, test_directory)):
        author_directory = os.path.join(directory, author)
        corpus[author] = []
        # os.listdir returns entries in arbitrary order; sort so that the
        # position-based re-split done later is reproducible between runs.
        for filename in sorted(os.listdir(author_directory)):
            # The context manager closes the file even if read() raises.
            with open(os.path.join(author_directory, filename), "r") as f:
                corpus[author].append(f.read())
print(len(train[authors[0]]))
print(len(test[authors[0]]))

50
50


In [3]:
# Reorganize data: the first 40 test documents of every author are appended to
# that author's training documents; only the remainder stays in the test split.
# Note: this mutates train/test, so re-running the cell moves documents again.
for author in authors:
    moved_documents, held_out_documents = test[author][:40], test[author][40:]
    train[author] += moved_documents
    test[author] = held_out_documents

In [4]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# NLTK resources used below: the punkt sentence splitter, the perceptron POS
# tagger, and the WordNet corpus that WordNetLemmatizer reads when lemmatizing.
# Without the wordnet download, lemmatization fails on a machine that does not
# already have the corpus installed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Sentence splitter, word tokenizer and lemmatizer shared by the helpers below.
splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

def tokenise(text):
    """Return a flat list of lower-cased tokens for ``text``.

    The text is split into sentences with the module-level ``splitter`` and
    every sentence is tokenized with the module-level ``tokenizer``; sentence
    boundaries are not kept in the result.
    """
    return [
        token.lower()
        for sentence in splitter.tokenize(text)
        for token in tokenizer.tokenize(sentence)
    ]

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' and 'R' map to adjective, verb, noun and
    adverb respectively; every other tag falls back to noun, which is the
    default POS used for lemmatization.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # 'N...' tags and anything unrecognised are treated as nouns.
    return wordnet.NOUN

def process_text(text):
    """Tokenise ``text``, POS-tag the tokens and return their lemmas as a flat list."""
    tagged_tokens = nltk.pos_tag(tokenise(text))
    lemmas = []
    for word, treebank_tag in tagged_tokens:
        # The lemmatizer needs a WordNet POS, so translate the Treebank tag first.
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag)))
    return lemmas

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erictay1997/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/erictay1997/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
def tokenise_2(text):
    """Return ``text`` as a list of sentences, each a list of lower-cased tokens.

    Unlike ``tokenise``, the sentence structure is preserved: the result has
    one inner token list per sentence produced by the module-level ``splitter``.
    """
    return [
        [token.lower() for token in tokenizer.tokenize(sentence)]
        for sentence in splitter.tokenize(text)
    ]

# Ad-hoc smoke call of tokenise_2 on a short mixed-case string.
# NOTE(review): `a` is not referenced again in the visible notebook — confirm it can be removed.
a = tokenise_2("i Am hEre. no you")

def process_text_2(text):
    """Lemmatise ``text`` sentence by sentence.

    Returns a list with one entry per sentence; each entry is the list of
    lemmas obtained by POS-tagging that sentence's tokens and lemmatizing
    every token with the WordNet POS derived from its tag.
    """
    lemmatised_sentences = []
    for token_list in tokenise_2(text):
        tagged_tokens = nltk.pos_tag(token_list)
        lemmatised_sentences.append(
            [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged_tokens]
        )
    return lemmatised_sentences

In [6]:
# Flat (document-level) lemma lists per author, built with process_text.
# The author index is printed as a progress indicator.
# NOTE(review): only the *_processed_2 variants are read by later cells in the
# visible notebook — confirm whether this pass is still needed.
train_processed = {}
test_processed = {}
for i, author in enumerate(authors):
    print(i)
    train_processed[author] = list(map(process_text, train[author]))
    test_processed[author] = list(map(process_text, test[author]))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [7]:
# train_processed_2 and test_processed_2 are dictionaries
# keys are the authors
# values are lists of documents; each document is a list of sentences and
# each sentence is a list of lemmas (see process_text_2)
train_processed_2 = {}
test_processed_2 = {}
for i, author in enumerate(authors):
    # Progress indicator: index of the author being processed.
    print(i)
    train_processed_2[author] = list(map(process_text_2, train[author]))
    test_processed_2[author] = list(map(process_text_2, test[author]))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [8]:
from collections import Counter
# author_dicts maps each author to a Counter of lemma occurrences over all of
# that author's training documents; vocab is the union of all counted lemmas.
author_dicts = {}
vocab = set()
for author in authors:
    token_counts = Counter()
    for document in train_processed_2[author]:
        for sentence in document:
            # update() only touches the tokens of this sentence. In-place
            # Counter addition would build a temporary Counter and rescan the
            # whole accumulated counter for every sentence.
            token_counts.update(sentence)
    author_dicts[author] = token_counts
    vocab.update(token_counts)

In [9]:
# word_counts[word] = number of authors whose training text contains the word.
word_counts = {}
for word in vocab:
    word_counts[word] = sum(1 for author in authors if word in author_dicts[author])

In [10]:
import numpy as np
# Inverse document frequency with authors as the "documents":
# idf[word] = log(number of authors / number of authors using the word).
# A word used by every author therefore gets weight 0.
num_authors = len(authors)
idf = {word: np.log(num_authors / word_counts[word]) for word in vocab}

In [11]:
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [12]:
import gensim.downloader
# Pretrained GloVe vectors fetched through gensim's downloader.
# Alternative tried: gensim.downloader.load('glove-twitter-50');
# the wiki-gigaword-300 model below performs better.
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-300'
glove_vectors = gensim.downloader.load(GLOVE_MODEL_NAME)
# Display the vector of the "." token as a quick check that the model loaded.
glove_vectors["."]

array([-1.2559e-01,  1.3630e-02,  1.0306e-01, -1.0123e-01,  9.8128e-02,
        1.3627e-01, -1.0721e-01,  2.3697e-01,  3.2870e-01, -1.6785e+00,
        2.2393e-01,  1.2409e-01, -8.6708e-02,  3.3010e-01,  3.4375e-01,
       -8.7582e-04, -2.9658e-01,  2.4417e-01, -1.1592e-01, -3.5742e-02,
       -1.0830e-02,  2.0776e-01,  2.9285e-01, -7.3491e-02, -1.8598e-01,
       -2.0090e-01, -9.5366e-02,  6.3732e-03, -1.3620e-01,  9.2028e-02,
       -3.9957e-02,  1.9027e-01, -1.0456e-01,  2.7670e-03, -7.1742e-01,
       -1.2915e-01, -1.3451e-03,  2.7002e-01, -5.3023e-02,  2.2148e-01,
        1.3881e-01, -1.5051e-01, -1.9150e-01,  1.6402e-01,  9.7484e-02,
        5.6841e-02,  3.9789e-01,  4.0725e-01,  1.4802e-01,  2.1569e-01,
       -1.0671e-01, -1.0232e-01,  2.4810e-02, -2.2100e-01, -1.0720e-02,
        1.4234e-01, -2.8242e-01,  1.9254e-01,  8.6720e-02, -3.8970e-01,
        1.1321e-01,  1.3779e-03,  6.4009e-03, -1.6206e-01, -8.2153e-02,
       -5.5397e-01,  3.6789e-01, -4.0159e-03,  2.0710e-01, -3.71

In [13]:
def _idf_weighted_sentence_vector(sentence):
    """Return the IDF-weighted mean GloVe vector of ``sentence``.

    Only tokens present in ``glove_vectors`` contribute. Returns None when no
    token has a vector. When all IDF weights sum to zero, the plain
    (unweighted) mean of the token vectors is returned instead.
    """
    known_words = [word for word in sentence if word in glove_vectors]
    if not known_words:
        return None
    vectors = [glove_vectors[word] for word in known_words]
    weights = [idf[word] for word in known_words]
    total_weight = sum(weights)
    if total_weight == 0:
        return np.mean(np.array(vectors), axis=0)
    weighted_sum = np.zeros(len(vectors[0]))
    for vector, weight in zip(vectors, weights):
        weighted_sum += vector * weight
    weighted_sum /= total_weight
    # Cast so the result can be fed to the float32 torch model.
    return weighted_sum.astype(np.float32)


# w2v_averaged[author] is a list of training documents; each document is the
# list of its sentence vectors (sentences without any known token are skipped).
w2v_averaged = {}
for author in authors:
    w2v_averaged[author] = []
    for document in train_processed_2[author]:
        candidate_vectors = (_idf_weighted_sentence_vector(sentence) for sentence in document)
        w2v_averaged[author].append(
            [vector for vector in candidate_vectors if vector is not None]
        )

In [14]:
def convert_document_to_torch_input(document):
    """Stack a document's sentence vectors into a (num_sentences, 1, dim) tensor.

    ``document`` is a sequence of numpy arrays, one per sentence; each array is
    flattened into a single row of the result.
    """
    rows = [torch.from_numpy(sentence).view(1, 1, -1) for sentence in document]
    return torch.cat(rows, dim=0)
# Sample model input: the first training document of the first author.
first_author_documents = w2v_averaged[authors[0]]
inputs = convert_document_to_torch_input(first_author_documents[0])
inputs

tensor([[[ 2.8273e-01,  2.2682e-01, -1.2728e-01,  6.6909e-02, -1.4604e-01,
          -3.7018e-01, -7.4869e-02,  1.9472e-01,  1.5710e-01, -1.2026e+00,
          -1.0626e-01,  1.9471e-01, -7.0041e-02, -1.2315e-01,  1.5450e-02,
           2.2973e-03,  1.0530e-01,  1.1025e-01, -4.4617e-01, -3.8690e-02,
           1.9775e-01, -6.3580e-02, -1.4941e-02, -1.4927e-01, -3.6061e-01,
           2.2657e-01,  1.1178e-02,  6.7852e-02, -2.5396e-01,  1.8218e-01,
          -1.9957e-01, -6.6664e-02,  1.8566e-01, -1.3137e-02, -3.9887e-01,
           4.1236e-02, -1.7729e-01, -3.4348e-01, -6.5909e-02, -1.0181e-02,
           6.7758e-02, -8.6587e-04, -1.9311e-02,  2.0790e-01, -1.2075e-01,
          -6.7033e-02, -1.1569e-01, -1.9777e-01,  3.7347e-02, -6.0328e-02,
          -4.0233e-03, -2.1942e-01,  2.5179e-01, -9.9700e-02, -7.2769e-02,
          -6.2959e-02, -4.5002e-01, -1.4022e-02, -1.0687e-01,  2.0753e-01,
           2.4651e-01,  8.4799e-03, -2.9299e-01, -5.2405e-03, -2.2577e-02,
          -5.8664e-02, -1

In [15]:
class FindAuthor(nn.Module):
    """GRU-based document classifier that outputs log-probabilities over authors.

    A document is fed as a sequence of sentence vectors. The GRU outputs are
    sub-sampled by a stride-2, kernel-size-1 convolution, averaged over the
    sentences, and projected to one score per author.

    Args:
        embedding_dim: size of each sentence vector (default 300).
        hidden_dim: GRU hidden size (default 200).
        num_authors: number of output classes (default 50).
    """

    def __init__(self, embedding_dim=300, hidden_dim=200, num_authors=50):
        super().__init__()
        self.num_authors = num_authors
        # Layers are created in the order GRU, conv, linear so that parameter
        # initialisation under a fixed torch seed is unchanged.
        self.GRU = nn.GRU(embedding_dim, hidden_dim, bias = False)
        self.conv = nn.Conv1d(1, 1, kernel_size = 1, stride = 2, bias = False)
        # Output length of a kernel-size-1, stride-2 convolution over hidden_dim
        # positions (100 for the default hidden_dim of 200).
        conv_length = (hidden_dim - 1) // 2 + 1
        self.linear = nn.Linear(conv_length, num_authors)

    def forward(self, sentence_input):
        """Return a (1, num_authors) tensor of log-probabilities.

        ``sentence_input`` must have shape (num_sentences, 1, embedding_dim):
        the convolution has a single input channel and the final view assumes
        a single document, so only batch size 1 is supported.
        """
        out, _ = self.GRU(sentence_input)

        # Conv1d treats the sentence axis as its batch axis here.
        conv_out = self.conv(out)
        # Average over sentences, keeping the leading dimension.
        pooled_output = torch.mean(conv_out, 0, True)
        scores = self.linear(pooled_output)
        return F.log_softmax(scores, dim=2).view(1, self.num_authors)

In [16]:
# Smoke test: run a freshly initialised (untrained) model on the sample
# document; the displayed value is its (1, 50) tensor of log-probabilities.
net = FindAuthor()
net(inputs)

tensor([[-3.8145, -3.8774, -3.8813, -3.8179, -3.9542, -3.8930, -3.8291, -3.8602,
         -4.0140, -3.8495, -3.8965, -3.9199, -3.9147, -4.0094, -3.8922, -3.8874,
         -3.7946, -3.9366, -3.9904, -3.9628, -3.9095, -3.9512, -3.8857, -3.9454,
         -3.8606, -3.8871, -3.8994, -4.0077, -3.9541, -3.8839, -3.9279, -3.9502,
         -3.8718, -3.9030, -3.8406, -3.9147, -3.9262, -3.9685, -3.9843, -3.9457,
         -3.9211, -3.9020, -3.9931, -3.9740, -4.0628, -3.9456, -3.9573, -3.8247,
         -3.8648, -3.8276]], grad_fn=<ViewBackward>)

In [17]:
# Author index labels 0..49.
x = np.arange(50)
# One label per training document in author order: 90 consecutive copies of
# each author index.
targets = torch.from_numpy(np.repeat(x, 90))

In [18]:
# Training tensors grouped by author: all documents of the first author,
# then all documents of the second author, and so on.
training_data = [
    convert_document_to_torch_input(document)
    for author in authors
    for document in w2v_averaged[author]
]

In [19]:
# Interleave the authors: document 0 of every author, then document 1 of every
# author, ... so each consecutive run of len(authors) items holds exactly one
# document per author, in author order.
# The document count is read from the first author explicitly instead of from
# a loop variable left over by an earlier cell.
docs_per_author = len(w2v_averaged[authors[0]])
assert all(len(w2v_averaged[name]) == docs_per_author for name in authors), \
    "every author must have the same number of training documents"
jumbled_training_data = [
    convert_document_to_torch_input(w2v_averaged[name][doc_index])
    for doc_index in range(docs_per_author)
    for name in authors
]
# Matching labels: the author indices 0..len(authors)-1 repeated once per run.
jumbled_targets = torch.tensor(list(range(len(authors))) * docs_per_author)

In [20]:
# mini-batch gradient descent
BATCH_SIZE = 50        # documents per optimizer step
MAX_EPOCHS = 300       # upper bound on passes over the training data
LOSS_THRESHOLD = 0.001 # stop once the last batch loss of an epoch drops below this

torch.manual_seed(0)
model = FindAuthor()
loss_function = nn.NLLLoss()
optimizer = optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)

for epoch in range(MAX_EPOCHS):
    for start in range(0, len(jumbled_training_data), BATCH_SIZE):
        batch_documents = jumbled_training_data[start:start + BATCH_SIZE]
        # Take the targets that belong to this batch rather than always the
        # first BATCH_SIZE labels, so the loop stays correct if the data
        # ordering or batch size changes. A trailing partial batch is trained
        # on as well instead of being silently dropped.
        batch_targets = jumbled_targets[start:start + BATCH_SIZE]
        # Gradients only need clearing once per optimizer step.
        model.zero_grad()
        # The model handles one document at a time; concatenate the per-document
        # (1, num_authors) outputs into one batch of scores.
        tag_scores = torch.cat([model(document) for document in batch_documents])
        loss = loss_function(tag_scores, batch_targets)
        loss.backward()
        optimizer.step()
    # Report (and early-stop on) the loss of the epoch's final batch.
    print(loss)
    if loss < LOSS_THRESHOLD:
        print(epoch)
        break

tensor(1.2431, grad_fn=<NllLossBackward>)
tensor(0.7472, grad_fn=<NllLossBackward>)
tensor(0.5412, grad_fn=<NllLossBackward>)
tensor(0.4142, grad_fn=<NllLossBackward>)
tensor(0.3519, grad_fn=<NllLossBackward>)
tensor(0.3100, grad_fn=<NllLossBackward>)
tensor(0.2576, grad_fn=<NllLossBackward>)
tensor(0.1830, grad_fn=<NllLossBackward>)
tensor(0.1046, grad_fn=<NllLossBackward>)
tensor(0.1006, grad_fn=<NllLossBackward>)
tensor(0.0707, grad_fn=<NllLossBackward>)
tensor(0.1119, grad_fn=<NllLossBackward>)
tensor(0.0396, grad_fn=<NllLossBackward>)
tensor(0.1250, grad_fn=<NllLossBackward>)
tensor(0.1361, grad_fn=<NllLossBackward>)
tensor(0.0282, grad_fn=<NllLossBackward>)
tensor(0.0767, grad_fn=<NllLossBackward>)
tensor(0.1193, grad_fn=<NllLossBackward>)
tensor(0.0062, grad_fn=<NllLossBackward>)
tensor(0.0234, grad_fn=<NllLossBackward>)
tensor(0.0077, grad_fn=<NllLossBackward>)
tensor(0.0029, grad_fn=<NllLossBackward>)
tensor(0.0046, grad_fn=<NllLossBackward>)
tensor(0.0008, grad_fn=<NllLossBac

In [21]:
# Predict the most likely author index for every training document.
# Inference does not need gradients, so skip building autograd graphs.
with torch.no_grad():
    training_predictions = [np.argmax(model(document).view(-1).numpy()) for document in training_data]
print("Training Accuracy: {}%".format(100*np.mean(targets.detach().numpy() == training_predictions)))

Training Accuracy: 99.53333333333333%


In [22]:
# Sentence vectors for the held-out documents, built like the training ones
# except that only words seen in training (present in vocab, hence in idf) and
# known to glove_vectors are used.
w2v_averaged_test = {}
for author in authors:
    author_documents = []
    for document in test_processed_2[author]:
        sentence_vectors = []
        for sentence in document:
            usable_words = [word for word in sentence if word in glove_vectors and word in vocab]
            if not usable_words:
                # Nothing to embed for this sentence.
                continue
            vectors = [glove_vectors[word] for word in usable_words]
            weights = [idf[word] for word in usable_words]
            weight_total = sum(weights)
            if weight_total != 0:
                # IDF-weighted mean of the word vectors.
                accumulated = np.zeros(len(vectors[0]))
                for vector, weight in zip(vectors, weights):
                    accumulated += vector * weight
                accumulated /= weight_total
                sentence_vector = accumulated.astype(np.float32)
            else:
                # All weights are zero: fall back to the plain mean.
                sentence_vector = np.mean(np.array(vectors), axis=0)
            sentence_vectors.append(sentence_vector)
        author_documents.append(sentence_vectors)
    w2v_averaged_test[author] = author_documents

In [23]:
# Test tensors grouped by author, in the same author order as `authors`.
testing_data = [
    convert_document_to_torch_input(document)
    for author in authors
    for document in w2v_averaged_test[author]
]

In [24]:
# Predict the most likely author index for every held-out document.
# Inference does not need gradients, so skip building autograd graphs.
with torch.no_grad():
    testing_predictions = [np.argmax(model(document).view(-1).numpy()) for document in testing_data]
# Build the true labels from the data itself: author index k repeated once per
# test document of author k, matching the order in which testing_data was built.
testing_targets = np.repeat(
    np.arange(len(authors)),
    [len(w2v_averaged_test[author]) for author in authors],
)
print("Testing Accuracy: {}%".format(100*np.mean(testing_targets == testing_predictions)))

Testing Accuracy: 78.0%
