# Comment Similarity via Doc2Vec (`gensim`)

In [2]:
import os
import csv
import gensim
import random



## Loading training and test data
Load `comments.csv` file as it contains our crawled raw data. It is in cvs-format and contains one column `comment text`, which contain the comment text and therefore form our corpus. Each document gets a label assigned. The label is the respective comment ID (`comment id`). 

Preprocess data with `gensim` (resulting in a list of words) and split to train and test data.

In [24]:
class Preprocessor():
    def __init__(self, filename):
        self.filename = filename
        # setup csv reader
        csv.field_size_limit(1000000000)
        self.n_all_samples = 0
        self.train_indices = []
        self.test_indices = []
        self._is_splitting_setup = False
        
    def init_splitting(self, test_samples=0.2, random_seed=15):
        from numpy.random import RandomState
        
        # count corpus size
        with open(self.filename, 'rt', encoding="UTF-8", newline="") as file:
            for i, _ in enumerate(file, 1):
                pass
        self.n_all_samples = i
        # split into train and test
        random = RandomState(random_seed)
        n_samples = int(self.n_all_samples*test_samples)
        all_indices = set(range(0, self.n_all_samples))
        self.test_indices = set(random.randint(0, n_samples, size=(self.n_all_samples,)))
        self.train_indices = all_indices - self.test_indices
        self._is_splitting_setup = True
        print("corpus size:", self.n_all_samples)
        print("train corpus size:", len(self.train_indices))
        print("test corpus size:", len(self.test_indices))
    
    def save(self, filename):
        if not self._is_splitting_setup:
            raise IllegalArgumentException("No split indices available to save to disk")
        with open(filename, 'wt', encoding="UTF-8", newline="") as file:
            writer = csv.writer(file, delimiter=",")
            writer.writerow(self.train_indices)
            writer.writerow(self.test_indices)
        print("Wrote indices to disk:", filename)
    
    def load(self, filename):
        with open(filename, 'rt', encoding="UTF-8", newline="") as file:
            reader = csv.reader(file, delimiter=",")
            self.train_indices = next(reader)
            self.test_indices = next(reader)
        self.n_all_samples = len(self.train_indices) + len(self.test_indices)
        print("Loaded indices from disk:", filename)
    
    def read_corpus_with_indices(self, indices):
        with open(self.filename, 'rt', encoding="UTF-8", newline="") as file:
            reader = csv.reader(file, delimiter=",")
            next(reader) #skipping header
            for i, row in enumerate(reader, 0):
                if not self._is_splitting_setup:
                    yield gensim.models.doc2vec.TaggedDocument(
                        gensim.utils.simple_preprocess(row[3].replace("\\n", " ")), 
                        [int(row[0])]
                    )
                else:
                    if i in indices:
                        yield gensim.models.doc2vec.TaggedDocument(
                            gensim.utils.simple_preprocess(row[3].replace("\\n", " ")), 
                            [int(row[0])]
                        )
                    else:
                        continue

    def read_corpus_with_id(self, cid):
        with open(self.filename, 'rt', encoding="UTF-8", newline="") as file:
            reader = csv.reader(file, delimiter=",")
            next(reader) #skipping header
            for i, row in enumerate(reader, 0):
                if int(row[0]) in cid:
                    yield gensim.models.doc2vec.TaggedDocument(
                        gensim.utils.simple_preprocess(row[3].replace("\\n", " ")), 
                        [int(row[0])]
                    )
                else:
                    continue

    def __iter__(self):
        return self.read_corpus_with_indices(self.train_indices)

## Train model

In [25]:
filename_source = os.path.join("..", "data", "comments.data")
filename_model = "word2vec_2.model"
filename_indices = "indices_2.model"
size = 50
min_count = 10
iterations = 10
workers = 4

Parameter:
- **`size` layer size**
  
  This parameter is the size of the NN layers, which correspond to the “degrees” of freedom the training algorithm has. Bigger size values require more training data, but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds.
  
- **`min_count` only use words that occur min. two times:**
  
  It's for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them.

- **`iter` number of training iterations on the dataset**
  
- **`workers` number of parallel worker threats**

In [26]:
preprocessor = Preprocessor(filename_source)
preprocessor.init_splitting(test_samples=0.05, random_seed=15)

model = gensim.models.doc2vec.Doc2Vec(size=size, min_count=min_count, iter=iterations, workers=workers)
print("Building vocabulary")
%time model.build_vocab(preprocessor)
print("Training model")
%time model.train(preprocessor, total_examples=model.corpus_count, epochs=model.iter)

# save model to disk
model.save(filename_model)
preprocessor.save(filename_indices)
# trim memory usage:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
print("Model saved to disk; Pruned in-memory model")

corpus size: 5608810
train corpus size: 5328370
test corpus size: 280440
Building vocabulary
Wall time: 5min 5s
Training model
Wall time: 1h 16min 16s
Wrote indices to disk: indices_2.model
Model saved to disk; Pruned in-memory model


## Test model

In [34]:
# Load model
#model = gensim.models.doc2vec.Doc2Vec.load(filename_model)
# Load preprocessor
#preprocessor = Preprocessor(filename_source)
#preprocessor.load(filename_indices)

# Pick a random document from the test corpus and infer a vector from the model
doc_idx = random.choice(list(preprocessor.test_indices))
doc = next(preprocessor.read_corpus_with_indices([doc_idx]))
inferred_vector = model.infer_vector(doc.words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('{}. Test Document ({}): «{}»\n'.format(doc_idx, doc.tags[0], ' '.join(doc.words)))
print('SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print('%s %s: «%s»\n' % (label, sims[index], ' '.join(next(preprocessor.read_corpus_with_id([sims[index][0]])).words)))

44385. Test Document (44385): «thanks ed another few thousand yes votes now could we have speech about new tartan curtain tearing loving families apart forever cheers»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc10,s0.001,t4):

MOST (3960056, 0.751582145690918): «sorry but with corbyn as labour leader there is no opposition party many thanks to corbyn supporters for allowing the tories free run at the next ge the many homeless disabled and vulnerable people will thank you for it»

SECOND MOST (447479, 0.7420821189880371): «comrade corbyn and ira lover mcdonnel are the lords of misrule as labour continues to disintegrate please god let them labour to defeat in»

MEDIAN (4438348, 0.35840940475463867): «negotiate from position of strength one can only imagine the excuses people would make as eu would take more from there sovereign nations without the say of the people to stop them oh wait we can greece spain portugal ireland feel the power of vote you loose that or degrade

Testing word vectors:

In [28]:
model.wv.most_similar(positive=["trump", "usa"], negative=["putin"])

[('america', 0.6986275911331177),
 ('china', 0.6789929270744324),
 ('germany', 0.6717723608016968),
 ('mexico', 0.6707333326339722),
 ('congress', 0.6639487743377686),
 ('japan', 0.6501384377479553),
 ('uk', 0.6442731022834778),
 ('us', 0.6404613256454468),
 ('tpp', 0.6269809007644653),
 ('france', 0.6255526542663574)]

In [29]:
similarities = [
    ("trump", "putin"),
    ("trump", "clinton"),
    ("trump", "president"),
    ("president", "safety"),
    ("election", "fun"),
    ("election", "but")
]
for w1, w2 in similarities:
    print("similarity {} - {}: {}".format(w1, w2, model.wv.similarity(w1, w2)))

similarity trump - putin: 0.8540221587833772
similarity trump - clinton: 0.7710283588666192
similarity trump - president: 0.5690041895616303
similarity president - safety: 0.013856460909897117
similarity election - fun: 0.1425054690336343
similarity election - but: 0.365167402266884


In [32]:
model.wv.most_similar(positive=["malaria", "disease"], negative=["penicillin"])

[('ebola', 0.6875036954879761),
 ('infection', 0.6856157779693604),
 ('desease', 0.6590758562088013),
 ('diseases', 0.6472394466400146),
 ('obesity', 0.6398852467536926),
 ('cancer', 0.6353930234909058),
 ('infections', 0.6333891153335571),
 ('illness', 0.628749668598175),
 ('tb', 0.6269285082817078),
 ('alcoholism', 0.612775444984436)]