# Generating training set

In [1]:
import sys
sys.path.append('../TTI/')
%load_ext autoreload
%autoreload 2

We need to first import the graph of categories.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

graph = CategoriesGraph()
categories = graph.categories


Reading topics graph


For every category we need to find `s` words in the neighbourshood nodes. Let's how it works with the example category "Machine learning algorithms".

In [3]:
from TTI.TrainingSet import getNodeWordSet

s = 50
wordRepresentation = getNodeWordSet("Category:Machine learning algorithms", graph, numberOfWords=s, debug=True)

Currently visiting: Machine learning algorithms
Currently visiting: Algorithms
Currently visiting: Machine learning
Currently visiting: Applied mathematics
Currently visiting: Computer algebra
Currently visiting: Algorithm description languages
Currently visiting: Algorithmic trading
Currently visiting: Approximation algorithms
Currently visiting: Behavior selection algorithms
Currently visiting: Bioinformatics algorithms
Currently visiting: Calendar algorithms
Currently visiting: Checksum algorithms
Currently visiting: Combinatorial algorithms
Currently visiting: Compression algorithms
Currently visiting: Computer arithmetic algorithms
Currently visiting: Concurrent algorithms
Currently visiting: Cryptographic algorithms
Currently visiting: Data mining algorithms
Currently visiting: Database algorithms
Currently visiting: Digit-by-digit algorithms
Currently visiting: Digital signal processing
Currently visiting: Distributed algorithms
Currently visiting: Error detection and correction

In [4]:
print("Found {} words:\n\n".format(len(wordRepresentation)), wordRepresentation)

Found 50 words:

 ['algorithmic', 'combinatorial', 'digit-by-digit', 'external', 'error', 'fingerprinting', 'machine', 'mathematics', 'algebra', 'compression', 'algorithm', 'heuristic', 'and', 'mining', 'database', 'languages', 'distributed', 'processing', 'algorithms', 'protocols', 'group', 'line', 'clipping', 'division', 'signal', 'behavior', 'bioinformatics', 'checksum', 'arithmetic', 'computational', 'applied', 'evolutionary', 'graph', 'detection', 'description', 'digital', 'selection', 'theory', 'approximation', 'calendar', 'concurrent', 'memory', 'correction', 'cryptographic', 'learning', 'data', 'computer', 'trading', 'matrix', 'fair']


Now we can use our `doc2vec` model to encode our set of words as a numeric vector.

In [5]:
from TTI.doc2vec import encode_article 

print("Encoded numeric vector:\n", encode_article("".join(wordRepresentation)))

Encoded numeric vector:
 [ 1.57011941e-03 -1.14319194e-03 -7.50709849e-04  3.70273629e-04
  1.54556090e-03  9.03662702e-04  1.00285048e-03  6.65366126e-04
  8.41695233e-04  1.15689379e-03  6.29762653e-04 -4.24629543e-04
 -4.20738739e-04  7.59956311e-04  1.28291699e-03  1.15829415e-03
 -2.90239841e-04 -6.72268390e-04 -8.38602195e-04 -1.22561643e-03
  1.48316589e-03 -1.57441816e-03 -1.21585195e-04  9.09041672e-04
  4.52965993e-04 -2.93156863e-05  1.32239284e-03  7.16225943e-04
 -9.27706715e-04  4.09226282e-04  4.44408885e-04  1.57303584e-03
 -8.31634330e-04  1.45097321e-03 -1.18715968e-03 -1.62772811e-03
  8.43850896e-04  1.20600371e-03  1.46042637e-03 -1.53379852e-03
 -1.65229791e-03  4.15280694e-04  4.11874731e-04 -6.86984451e-04
  9.05374472e-04 -1.36414135e-04 -9.55812517e-04  1.26046571e-03
 -1.05738733e-03  1.52708008e-03 -1.42025715e-03 -1.04907656e-03
  1.44765759e-03 -4.93587984e-04  1.44493370e-03  3.91225767e-04
 -1.04183084e-04 -1.43711281e-03 -5.27591270e-04 -1.31794158e-03


Now for every class in categories graph I need to calculate the words represntation set.

In [22]:
import tqdm
import pandas as pd

data = []
wordsCount = 50

for category in tqdm.tqdm(categories):
    nodeId = "Category:{}".format(category)
    wordRepresentation = getNodeWordSet(nodeId, graph, numberOfWords=wordsCount)
    numericVector = encode_article("".join(wordRepresentation))
    data.append([nodeId, wordRepresentation, numericVector])

dataFrame = pd.DataFrame(data, columns=['Category', 'Words', 'Representation'])


100%|██████████| 10/10 [00:00<00:00, 109.01it/s]


In [18]:
dataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[subfields, main, mathematics, main_topic_clas...","[7.1092065e-05, 0.0015041557, 0.0012917432, 0...."
1,Category:Main topic articles,"[subfields, main, mathematics, main_topic_clas...","[7.1092065e-05, 0.0015041557, 0.0012917432, 0...."
2,Category:Academic disciplines,"[subfields, main, arts, academics, biblical, m...","[0.0011213522, -3.3170538e-05, 0.0010152729, 0..."
3,Category:Subfields by academic discipline,"[archaeological, meteorology, subfields, seism...","[-0.0016465506, -0.0006757527, 0.0015840229, 0..."
4,Category:Scholars by subfield,"[subfields, violence, archaeologists, bioinorg...","[-0.0007645557, -0.0009938681, 0.0014720332, -..."


In [23]:
from TTI.config import DATA_DIR
import os

fileName= "training_set_{}.csv".format(wordsCount)
filePath = os.path.join(DATA_DIR, fileName)

dataFrame.to_csv(filePath,index_label="id")