# Generating training set

In [1]:
import sys
sys.path.append('../TTI/')
%load_ext autoreload
%autoreload 2

We need to first import the graph of categories.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

graph = CategoriesGraph()
categories = graph.categories


Reading topics graph


For every category we need to find `s` words in the neighbourshood nodes. Let's how it works with the example category "Machine learning algorithms".

In [10]:
from TTI.TrainingSet import getNodeWordSet

s = 25
wordRepresentation = getNodeWordSet("Category:Machine learning algorithms", graph, numberOfWords=s, debug=True)

Currently visiting: Machine learning algorithms
Currently visiting: Algorithms
Currently visiting: Machine learning
Currently visiting: Applied mathematics
Currently visiting: Computer algebra
Currently visiting: Algorithm description languages
Currently visiting: Algorithmic trading
Currently visiting: Approximation algorithms
Currently visiting: Behavior selection algorithms
Currently visiting: Bioinformatics algorithms
Currently visiting: Calendar algorithms
Currently visiting: Checksum algorithms
Currently visiting: Combinatorial algorithms
Currently visiting: Compression algorithms
Currently visiting: Computer arithmetic algorithms
Currently visiting: Concurrent algorithms
Currently visiting: Cryptographic algorithms
Currently visiting: Data mining algorithms
Currently visiting: Database algorithms
Currently visiting: Digit-by-digit algorithms
Currently visiting: Digital signal processing
Currently visiting: Distributed algorithms
Currently visiting: Error detection and correction

In [11]:
print("Found {} words:\n\n".format(len(wordRepresentation)), wordRepresentation)

Found 50 words:

 ['algorithm', 'arithmetic', 'selection', 'division', 'processing', 'detection', 'data', 'computer', 'cryptographic', 'protocols', 'mathematics', 'fingerprinting', 'compression', 'heuristic', 'machine', 'theory', 'mining', 'applied', 'algebra', 'distributed', 'checksum', 'approximation', 'error', 'learning', 'calendar', 'correction', 'clipping', 'behavior', 'matrix', 'bioinformatics', 'memory', 'fair', 'algorithms', 'combinatorial', 'digital', 'trading', 'graph', 'languages', 'description', 'algorithmic', 'multiplication', 'database', 'computational', 'concurrent', 'evolutionary', 'group', 'signal', 'external', 'digit-by-digit', 'line']


Now we can use our `doc2vec` model to encode our set of words as a numeric vector.

In [12]:
from TTI.doc2vec import encode_article 

print("Encoded numeric vector:\n", encode_article("".join(wordRepresentation)))

Encoded numeric vector:
 [-0.000623344152700156, -0.0010207833256572485, 8.795347821433097e-05, -0.0013246862217783928, -0.0015699148643761873, 0.0005437639774754643, 5.157524810783798e-06, 0.0008592898957431316, 0.0015347463777288795, -0.0010505293030291796, -0.0010500915814191103, -0.00023540115216746926, 0.00160161720123142, 0.0015982070472091436, -0.0006845209863968194, 0.0008842069073580205, -0.0015262473607435822, 0.0002945506712421775, -0.0009485948248766363, 0.0012103100307285786, -0.0003779845137614757, -0.0012821437558159232, -0.0015727011486887932, -0.0007104766555130482, -0.0003301545511931181, 0.0012586534721776843, -0.000768120342399925, 0.00044779485324397683, -0.001178279984742403, -0.0012921225279569626, -0.00026780238840729, -0.00029388442635536194, 0.00018687424017116427, 0.0015169637044891715, 0.0012245847610756755, -0.0009244871325790882, 0.00028595805633813143, 0.001045495504513383, 0.0012685141991823912, -0.0004194669600110501, -0.0003131949924863875, 0.000778223

Now for every class in categories graph I need to calculate the words represntation set.

In [6]:
import tqdm
import pandas as pd
import json

data = []
wordsCount = 50

for category in tqdm.tqdm(categories):
    nodeId = "Category:{}".format(category)
    wordRepresentation = getNodeWordSet(nodeId, graph, numberOfWords=wordsCount)
    numericVector = encode_article("".join(wordRepresentation))

    data.append([str(nodeId), json.dumps(wordRepresentation), json.dumps(numericVector)])

dataFrame = pd.DataFrame(data, columns=['Category', 'Words', 'Representation'])


100%|██████████| 225765/225765 [49:18<00:00, 76.31it/s]


In [7]:
dataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
1,Category:Main topic articles,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
2,Category:Academic disciplines,"[""communication"", ""articles"", ""subject"", ""scie...","[-0.0007150950841605663, 0.0004406595544423908..."
3,Category:Subfields by academic discipline,"[""areas"", ""fields"", ""information"", ""subject"", ...","[0.001037890324369073, 0.00026856939075514674,..."
4,Category:Scholars by subfield,"[""gun"", ""anthropologists"", ""cartographers"", ""s...","[0.0008707719971425831, 0.0003855710383504629,..."


We won't use csv for it cause it would be to big and slow.

In [8]:
# from TTI.config import DATA_DIR
# import os

# fileName= "training_set_{}.csv".format(wordsCount)
# filePath = os.path.join(DATA_DIR, fileName)

# dataFrame.to_csv(filePath,index_label="id")

## Saving dataset in the database

Because caluclated dataset is huge (1.3GB) the `csv` file won't be convenient. I will use SQL database insetead.

In [8]:
from TTI.config import DATABASE_PATH
import sqlite3
import os

connection = sqlite3.connect(DATABASE_PATH)

In [9]:
table_name = "training_set_{}".format(wordsCount)

dataFrame.to_sql(table_name, connection, if_exists='replace', index=False)

## Testing import data from database

In [11]:
 importedDataFrame = pd.read_sql('select * from {}'.format(table_name), connection)

In [15]:
importedDataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
1,Category:Main topic articles,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
2,Category:Academic disciplines,"[""history"", ""communication"", ""professional"", ""...","[0.0013619071105495095, 0.0009967258665710688,..."
3,Category:Subfields by academic discipline,"[""of"", ""engineering"", ""psychiatric"", ""history""...","[-0.000537737796548754, 0.0015615752199664712,..."
4,Category:Scholars by subfield,"[""of"", ""engineers"", ""geographers"", ""gun"", ""eur...","[-0.00039383742841891944, -0.00025646726135164..."
