# Generating training set

In [1]:
import sys
sys.path.append('../TTI/')
%load_ext autoreload
%autoreload 2

We need to first import the graph of categories.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

graph = CategoriesGraph()
categories = graph.categories


Reading topics graph


For every category we need to find `s` words in the neighbourshood nodes. Let's how it works with the example category "Machine learning algorithms".

In [3]:
from TTI.TrainingSet import getNodeWordSet

s = 50
wordRepresentation = getNodeWordSet("Category:Machine learning algorithms", graph, numberOfWords=s, debug=True)

Currently visiting: Machine learning algorithms
Currently visiting: Algorithms
Currently visiting: Machine learning
Currently visiting: Applied mathematics
Currently visiting: Computer algebra
Currently visiting: Algorithm description languages
Currently visiting: Algorithmic trading
Currently visiting: Approximation algorithms
Currently visiting: Behavior selection algorithms
Currently visiting: Bioinformatics algorithms
Currently visiting: Calendar algorithms
Currently visiting: Checksum algorithms
Currently visiting: Combinatorial algorithms
Currently visiting: Compression algorithms
Currently visiting: Computer arithmetic algorithms
Currently visiting: Concurrent algorithms
Currently visiting: Cryptographic algorithms
Currently visiting: Data mining algorithms
Currently visiting: Database algorithms
Currently visiting: Digit-by-digit algorithms
Currently visiting: Digital signal processing
Currently visiting: Distributed algorithms
Currently visiting: Error detection and correction

In [4]:
print("Found {} words:\n\n".format(len(wordRepresentation)), wordRepresentation)

Found 50 words:

 ['digit-by-digit', 'fingerprinting', 'distributed', 'compression', 'database', 'theory', 'calendar', 'processing', 'fair', 'detection', 'description', 'division', 'cryptographic', 'evolutionary', 'computer', 'behavior', 'bioinformatics', 'selection', 'matrix', 'heuristic', 'algorithms', 'algebra', 'memory', 'correction', 'mathematics', 'languages', 'trading', 'arithmetic', 'clipping', 'applied', 'algorithmic', 'approximation', 'combinatorial', 'concurrent', 'data', 'external', 'mining', 'line', 'and', 'computational', 'machine', 'graph', 'learning', 'algorithm', 'group', 'digital', 'signal', 'error', 'protocols', 'checksum']


Now we can use our `doc2vec` model to encode our set of words as a numeric vector.

In [5]:
from TTI.doc2vec import encode_article 

print("Encoded numeric vector:\n", encode_article("".join(wordRepresentation)))

Encoded numeric vector:
 [0.000769761682022363, 0.00104153947904706, 0.001104172901250422, -0.0010914006270468235, 0.001523416256532073, -0.0008398314821533859, -0.0002713746507652104, -0.00016400939784944057, 0.0007764255278743804, 0.001134060206823051, -9.191171557176858e-05, -0.0011730362894013524, -0.0009643429075367749, 0.0011036876821890473, 0.00034393538953736424, -0.001230453490279615, 0.0005936514353379607, -0.0013741335133090615, -0.00028574577299878, 0.0012393228244036436, 0.0011314357398077846, -0.0014283099444583058, -0.0011511000338941813, -0.0007595731876790524, -0.0004886173992417753, -0.000738111964892596, -0.00018758358783088624, 0.0010885816300287843, 0.0016476402524858713, -0.00030863904976285994, 0.00033458275720477104, -0.0007464617374353111, 0.0009380666306242347, -0.00035741293686442077, -0.0013418991584330797, -0.00034419193980284035, 0.0006167967221699655, -0.0010554930195212364, -0.0007203014101833105, 0.000428474391810596, 0.0013010621769353747, 0.0015696437

Now for every class in categories graph I need to calculate the words represntation set.

In [6]:
import tqdm
import pandas as pd
import json

data = []
wordsCount = 50

for category in tqdm.tqdm(categories):
    nodeId = "Category:{}".format(category)
    wordRepresentation = getNodeWordSet(nodeId, graph, numberOfWords=wordsCount)
    numericVector = encode_article("".join(wordRepresentation))

    data.append([str(nodeId), json.dumps(wordRepresentation), json.dumps(numericVector)])

dataFrame = pd.DataFrame(data, columns=['Category', 'Words', 'Representation'])


100%|██████████| 225765/225765 [49:18<00:00, 76.31it/s]


In [7]:
dataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
1,Category:Main topic articles,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
2,Category:Academic disciplines,"[""communication"", ""articles"", ""subject"", ""scie...","[-0.0007150950841605663, 0.0004406595544423908..."
3,Category:Subfields by academic discipline,"[""areas"", ""fields"", ""information"", ""subject"", ...","[0.001037890324369073, 0.00026856939075514674,..."
4,Category:Scholars by subfield,"[""gun"", ""anthropologists"", ""cartographers"", ""s...","[0.0008707719971425831, 0.0003855710383504629,..."


We won't use csv for it cause it would be to big and slow.

In [8]:
# from TTI.config import DATA_DIR
# import os

# fileName= "training_set_{}.csv".format(wordsCount)
# filePath = os.path.join(DATA_DIR, fileName)

# dataFrame.to_csv(filePath,index_label="id")

## Saving dataset in the database

Because caluclated dataset is huge (1.3GB) the `csv` file won't be convenient. I will use SQL database insetead.

In [8]:
from TTI.config import DATABASE_PATH
import sqlite3
import os

connection = sqlite3.connect(DATABASE_PATH)

In [9]:
table_name = "training_set_{}".format(wordsCount)

dataFrame.to_sql(table_name, connection, if_exists='replace', index=False)

## Testing import data from database

In [11]:
 importedDataFrame = pd.read_sql('select * from {}'.format(table_name), connection)

In [15]:
importedDataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
1,Category:Main topic articles,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
2,Category:Academic disciplines,"[""history"", ""communication"", ""professional"", ""...","[0.0013619071105495095, 0.0009967258665710688,..."
3,Category:Subfields by academic discipline,"[""of"", ""engineering"", ""psychiatric"", ""history""...","[-0.000537737796548754, 0.0015615752199664712,..."
4,Category:Scholars by subfield,"[""of"", ""engineers"", ""geographers"", ""gun"", ""eur...","[-0.00039383742841891944, -0.00025646726135164..."
