# Generating training set

In [1]:
import sys
# Extend the module search path so the project's code can be imported by the cells below.
# NOTE(review): later cells import `TTI.<module>`, which normally requires the directory
# that *contains* the TTI package to be on sys.path - confirm '../TTI/' is the right entry.
sys.path.append('../TTI/')
# Enable IPython's autoreload extension; mode 2 reloads modules before each cell
# runs, so edits to the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

We need to first import the graph of categories.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

# Build the categories graph (this cell's output shows "Reading topics graph").
graph = CategoriesGraph()
# Collection of category names; iterated later to build one training row per category.
categories = graph.categories


Reading topics graph


For every category we need to find `s` words in the neighbouring nodes. Let's see how it works with the example category "Machine learning algorithms".

In [3]:
from TTI.TrainingSet import getNodeWordSet

# Number of words requested for the example category.
s = 25
# debug=True makes the call print a "Currently visiting: ..." line per visited node
# (see the output below); the node id is the category name with a "Category:" prefix.
wordRepresentation = getNodeWordSet("Category:Machine learning algorithms", graph, numberOfWords=s, debug=True)

Currently visiting: Machine learning algorithms
Currently visiting: Algorithms
Currently visiting: Machine learning
Currently visiting: Applied mathematics
Currently visiting: Computer algebra
Currently visiting: Algorithm description languages
Currently visiting: Algorithmic trading
Currently visiting: Approximation algorithms
Currently visiting: Behavior selection algorithms
Currently visiting: Bioinformatics algorithms
Currently visiting: Calendar algorithms
Currently visiting: Checksum algorithms
Currently visiting: Combinatorial algorithms
Currently visiting: Compression algorithms
Currently visiting: Computer arithmetic algorithms
Currently visiting: Concurrent algorithms
Currently visiting: Cryptographic algorithms
Currently visiting: Data mining algorithms


In [4]:
# Report how many words were collected, followed by the words themselves.
foundHeader = "Found {} words:\n\n".format(len(wordRepresentation))
print(foundHeader, wordRepresentation)

Found 25 words:

 ['description', 'calendar', 'combinatorial', 'algorithms', 'concurrent', 'applied', 'data', 'bioinformatics', 'algorithm', 'trading', 'cryptographic', 'languages', 'algebra', 'machine', 'selection', 'mathematics', 'learning', 'approximation', 'mining', 'compression', 'arithmetic', 'algorithmic', 'computer', 'checksum', 'behavior']


Now we can use our `doc2vec` model to encode our set of words as a numeric vector.

In [10]:
from TTI.doc2vec import encode_article

# encode_article is given a single string, so join the words with spaces first.
article = " ".join(wordRepresentation)
encodedArticle = encode_article(article)

print("Encoded numeric vector:\n", encodedArticle)

Encoded numeric vector:
 [0.31989720463752747, 0.29779136180877686, -0.9996739625930786, 0.2990374267101288, 0.32499173283576965, 0.14868147671222687, 0.39899319410324097, -0.11946247518062592, 0.08580970764160156, 0.3709932863712311, -0.6376591324806213, -0.7991275191307068, -0.5793975591659546, 0.030618006363511086, -0.09017492830753326, -0.21176928281784058, -0.036709368228912354, 0.20612168312072754, -0.3078257441520691, 0.5615618228912354, -0.5726958513259888, -0.3254864811897278, 0.09675252437591553, -0.6349241137504578, 0.03299860283732414, 0.17679263651371002, 0.00012179146142443642, -0.19691196084022522, -0.38736140727996826, -0.6514764428138733, -0.5733559131622314, 0.36191919445991516, 0.1784226894378662, -0.23787179589271545, -0.5842075943946838, -0.03145549073815346, 0.46896013617515564, -0.11403825134038925, 0.1308974325656891, -0.3464210033416748, -0.025761792436242104, -0.05240115150809288, -0.7015578746795654, -0.3506559729576111, -0.23557838797569275, 0.34051316976547

Now, for every category in the categories graph, I need to calculate the word representation set.

In [6]:
import tqdm
import pandas as pd
import json

# Number of words requested per category for the full training set.
wordsCount = 50


def buildTrainingRow(category, graph, numberOfWords):
    """Build one training-set row for a single category.

    Args:
        category: Category name without the "Category:" prefix.
        graph: Categories graph, passed through to getNodeWordSet.
        numberOfWords: Number of words requested from getNodeWordSet.

    Returns:
        A list ``[node id, JSON-encoded words, JSON-encoded vector]`` whose
        order matches the 'Category', 'Words' and 'Representation' columns.
    """
    nodeId = "Category:{}".format(category)
    words = getNodeWordSet(nodeId, graph, numberOfWords=numberOfWords)
    vector = encode_article(" ".join(words))
    # Serialise both lists to JSON text so every column holds plain strings.
    return [nodeId, json.dumps(words), json.dumps(vector)]


# Per-category intermediates live inside buildTrainingRow, so running this cell
# does not rebind the notebook-level `wordRepresentation` used by the demo cells.
data = [buildTrainingRow(category, graph, wordsCount) for category in tqdm.tqdm(categories)]

dataFrame = pd.DataFrame(data, columns=['Category', 'Words', 'Representation'])


100%|██████████| 225765/225765 [49:18<00:00, 76.31it/s]


In [7]:
# Preview the first rows of the generated training set.
dataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
1,Category:Main topic articles,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
2,Category:Academic disciplines,"[""communication"", ""articles"", ""subject"", ""scie...","[-0.0007150950841605663, 0.0004406595544423908..."
3,Category:Subfields by academic discipline,"[""areas"", ""fields"", ""information"", ""subject"", ...","[0.001037890324369073, 0.00026856939075514674,..."
4,Category:Scholars by subfield,"[""gun"", ""anthropologists"", ""cartographers"", ""s...","[0.0008707719971425831, 0.0003855710383504629,..."


We won't use CSV for this because the file would be too big and slow.

In [8]:
# CSV export, intentionally disabled: the resulting file would be too big and
# slow to work with. The code is kept commented out for reference only.
# from TTI.config import DATA_DIR
# import os

# fileName= "training_set_{}.csv".format(wordsCount)
# filePath = os.path.join(DATA_DIR, fileName)

# dataFrame.to_csv(filePath,index_label="id")

## Saving dataset in the database

Because the calculated dataset is huge (1.3GB), a `csv` file won't be convenient. I will use an SQL database instead.

In [8]:
from TTI.config import DATABASE_PATH
import sqlite3
# NOTE(review): `os` is not used by any visible cell below - confirm whether it is still needed.
import os

# Open the SQLite database at the project-configured path. The connection is
# reused by the following cells and is never closed in the visible notebook.
connection = sqlite3.connect(DATABASE_PATH)

In [9]:
# One table per words-count setting, e.g. a wordsCount of 50 gives "training_set_50".
table_name = "training_set_{}".format(wordsCount)

# Write the whole frame, overwriting any existing table of the same name and
# leaving out the DataFrame index column.
dataFrame.to_sql(
    name=table_name,
    con=connection,
    index=False,
    if_exists='replace',
)

## Testing data import from the database

In [11]:
 # Read the saved table back in full to check the round trip through the database.
 selectAllQuery = 'select * from {}'.format(table_name)
 importedDataFrame = pd.read_sql(selectAllQuery, connection)

In [15]:
# Preview the first rows read back from the database.
importedDataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
1,Category:Main topic articles,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
2,Category:Academic disciplines,"[""history"", ""communication"", ""professional"", ""...","[0.0013619071105495095, 0.0009967258665710688,..."
3,Category:Subfields by academic discipline,"[""of"", ""engineering"", ""psychiatric"", ""history""...","[-0.000537737796548754, 0.0015615752199664712,..."
4,Category:Scholars by subfield,"[""of"", ""engineers"", ""geographers"", ""gun"", ""eur...","[-0.00039383742841891944, -0.00025646726135164..."
