# Generating training set

In [1]:
import sys
sys.path.append('../TTI/')
%load_ext autoreload
%autoreload 2

We need to first import the graph of categories.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

graph = CategoriesGraph()
categories = graph.categories


Reading topics graph


For every category we need to find `s` words in the neighbourshood nodes. Let's how it works with the example category "Machine learning algorithms".

In [3]:
from TTI.TrainingSet import getNodeWordSet

s = 25
wordRepresentation = getNodeWordSet("Category:Machine learning algorithms", graph, numberOfWords=s, debug=True)

Currently visiting: Machine learning algorithms
Currently visiting: Algorithms
Currently visiting: Machine learning
Currently visiting: Applied mathematics
Currently visiting: Computer algebra
Currently visiting: Algorithm description languages
Currently visiting: Algorithmic trading
Currently visiting: Approximation algorithms
Currently visiting: Behavior selection algorithms
Currently visiting: Bioinformatics algorithms
Currently visiting: Calendar algorithms
Currently visiting: Checksum algorithms
Currently visiting: Combinatorial algorithms
Currently visiting: Compression algorithms
Currently visiting: Computer arithmetic algorithms
Currently visiting: Concurrent algorithms
Currently visiting: Cryptographic algorithms
Currently visiting: Data mining algorithms


In [4]:
print("Found {} words:\n\n".format(len(wordRepresentation)), wordRepresentation)

Found 25 words:

 ['checksum', 'algorithmic', 'trading', 'compression', 'calendar', 'concurrent', 'learning', 'approximation', 'bioinformatics', 'algorithms', 'mining', 'computer', 'algebra', 'behavior', 'machine', 'languages', 'algorithm', 'mathematics', 'combinatorial', 'selection', 'description', 'applied', 'cryptographic', 'arithmetic', 'data']


Now we can use our `doc2vec` model to encode our set of words as a numeric vector.

In [5]:
from TTI.doc2vec import encode_article 

article = " ".join(wordRepresentation)

print("Encoded numeric vector:\n", encode_article(article))

Encoded numeric vector:
 [0.2933642864227295, 0.2660072445869446, -0.9961421489715576, 0.32041075825691223, 0.3270479142665863, 0.12687724828720093, 0.44342032074928284, -0.09991951286792755, 0.14271588623523712, 0.40499868988990784, -0.5897689461708069, -0.812658965587616, -0.5341119766235352, 0.07815662026405334, -0.15428920090198517, -0.22551512718200684, -0.1138901337981224, 0.20030979812145233, -0.28977683186531067, 0.5738781690597534, -0.5722239017486572, -0.3731054961681366, 0.11635565757751465, -0.6711048483848572, 0.05266357958316803, 0.1287587434053421, -0.010078768245875835, -0.2355613112449646, -0.3634723722934723, -0.6536797881126404, -0.6671356558799744, 0.3736737072467804, 0.1605260670185089, -0.2588902711868286, -0.63358473777771, 0.03217832371592522, 0.4962356686592102, -0.15260177850723267, 0.0866607055068016, -0.35474470257759094, -0.013663590885698795, -0.04957647621631622, -0.6806261539459229, -0.31088557839393616, -0.22451043128967285, 0.2916729152202606, 0.069633

Now for every class in categories graph I need to calculate the words represntation set.

In [6]:
import tqdm
import pandas as pd
import json

data = []
wordsCount = 25

for category in tqdm.tqdm(categories):
    nodeId = "Category:{}".format(category)
    wordRepresentation = getNodeWordSet(nodeId, graph, numberOfWords=wordsCount)
    numericVector = encode_article(" ".join(wordRepresentation))

    data.append([str(nodeId), json.dumps(wordRepresentation), json.dumps(numericVector)])

dataFrame = pd.DataFrame(data, columns=['Category', 'Words', 'Representation'])


  0%|          | 1000/225765 [01:34<5:53:02, 10.61it/s]


KeyboardInterrupt: 

In [7]:
dataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
1,Category:Main topic articles,"[""world"", ""articles"", ""music"", ""technology"", ""...","[9.195921302307397e-05, -0.0007309274515137076..."
2,Category:Academic disciplines,"[""communication"", ""articles"", ""subject"", ""scie...","[-0.0007150950841605663, 0.0004406595544423908..."
3,Category:Subfields by academic discipline,"[""areas"", ""fields"", ""information"", ""subject"", ...","[0.001037890324369073, 0.00026856939075514674,..."
4,Category:Scholars by subfield,"[""gun"", ""anthropologists"", ""cartographers"", ""s...","[0.0008707719971425831, 0.0003855710383504629,..."


We won't use csv for it cause it would be to big and slow.

In [8]:
# from TTI.config import DATA_DIR
# import os

# fileName= "training_set_{}.csv".format(wordsCount)
# filePath = os.path.join(DATA_DIR, fileName)

# dataFrame.to_csv(filePath,index_label="id")

## Saving dataset in the database

Because caluclated dataset is huge (1.3GB) the `csv` file won't be convenient. I will use SQL database insetead.

In [8]:
from TTI.config import DATABASE_PATH
import sqlite3
import os

connection = sqlite3.connect(DATABASE_PATH)

In [9]:
table_name = "training_set_{}".format(wordsCount)

dataFrame.to_sql(table_name, connection, if_exists='replace', index=False)

## Testing import data from database

In [11]:
 importedDataFrame = pd.read_sql('select * from {}'.format(table_name), connection)

In [15]:
importedDataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
1,Category:Main topic articles,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
2,Category:Academic disciplines,"[""history"", ""communication"", ""professional"", ""...","[0.0013619071105495095, 0.0009967258665710688,..."
3,Category:Subfields by academic discipline,"[""of"", ""engineering"", ""psychiatric"", ""history""...","[-0.000537737796548754, 0.0015615752199664712,..."
4,Category:Scholars by subfield,"[""of"", ""engineers"", ""geographers"", ""gun"", ""eur...","[-0.00039383742841891944, -0.00025646726135164..."
