# Generating training set

In [1]:
import sys
sys.path.append('../TTI/')
%load_ext autoreload
%autoreload 2

We need to first import the graph of categories.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

graph = CategoriesGraph()
categories = graph.categories


Reading topics graph


For every category we need to find `s` words in the neighbourshood nodes. Let's how it works with the example category "Machine learning algorithms".

In [3]:
from TTI.TrainingSet import getNodeWordSet

s = 50
wordRepresentation = getNodeWordSet("Category:Machine learning algorithms", graph, numberOfWords=s, debug=True)

Currently visiting: Machine learning algorithms
Currently visiting: Algorithms
Currently visiting: Machine learning
Currently visiting: Applied mathematics
Currently visiting: Computer algebra
Currently visiting: Algorithm description languages
Currently visiting: Algorithmic trading
Currently visiting: Approximation algorithms
Currently visiting: Behavior selection algorithms
Currently visiting: Bioinformatics algorithms
Currently visiting: Calendar algorithms
Currently visiting: Checksum algorithms
Currently visiting: Combinatorial algorithms
Currently visiting: Compression algorithms
Currently visiting: Computer arithmetic algorithms
Currently visiting: Concurrent algorithms
Currently visiting: Cryptographic algorithms
Currently visiting: Data mining algorithms
Currently visiting: Database algorithms
Currently visiting: Digit-by-digit algorithms
Currently visiting: Digital signal processing
Currently visiting: Distributed algorithms
Currently visiting: Error detection and correction

In [4]:
print("Found {} words:\n\n".format(len(wordRepresentation)), wordRepresentation)

Found 50 words:

 ['protocols', 'checksum', 'correction', 'detection', 'mathematics', 'digit-by-digit', 'behavior', 'error', 'algorithmic', 'languages', 'algorithm', 'distributed', 'compression', 'algorithms', 'arithmetic', 'calendar', 'database', 'trading', 'memory', 'approximation', 'line', 'learning', 'computer', 'group', 'machine', 'heuristic', 'signal', 'mining', 'clipping', 'data', 'bioinformatics', 'processing', 'fair', 'external', 'cryptographic', 'fingerprinting', 'concurrent', 'selection', 'digital', 'and', 'division', 'graph', 'combinatorial', 'applied', 'algebra', 'theory', 'computational', 'evolutionary', 'matrix', 'description']


Now we can use our `doc2vec` model to encode our set of words as a numeric vector.

In [5]:
from TTI.doc2vec import encode_article 

print("Encoded numeric vector:\n", encode_article("".join(wordRepresentation)))

Encoded numeric vector:
 [0.0016128408024087548, 0.0012727836146950722, -0.00127948890440166, -0.0005845176056027412, 0.00039609483792446554, 0.0007828212110325694, -0.0015695153269916773, -7.784669287502766e-05, -0.0011489606695249677, 0.001644849544391036, -0.0002112247166223824, 9.578494791639969e-05, 0.00015863182488828897, -0.0003390329657122493, 0.0012290972517803311, -0.0005746546667069197, -0.0009036296396516263, -0.000894396158400923, -0.0015278817154467106, 0.00039894101792015135, 0.0013593629701063037, -9.400719136465341e-05, 0.0008738027536310256, -0.0002559769491199404, -0.001644417061470449, 0.00034331606002524495, -0.0003985276853200048, 0.0016410311218351126, -0.0011706610675901175, 0.0010936153121292591, 0.0007323250756599009, -0.00130962033290416, -0.0007056671311147511, 0.00013713809312321246, -0.0015067275380715728, 0.0002579405263531953, -0.0011069539468735456, -0.00037781655555590987, 0.0016524329548701644, 0.00048557616537436843, 0.0001989162847166881, 0.00140303

Now for every class in categories graph I need to calculate the words represntation set.

In [6]:
import tqdm
import pandas as pd
import json

data = []
wordsCount = 50

for category in tqdm.tqdm(categories[:10]):
    nodeId = "Category:{}".format(category)
    wordRepresentation = getNodeWordSet(nodeId, graph, numberOfWords=wordsCount)
    numericVector = encode_article("".join(wordRepresentation))

    data.append([str(nodeId), json.dumps(wordRepresentation), json.dumps(numericVector)])

dataFrame = pd.DataFrame(data, columns=['Category', 'Words', 'Representation'])


100%|██████████| 10/10 [00:00<00:00, 100.78it/s]


In [7]:
dataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""subfields"", ""philosophy"", ""entertainment"", ""...","[0.0015765804564580321, 0.0009858924895524979,..."
1,Category:Main topic articles,"[""subfields"", ""philosophy"", ""entertainment"", ""...","[0.0015765804564580321, 0.0009858924895524979,..."
2,Category:Academic disciplines,"[""subfields"", ""communication"", ""art"", ""develop...","[0.0005224531632848084, -0.001038121059536934,..."
3,Category:Subfields by academic discipline,"[""subfields"", ""philosophy"", ""sociology"", ""fore...","[-0.0007273228256963193, -0.001215422526001930..."
4,Category:Scholars by subfield,"[""subfields"", ""anthropologists"", ""philosophy"",...","[5.3492618462769315e-05, 9.856982796918601e-05..."


We won't use csv for it cause it would be to big and slow.

In [8]:
# from TTI.config import DATA_DIR
# import os

# fileName= "training_set_{}.csv".format(wordsCount)
# filePath = os.path.join(DATA_DIR, fileName)

# dataFrame.to_csv(filePath,index_label="id")

## Saving dataset in the database

Because caluclated dataset is huge (1.3GB) the `csv` file won't be convenient. I will use SQL database insetead.

In [9]:
from TTI.config import DATABASE_PATH
import sqlite3
import os

connection = sqlite3.connect(DATABASE_PATH)

In [10]:
table_name = "training_set_{}".format(wordsCount)

dataFrame.to_sql(table_name, connection, if_exists='replace', index=False)

## Testing import data from database

In [11]:
 importedDataFrame = pd.read_sql('select * from {}'.format(table_name), connection)

In [15]:
importedDataFrame.head()

Unnamed: 0,Category,Words,Representation
0,Category:Main_topic_classifications,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
1,Category:Main topic articles,"[""knowledge"", ""history"", ""education"", ""article...","[0.0015570593532174826, 0.00048521943972446024..."
2,Category:Academic disciplines,"[""history"", ""communication"", ""professional"", ""...","[0.0013619071105495095, 0.0009967258665710688,..."
3,Category:Subfields by academic discipline,"[""of"", ""engineering"", ""psychiatric"", ""history""...","[-0.000537737796548754, 0.0015615752199664712,..."
4,Category:Scholars by subfield,"[""of"", ""engineers"", ""geographers"", ""gun"", ""eur...","[-0.00039383742841891944, -0.00025646726135164..."


## Importing already created dataset from csv to database

In [12]:
from TTI.config import DATA_DIR
import os

fileName= "training_set_{}.csv".format(wordsCount)
filePath = os.path.join(DATA_DIR, fileName)

# Importing csv
dataFrameFromCsv = pd.read_csv(fileName)

NameError: name 'DATA_DIR' is not defined