In [1]:
import sys
sys.path.append('../TTI/')
%load_ext autoreload
%autoreload 2

# Taksonomia, identyfikacja tekstu

Dany jest fragment hierarchii klasyfkacji tematycznej z Wikipedii (https://en.wikipedia.org/wiki/Category:Main_topic_classifications) w postaci pliku CSV.
Klasyfkacja jest grafem spójnym, gdzie węzły są tematami, a krawędzie reprezentują uszczegółowienie tematu.

Celem projektu jest zapropnowanie i przetestowanie mechanizmu automatycznej klasyfikacji tekstu Wejściem jest plik tekstowy w języku angielskim. Wyjściem jest zbiór węzłów w/w klasyfikacji tematycznej.


## Dane wejściowe

Dane wejściowe do zadania do graf spójny o 225765 węzłach, kady węzeł reprezentuje jedną kategorię. Graf nie jest uporządkowanym drzewem, może również zawierać pętle.

In [2]:
from TTI.CategoriesGraph import CategoriesGraph

categories = CategoriesGraph()

Reading topics graph


In [3]:
print("Ilość krawędzi", categories._edge_list.shape)
print("Ilość węzłów", categories._graph.number_of_nodes())

Ilość krawędzi (339250, 2)
Ilość węzłów 225765


## Zbiór treningowy

Zbiór treningowy został przygotowany z wykorzystaniem notebooka `01-tti-training-set-generate.ipynb`. Tam jest też więcej informacji o procesie generacji.

In [6]:
from TTI.config import DATABASE_PATH
import sqlite3
import pandas as pd
import json
import numpy as np
from tensorflow.keras.utils import to_categorical

table_name = "training_set_25"
connection = sqlite3.connect(DATABASE_PATH)


In [7]:
dataset = pd.read_sql('select * from {}'.format(table_name), connection)

In [8]:
dataset["Representation"] = dataset["Representation"].apply(lambda i : json.loads(i))
dataset["Category"] = dataset["Category"].apply(lambda i : i[9:])
dataset["Words"] = dataset["Words"].apply(lambda i : json.loads(i))

In [9]:
print("Dataset size:", dataset.shape)
print("Numeric represntation vector size:", len(dataset.iloc[12]["Representation"]))
print("Number of nodes in the graph:", len(dataset.iloc[12]["Words"]))

Dataset size: (225765, 3)
Numeric represntation vector size: 300
Number of nodes in the graph: 25


In [10]:
dataset

Unnamed: 0,Category,Words,Representation
0,Main_topic_classifications,"[academic, culture, human, entertainment, heal...","[-0.3755445182323456, 0.010519789531826973, -0..."
1,Main topic articles,"[academic, culture, human, entertainment, heal...","[-0.40671899914741516, 0.013835961930453777, -..."
2,Academic disciplines,"[academic, art, academics, euthenics, studies,...","[-0.09239675104618073, -0.46590009331703186, -..."
3,Subfields by academic discipline,"[subfield, academic, areas, evolutionary, fiel...","[0.085173599421978, 0.010392077267169952, -0.3..."
4,Scholars by subfield,"[subfield, academic, architects, studies, clas...","[-0.15292514860630035, -0.5975006222724915, -0..."
...,...,...,...
225760,World Wide Web stubs,"[internet, wide, system, technology, bioinform...","[0.216136172413826, -0.024581177160143852, -0...."
225761,Internet publication stubs,"[service, wide, entertainment, online, news, s...","[0.2748589515686035, 0.2310565859079361, -0.34..."
225762,Website stubs,"[websites, service, wide, entertainment, onlin...","[0.1632257103919983, 0.16291794180870056, -0.2..."
225763,Wikimedia Foundation stubs,"[websites, service, wide, entertainment, onlin...","[0.19932252168655396, 0.19686073064804077, -0...."


## Wyszukiwanie najbardziej podobnych wektorów

Do klasyfikacji posłuże się obliczaniem odległości geometrycznej pomiędzy wektorami reprezentacji doc2vec. Wektory o najmniejszej odległości zostaną zakwalifikowane jako najbardziej podobne.

In [32]:
dataset.loc[dataset['Category'] == "Machine learning algorithms"]


Unnamed: 0,Category,Words,Representation
2692,Machine learning algorithms,"[checksum, algorithmic, trading, compression, ...","[0.302137166261673, 0.3030090630054474, -1.029..."


In [34]:
from scipy import spatial

name = dataset["Category"][2692]
vector = dataset["Representation"][2692]

print("Name of category", name)

Name of category Machine learning algorithms


Teraz należe znaleźć najbardziej podobne kategorie. 

In [35]:
import tqdm

def find_simmilar(vector, count, df):
    """ Finds 'count' best matching categories with vectors simmilar to 'vector'"""
    categories = []
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
        vec = row["Representation"]
        name = row["Category"]
        categories.append((name, spatial.distance.cosine(vector, vec)))
    sorted_categories = sorted(categories, key=lambda i: i[1])
    return sorted_categories[0:count]


In [36]:
best_matching = find_simmilar(vector, 10, dataset)

100%|██████████| 225765/225765 [00:38<00:00, 5924.71it/s]


In [37]:
best_matching

[('Machine learning algorithms', 0.0),
 ('Heuristic algorithms', 0.03048611901635534),
 ('Cryptographic algorithms', 0.033841232971325574),
 ('Computer arithmetic algorithms', 0.034831473629752474),
 ('Data mining algorithms', 0.03487279219354722),
 ('Compression algorithms', 0.035516547716642144),
 ('Digit-by-digit algorithms', 0.03591962687260464),
 ('Algorithms', 0.03608325172873006),
 ('Bioinformatics algorithms', 0.03669657755224942),
 ('Approximation algorithms', 0.03737963968331992)]

In [39]:
best_matching[1]

('Heuristic algorithms', 0.03048611901635534)

# -----------------------

In [35]:
testcases_count, _ = dataset.shape

In [52]:
X = X[:10000]
Y = Y[:10000]

In [93]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=1000, random_state=0).fit(X)

In [110]:
X[2345].shape

(300,)

In [105]:
print(dataset["Category"][2345])
print(kmeans.predict(X[2345][:]))

Category:Computational fields of study


ValueError: Expected 2D array, got 1D array instead:
array=[-1.05711957e-03 -1.12174821e-04 -6.70599227e-04  2.00716982e-04
 -1.12532716e-05 -1.19818910e-03 -1.00701256e-03  1.51766115e-03
  1.38851278e-03  1.20256539e-03 -1.02999003e-03  1.59012235e-03
 -5.12395869e-04  1.04147161e-03 -1.88910679e-04 -1.18044345e-03
  3.29556147e-04  3.28550494e-04  7.17254938e-04  1.46737450e-03
  8.61251028e-04 -5.69134543e-04  1.21984247e-03 -7.90496066e-04
 -2.82717112e-04 -1.40814681e-03  3.09491617e-04 -4.13069793e-04
 -1.36248779e-03 -3.97055119e-05 -2.56790023e-04  1.99193833e-04
  4.57675749e-04  1.64242432e-04 -1.70118772e-04  8.39911751e-04
  1.49876974e-03 -1.35887484e-03 -1.46193802e-03 -1.29698543e-03
  1.13061490e-03  4.60446638e-04 -8.44697934e-04 -4.18586103e-04
  3.92560614e-04 -1.60691852e-03  1.19609770e-03 -1.41348923e-03
  1.08640420e-03 -8.45222457e-05 -1.46952469e-03  1.60608278e-03
 -4.27623512e-04  1.15115463e-03  2.16798435e-05  1.13619689e-03
  3.14626013e-05  1.47080002e-03 -1.65881461e-03 -1.47042359e-04
 -6.80239114e-04 -1.95611901e-05  4.85829427e-04 -5.07019344e-04
  3.00552725e-04  9.96913295e-04 -1.43549743e-03 -1.75481749e-04
 -9.06471279e-04 -8.44579830e-04  5.84428723e-04  4.92196297e-04
 -2.56181112e-04 -1.59404858e-03  5.15849213e-04 -1.25946326e-03
 -5.85409929e-04 -5.06524579e-04 -1.37991458e-03  1.02065981e-03
 -1.20373932e-03  8.06969183e-04 -1.45958294e-03  5.13384468e-04
  1.95905901e-04 -6.63767976e-04  1.54347601e-03 -8.10385565e-04
  3.19290062e-04  7.88671896e-05  1.47709600e-03 -6.13130338e-04
  3.46637040e-04  9.20038205e-04 -7.93774205e-04  1.36998121e-03
 -5.28734527e-04  5.67752169e-04 -1.36192038e-03 -1.29018468e-03
 -8.35585408e-04 -8.33809827e-05  1.47322111e-03 -4.50807769e-04
  9.88051877e-04 -4.07564512e-04 -1.88141377e-04  5.08922036e-04
  1.16993244e-04  1.31190900e-04 -8.29470751e-04  4.89950820e-04
 -3.31937481e-04  1.48364645e-03  2.27783501e-04  1.62656419e-03
  6.20771374e-04  8.62140732e-04  1.27514813e-03  8.78580322e-04
 -2.34614257e-04  7.03568396e-04  1.38684700e-03  2.84489885e-04
  1.56361493e-03 -9.04207816e-04  1.19785941e-03  1.00727775e-03
  6.16470934e-04  9.29585600e-04  8.16004031e-05  1.34554191e-03
  1.27612881e-03  3.68235080e-04  1.55027490e-03  1.57834333e-03
  8.32954829e-04 -1.12463476e-05 -1.57821027e-03  3.55712225e-04
  5.90891985e-04  7.37736467e-04 -1.48874952e-03  1.29009539e-03
  3.85495689e-04  1.42908981e-03 -1.42835116e-03 -1.08473236e-03
 -4.23914869e-04  1.56111224e-03 -9.88631044e-04  2.75499944e-04
 -9.33356932e-04  1.58614735e-03 -1.27871661e-03  1.42677710e-03
 -2.57479929e-04 -2.09852413e-04 -1.68683080e-04  4.53274231e-04
 -2.96063721e-04  1.46724808e-03  5.58703905e-04 -1.85719682e-06
 -1.44393474e-04 -1.14537973e-03 -4.20980097e-04 -1.24457479e-03
  4.40137403e-04 -1.21359399e-03 -1.38612522e-03 -1.40716357e-03
  9.03638953e-04  1.58693537e-03 -1.17299310e-03 -1.04041887e-03
 -2.41254500e-04  1.37853238e-03 -9.78097669e-04  5.44907671e-05
  2.90383352e-04  8.94300989e-04 -1.56898377e-03 -5.01372910e-04
 -5.47632226e-04 -4.93048923e-04  7.08205567e-04 -4.68627986e-04
 -1.69154766e-04 -7.09208834e-04 -6.77622971e-04 -1.52228065e-04
 -1.58828567e-03 -6.60329883e-04  1.11219485e-03  1.20913342e-03
  1.45616184e-03  1.14660048e-04 -4.12935595e-04 -1.32893608e-03
  1.05107611e-03 -9.36593337e-04 -1.06567240e-04  2.92538607e-05
 -1.04429456e-03  5.49213903e-04  1.53073168e-03  1.50990603e-03
  1.02154876e-03  8.81676620e-04  9.74228722e-04 -1.49112742e-03
  4.35472874e-04  1.61600835e-03  6.56703429e-04 -9.78550524e-04
  1.24469487e-04  1.10312074e-03 -7.79370657e-06 -1.09899475e-03
  5.54834143e-04  1.57351594e-03  1.00632908e-03  1.23282941e-03
  6.04544766e-04  1.48530846e-04  1.64782943e-03  1.38901023e-03
  1.65508327e-03  6.18561462e-04 -1.88564172e-05 -3.21197032e-04
  1.34382094e-03  6.69610570e-04 -1.48776360e-03  1.17876414e-04
 -5.90742420e-05  1.44980068e-03 -3.00348678e-04 -6.79177989e-04
  9.89316846e-04 -1.26218249e-03 -4.98922716e-04  7.82176794e-04
  5.35731961e-04  1.40861818e-03  6.71846501e-05  9.16450401e-04
 -3.83602368e-04 -7.97643675e-04  6.38264231e-04 -3.91801266e-04
 -1.30973803e-03 -5.10019483e-04 -5.94273617e-04 -1.59030559e-03
 -1.48945523e-03 -1.14943890e-03  9.70767229e-04 -2.36914973e-04
 -5.97302569e-04  6.70650916e-04  1.15786667e-03  1.29027443e-03
  1.26167681e-04  3.16318707e-04  1.29502523e-03  7.64396391e-04
  3.51602561e-04 -1.48247927e-03  8.66172952e-04  1.15204370e-03
 -3.92968417e-04 -1.46732887e-03  1.63210416e-03  1.12474244e-03
 -1.53865200e-03 -5.42922120e-04 -1.44492066e-03 -1.91790052e-04
 -9.91406967e-04  1.59012794e-03 -1.02278893e-03 -9.17129917e-04
 -1.24915881e-04  9.13345022e-04  1.60187774e-03  1.38059608e-03
  2.55861174e-04 -1.17688009e-03 -1.56134414e-03 -2.89360178e-04
  1.03279937e-03 -6.60107704e-04 -1.60248950e-03  1.54751528e-03
  7.74865795e-04 -1.50330117e-04  1.29592998e-04  2.58135842e-04].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [91]:
clasterCategoryMap = {}
for index, value in enumerate(kmeans.labels_):
    if value in clasterCategoryMap:
        clasterCategoryMap[value].add(dataset["Category"][index])
    else:
        clasterCategoryMap[value] = set([dataset["Category"][index]])

In [92]:
clasterCategoryMap[11]

{'Category:(ISC)²',
 'Category:Abstract data types',
 'Category:Action theorists',
 'Category:Administrative terminology',
 'Category:Aerospace engineering award winners',
 'Category:Aerospace engineers',
 'Category:Agent-based software',
 'Category:Agroecology',
 'Category:Aircraft designers',
 'Category:Algorithmic information theory',
 'Category:Alternative medicine publications',
 'Category:American civil engineering contractors',
 'Category:Animal fats',
 'Category:Anthropological categories of peoples',
 'Category:Archaeologists',
 'Category:Archaeologists appearing on Time Team',
 'Category:Archaeologists by century',
 'Category:Art history',
 'Category:Artificial intelligence researchers',
 'Category:Asphalt lakes',
 'Category:Atmospheric sciences',
 'Category:Belgian historians of religion',
 'Category:Belief revision',
 'Category:Biblical authorship debates',
 'Category:Binary arithmetic',
 'Category:Biogeographic realms',
 'Category:Biological anthropology',
 'Category:Biolo

In [80]:
print(dataset["Words"][10])
print(dataset["Category"][10])

['archaeologist', 'anthropologists', 'fictional', 'subfield', 'or', 'armenian', 'archaeology', 'appearing', 'sinhalese', 'field', 'antiquarians', 'nationality', 'women', 'indigenous', 'ethnic', 'arab', 'by', 'burgher', 'people', 'crimean', 'igbo', 'national', 'catalan', 'tamil', 'of', 'flemish', 'occupations', 'archaeologists', 'australian', 'american', 'history', 'team', 'scientists', 'ethnicity', 'time', 'geoarchaeologists', 'origin', 'on', 'in', 'iranian', 'academics', 'basque', 'tatar', 'black', 'social', 'lists', 'stubs', 'research', 'british', 'century']
Category:Archaeologists by ethnicity


In [53]:
from sklearn.preprocessing import LabelEncoder

train_y = dataset["Category"][:10000]

label_encoder = LabelEncoder()
label_encoder.fit(train_y)
Y = to_categorical((label_encoder.transform(train_y)))

In [59]:
X = np.array([np.array(xi) for xi in dataset["Representation"][:10000]])
numeric_vector_size = len(X[0])

In [60]:
Y.shape

(10000, 10000)

In [67]:
from tensorflow.keras import models
from tensorflow.keras import layers

model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(numeric_vector_size,)))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(10000, activation='softmax'))

print(model.summary())


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 256)               77056     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 10000)           

In [68]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [69]:
history = model.fit(X, Y, epochs=200, batch_size=256)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

KeyboardInterrupt: 

In [15]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.decomposition import PCA

# pca = PCA(n_components=200)
# pca.fit(X)
# print(sum(pca.explained_variance_ratio_))


In [16]:
# model = KNeighborsClassifier(n_neighbors=3)

# model.fit(X, Y)

## Zapisywanie modelu

In [None]:
# import cPickle

# # save the classifier
# with open('models/knn.pkl', 'wb') as fid:
#     cPickle.dump(model, fid)    
