In [1]:
import networkx as nx
import json
from nltk import word_tokenize
from nltk.corpus import stopwords
import sys
import gensim, logging
import re
import os
import treetaggerwrapper
from numpy import average

Slow version of gensim.models.doc2vec is being used


Загружаем топики; создаём словарь вида: обработанный топик - исходный топик

Лемматизатор:

In [2]:
tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

Обработка включает в себя: токенизацию, удаление знаков препинания и стопслов, лемматизацию, сведение всех слов к нижнему регистру:

In [19]:
def analyze_topic(topic):
    stop = stopwords.words('english')

    tokens = word_tokenize(topic)
    punct = ',.()":;--&?!\'s'
    tokens = [token for token in tokens if token not in punct]
    tokens = [token for token in tokens if token not in stop]
    new_tokens = []
    for token in tokens:
        lemma = tagger.tag_text(token)[0].split('\t')[-1]
        if lemma != '@card@' and lemma != '`':
            new_tokens.append(lemma)
        else:
            new_tokens.append(token)
    
    return ' '.join(new_tokens).lower()

Загружаем топики из графа:

In [4]:
G = nx.read_gexf('topics.gexf')

In [6]:
topics = G.nodes()

In [7]:
len(topics)

21549

Обрабатываем и записываем в файл:

In [20]:
new_topics = {}
for topic in topics:
#     new_topics[analyze_topic(topic)] = topic
    analyzed_topic = analyze_topic(topic)
    if analyzed_topic not in new_topics:
        new_topics[analyzed_topic] = []
    new_topics[analyzed_topic].append(topic)

In [21]:
new_topics

{'lisbon earthquake portugal 1755': ['Lisbon Earthquake, Portugal, 1755'],
 'earthquake': ['Earthquakes'],
 'woman': ['Women'],
 'ops roman deity': ['Ops (Roman deity)'],
 'snake': ['Snakes'],
 'eve biblical figure': ['Eve (Biblical figure)'],
 'temptation': ['Temptation'],
 'weave': ['Weaving'],
 'penelope greek mythology': ['Penelope (Greek mythology)'],
 'murder': ['Murder'],
 'tombs sepulchral monument': ['Tombs & sepulchral monuments'],
 'elephant': ['Elephants'],
 'book jacket': ['Book jackets'],
 'animal fiction': ['Animals--Fiction'],
 'book cover': ['Book covers'],
 'domestic life': ['Domestic life'],
 'dwelling': ['Dwellings'],
 'oromo african people': ['Oromo (African people)'],
 'canyon': ['Canyons'],
 'teenager': ['Teenagers'],
 'peddler': ['Peddlers'],
 'market': ['Markets'],
 'street': ['Streets'],
 'art japanese': ['Arts, Japanese'],
 'photography': ['Photography'],
 'jews': ['Jews'],
 'theater': ['Theater', 'Theaters'],
 'costume': ['costumes', 'Costumes', 'Costume', '

In [10]:
for topic in new_topics:
    if len(new_topics[topic]) > 1:
        print(new_topics[topic])

['Theater', 'Theaters']
['costumes', 'Costumes', 'Costume', 'Costume -- ']
['Buildings', 'Building']
['Clothing & dress', 'Clothing and dress']
['Canoes', 'canoes']
['Africans', 'African']
['Rites and ceremonies', 'Rites & ceremonies']
['Player pianos', 'player pianos', 'Player piano']
['pianos', 'Pianos', 'Piano']
['Homes and haunts', 'Homes & haunts', 'Home and haunts']
['Social life and customs', 'Social life & customs']
['Cotton Pickers', 'Cotton pickers']
['Mothers & children', 'Mother and child']
['Families', 'Family']
['Shackles', 'shackles']
['Whipping', 'whipping']
['Gods', 'God']
['Charts, diagrams, etc.', 'Charts, diagrams, etc']
['Churches', 'churches', 'Church']
['Busts', 'busts']
['Cities & towns', 'Cities and towns']
['Horseback riding', 'horseback riding']
['Rulers', 'rulers']
['Stone', 'Stoning']
['Hospitals', 'Hospital']
['Dungeons', 'dungeons']
['Feet', 'feet']
['Politics and government', 'Politics & government']
['Charity', 'Charities']
['Harpsichord', 'Harpsichords

In [24]:
with open('analyzed_topics.json', 'w') as outfile:
    json.dump(new_topics, outfile)

Загружаем словарь вида: обработанный топик - исходный топик:

In [2]:
new_topics = json.loads(open('analyzed_topics.json').read())

Загружаем модель (основанную на английской википедии):

In [4]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
model = gensim.models.KeyedVectors.load_word2vec_format(os.path.join('word2vec_models','enwiki_5_ner.bin'), binary=True)

2017-05-30 18:56:54,652 : INFO : loading projection weights from word2vec_models\enwiki_5_ner.bin
2017-05-30 18:57:44,308 : INFO : loaded (296630, 300) matrix from word2vec_models\enwiki_5_ner.bin


In [6]:
model.init_sims(replace=True)

2017-05-30 18:57:47,010 : INFO : precomputing L2-norms of word weight vectors


In [6]:
len(new_topics)

20942

In [None]:
for i in topic_vectors:
    print(i)

Lisbon Earthquake, Portugal, 1755
Earthquakes
Women
Ops (Roman deity)
Snakes
Eve (Biblical figure)
Temptation
Weaving
Penelope (Greek mythology)
Murder
Tombs & sepulchral monuments
Elephants
Book jackets
Animals--Fiction
Book covers
Domestic life
Dwellings
Oromo (African people)
Canyons
Teenagers
Peddlers
Markets
Streets
Arts, Japanese
Photography
Jews
Theater
Theaters
costumes
Costumes
Costume
Costume -- 
Art, Modern
Art deco
Decoration and ornament
Letter-pictures
Postal service
Buildings
Building
Roads
Houses
Mountains
Settlements
Castles & palaces
Telegraph
Landscape architecture facilities
Architecture
Landscape pavilions
Fula (African people)
Tramps
Boys
Sudanese
Muslims
Clothing & dress
Clothing and dress
Hausa (African people)
Rivers
Canoes
canoes
Queens
Africans
African
Nupe (African people)
Villages
Nobility
Captives
Rites and ceremonies
Rites & ceremonies
Yoruba (African people)
Turkmen -- Clothing & dress
Turkmen
Water carriers
Forts & fortifications
Yurts
Circus animals
Pe

Проверяем, есть ли в модели коллокации, разделённые пробелами:

In [None]:
for word in model.vocab:
    if ' ' in word:
        print(word)

Составляем словарь вида: слово из модели - массив из 10-ти семантически самых близких ему слов. Словарь нужен, чтобы не использовать модель каждый раз.

In [6]:
c = 0
model_in_dic = {}
for word in model.vocab:
    c += 1
    model_in_dic[word] = []
    for i in model.most_similar(positive=[word]):
        sim_word = i[0].split('_')[0]
        model_in_dic[word].append(sim_word)
    if c == 1000:
        print(c)
        c = 0

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000


In [7]:
with open('model_in_dic.json', 'w') as outfile:
    json.dump(model_in_dic, outfile)

In [8]:
len(model_in_dic)

296630

In [None]:
with open('most_similar.json', 'w') as outfile:
    json.dump(model_in_dic, outfile)

Функция ищет topn ближайших по косинусному расстоянию слов; ищет эти слова в топиках; возвращает первые resultsn топиков, в которых нашлись эти "синонимы"

In [35]:
def sim_word2vec(word, results_n=10):
    pos_tags = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB']
    results = []

    for pos_tag in pos_tags:
#         print(pos_tag)
        word_tagged = word.replace(' ', '::') + '_' + pos_tag
        if word_tagged in model:
            for i in model.most_similar(positive=[word_tagged]):
                sim_word = i[0].split('_')[0].replace('::', '(-| )')
#                 print(sim_word, i[1])
                for topic in new_topics:
                    n = re.search('(\s|^)' + sim_word.lower() + '(\s|$)', topic)
                    if n is not None and word not in topic:
                        result = new_topics[topic] 
                        for r in result:
                            if r not in results:
                                 results.append(r)
                            if len(results) >= results_n:
                                return results

    return results

In [41]:
sim_word2vec('finger')

thumb 0.7493150234222412
forefinger 0.7490416169166565
fingertip 0.6901481747627258
wrist 0.6607204675674438
forearm 0.6588547229766846
hand 0.6340612769126892


['Tom Thumb (Tale)',
 'Hand railings',
 'Hands',
 'Hand',
 'hands',
 'hand tools',
 'Planes (Hand tools)',
 'Hand weaving',
 'Shaking hands',
 'Hand lenses']