In [5]:
import json
import os
import re
from itertools import combinations
from random import sample as random_sample

import matplotlib.pyplot as plt
import networkx as nx

Создаём пустой граф. Перебираем коллекции с топиками и заполняем топиками граф:

In [61]:
# Start from an empty undirected graph; the os.walk loop below adds topic nodes and weighted edges to it.
G = nx.Graph()

In [62]:
# Directory that the os.walk loop scans for topic collection JSON files.
path = './filled_topics'

In [64]:
# Build the topic co-occurrence graph from every JSON file found under `path`.
# Each file is loaded as a mapping item -> list of topics. Every pair of topics
# listed for the same item gets an edge whose 'weight' counts how many times the
# pair has been seen together; an item with a single topic only adds that node.
# NOTE: the cell accumulates into the existing G, so re-running it without
# re-creating G counts every co-occurrence again.
for root, dirs, files in os.walk(path):
    for file in files:
        print(file)
        # Join with `root` (the directory currently being walked), not `path`:
        # os.walk also descends into sub-directories, where `path/file` does not exist.
        with open(os.path.join(root, file), 'r') as collection_file:
            collection = json.load(collection_file)
        for item in collection:
            topics = collection[item]
            if len(topics) > 1:
                # combinations() yields each unordered pair (i < j) exactly once.
                for node_1, node_2 in combinations(topics, 2):
                    # Edge weight = number of times the two topics co-occur.
                    if G.has_edge(node_1, node_2):
                        # G[u][v] is the edge-attribute dict in every networkx version
                        # (G.edge[u][v] only exists in networkx 1.x).
                        G[node_1][node_2]['weight'] += 1
                    else:
                        G.add_edge(node_1, node_2, weight=1)
            elif topics:
                # A lone topic has nothing to pair with; keep it as an isolated node.
                # Empty topic lists are skipped instead of raising IndexError.
                G.add_node(topics[0])

topics_00325fd0-c607-012f-6125-58d385a7bc34.json
topics_00332790-c62b-012f-f4ba-58d385a7bc34.json
topics_004d61c0-c60a-012f-c746-58d385a7bc34.json
topics_00767000-f4d5-0131-3d91-58d385a7b928.json
topics_00a0b940-c604-012f-237d-58d385a7bc34.json
topics_00bb5980-952a-0132-c0c6-58d385a7bbd0.json
topics_00d06f30-525b-0132-a30e-58d385a7b928.json
topics_00fbbcb0-c61f-012f-36d2-58d385a7bc34.json
topics_0103a020-c605-012f-b84a-58d385a7bc34.json
topics_013e1e10-c609-012f-3e8d-58d385a7bc34.json
topics_0146e060-c530-012f-1e6f-58d385a7bc34.json
topics_015f65f0-c609-012f-b4b6-58d385a7bc34.json
topics_0160f0f0-c6ec-012f-5675-3c075448cc4b.json
topics_01683640-c608-012f-59b4-58d385a7bc34.json
topics_017492e0-c6c1-012f-fcf1-3c075448cc4b.json
topics_01799bd0-c606-012f-06e7-58d385a7bc34.json
topics_01872af0-c62a-012f-6efc-58d385a7bc34.json
topics_018b4210-c6b6-012f-6356-58d385a7bc34.json
topics_018bd5e0-c617-012f-d0e7-58d385a7bc34.json
topics_01a9aa90-c605-012f-fb7e-58d385a7bc34.json
topics_01abe100-c6b8

Смотрим, что получилось:

In [3]:
# Number of topic nodes currently in the graph.
G.number_of_nodes()

21549

In [66]:
# Peek at ten nodes. Materialise the node container first: in networkx >= 2.0
# G.nodes() is a NodeView that cannot be sliced, while list(...) works in 1.x too.
list(G.nodes())[:10]

['Lisbon Earthquake, Portugal, 1755',
 'Earthquakes',
 'Women',
 'Ops (Roman deity)',
 'Snakes',
 'Eve (Biblical figure)',
 'Temptation',
 'Weaving',
 'Penelope (Greek mythology)',
 'Murder']

In [45]:
# Print every edge as an (u, v) tuple followed by its attribute dict.
# G.edges(data=True) is available in all networkx versions (edges_iter() was
# removed in 2.0) and hands over the attributes without a second lookup.
for u, v, edge_data in G.edges(data=True):
    print((u, v), edge_data)

Записываем и читаем граф:

In [67]:
# Save the graph to 'topics.gexf' in GEXF format.
nx.write_gexf(G, 'topics.gexf')

In [2]:
# Load the graph back from 'topics.gexf', rebinding G to the loaded graph.
G = nx.read_gexf('topics.gexf')

Рисуем граф:

In [3]:
# Load the graph used for plotting from 'topics_part.gexf'.
# NOTE(review): presumably a subset of the full topic graph (going by the names);
# how that file was produced is not shown in this notebook section.
G_small = nx.read_gexf('topics_part.gexf')

In [None]:
# Draw G_small on a circular layout: red nodes, yellow edges, and node labels.
pos = nx.circular_layout(G_small)
nx.draw_networkx_nodes(G_small, pos, node_color='red', node_size=50)
nx.draw_networkx_edges(G_small, pos, edge_color='yellow')
nx.draw_networkx_labels(G_small, pos, font_size=10, font_family='Arial')
plt.axis('off')  # hide the axes frame and ticks
plt.show()
# NOTE(review): if re-enabled, savefig likely has to be called before plt.show() — confirm.
# plt.savefig('graph_circular.png', format='PNG')

Общий граф слишком большой, чтобы его нарисовать

In [19]:
# Attempt to draw the full graph G with random node positions, tiny red nodes and
# yellow edges; no labels are drawn in this cell.
pos = nx.random_layout(G)
nx.draw_networkx_nodes(G, pos, node_color='red', node_size=0.1)
nx.draw_networkx_edges(G, pos, edge_color='yellow')
plt.axis('off')  # hide the axes frame and ticks
plt.show()
# NOTE(review): if re-enabled, savefig likely has to be called before plt.show() — confirm.
# plt.savefig('graph_spring_full.png', format='PNG')

Загружаем топики:

In [8]:
# Load the analyzed topics. Later cells iterate over its keys as topic strings and
# read new_topics[topic] as a collection of graph node names.
# The `with` block closes the file handle, which the bare open(...).read() left open.
with open('analyzed_topics.json') as topics_file:
    new_topics = json.load(topics_file)

In [5]:
# Number of entries loaded from analyzed_topics.json.
len(new_topics)

20941

In [9]:
# Count how often each space-separated token occurs across all topic strings.
freq = {}
for topic in new_topics:
    for word in topic.split(' '):
        freq[word] = freq.get(word, 0) + 1

In [10]:
# Number of distinct tokens counted in freq.
len(freq)

10631

In [11]:
# Turn the counts into [word, count] pairs ordered from most to least frequent.
# list.sort is stable, so equal counts keep the iteration order of `freq`.
freq_list = [[word, count] for word, count in freq.items()]
freq_list.sort(key=lambda pair: pair[1], reverse=True)

In [15]:
# Keep only the [word, count] pairs whose count is greater than 10.
freq_list_part = [x for x in freq_list if x[1]>10]

In [42]:
# Show 20 entries of a random permutation of freq_list_part (a full-length sample
# is a shuffle). No seed is set in this notebook section, so the output differs per run.
random_sample(freq_list_part, len(freq_list_part))[:20]

[['social', 59],
 ['brazilian', 34],
 ['facility', 45],
 ['song', 61],
 ['biography', 100],
 ['line', 11],
 ['window', 13],
 ['canada', 29],
 ['interior', 18],
 ['palm', 12],
 ['food', 45],
 ['electric', 23],
 ['armament', 42],
 ['1770-1779', 45],
 ['graphic', 14],
 ['sailing', 13],
 ['bath', 20],
 ['connecticut', 12],
 ['persecution', 12],
 ['mississippi', 15]]

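Функция ищет слово среди топиков и, если подходящие топики находятся, возвращает до `results_n` их ближайших соседей в графе, отсортированных по убыванию веса ребра (соседи, в названии которых встречается само слово, пропускаются):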

In [29]:
def sim_graph(word, results_n=10):
    """Return the graph neighbors most strongly linked to topics containing `word`.

    Every key of the global `new_topics` that contains `word` as a
    whitespace-delimited token is a match. For each graph node listed under a
    matching key, the node's neighbors in the global graph `G` are collected,
    skipping neighbors whose own name contains `word`. Neighbors are ranked by
    edge weight, strongest first.

    Parameters
    ----------
    word : str
        Query word. It is lower-cased and matched literally (regex
        metacharacters such as '+' or '(' are escaped).
    results_n : int, optional
        Maximum number of neighbor names to return (default 10).

    Returns
    -------
    list
        Up to `results_n` neighbor node names; an empty list if nothing matched.
    """
    # Escape the word so it is matched literally, and compile both patterns once
    # instead of rebuilding them inside the loops.
    needle = re.escape(word.lower())
    topic_pattern = re.compile(r'(\s|^)' + needle + r'(\s|$)')
    node_pattern = re.compile(r'(\s|^|\W)' + needle + r'(\s|$|\W)')

    # neighbor name -> strongest edge weight seen from any matching node.
    # Keeping the maximum (rather than whichever weight is met first) makes the
    # ranking independent of the iteration order of new_topics / G.
    best_weight = {}

    for topic in new_topics:
        if topic_pattern.search(topic) is None:
            continue
        for node_1 in new_topics[topic]:
            for node_2 in G.neighbors(node_1):
                # A neighbor that mentions the query word itself is not a recommendation.
                if node_2 not in best_weight and node_pattern.search(node_2.lower()) is not None:
                    continue
                # G[u][v] is the edge-attribute dict in every networkx version
                # (G.edge[u][v] only exists in networkx 1.x).
                weight = G[node_1][node_2]['weight']
                if node_2 not in best_weight or weight > best_weight[node_2]:
                    best_weight[node_2] = weight

    ranked = sorted(best_weight.items(), key=lambda pair: pair[1], reverse=True)[:results_n]
    return [name for name, _weight in ranked]

In [30]:
# Example query: neighbors of the topics that contain the word 'clothing'.
sim_graph('clothing')

['Women',
 'Hats',
 'Men',
 'Dresses',
 'Coats',
 'Children',
 'Shoes',
 'Hairstyles',
 'Bonnets',
 'Girls']

Данные по количеству соседей у узлов:

In [75]:
# Degree histogram: maps a degree value to the number of nodes having that degree.
degrees = {}
for node in G.nodes():
    degree = G.degree(node)
    degrees[degree] = degrees.get(degree, 0) + 1
degrees

{0: 539,
 1: 1374,
 2: 1781,
 3: 1912,
 4: 1886,
 5: 1709,
 6: 1457,
 7: 1082,
 8: 872,
 9: 704,
 10: 574,
 11: 480,
 12: 457,
 13: 415,
 14: 312,
 15: 317,
 16: 288,
 17: 231,
 18: 233,
 19: 195,
 20: 179,
 21: 162,
 22: 149,
 23: 149,
 24: 119,
 25: 121,
 26: 113,
 27: 93,
 28: 91,
 29: 110,
 30: 103,
 31: 90,
 32: 94,
 33: 80,
 34: 63,
 35: 75,
 36: 75,
 37: 56,
 38: 59,
 39: 59,
 40: 65,
 41: 76,
 42: 60,
 43: 60,
 44: 36,
 45: 47,
 46: 36,
 47: 49,
 48: 42,
 49: 36,
 50: 46,
 51: 43,
 52: 36,
 53: 31,
 54: 43,
 55: 49,
 56: 33,
 57: 37,
 58: 39,
 59: 39,
 60: 29,
 61: 23,
 62: 39,
 63: 29,
 64: 23,
 65: 33,
 66: 28,
 67: 24,
 68: 26,
 69: 16,
 70: 26,
 71: 26,
 72: 15,
 73: 18,
 74: 17,
 75: 26,
 76: 22,
 77: 17,
 78: 24,
 79: 23,
 80: 23,
 81: 11,
 82: 14,
 83: 17,
 84: 11,
 85: 11,
 86: 12,
 87: 18,
 88: 11,
 89: 20,
 90: 14,
 91: 19,
 92: 9,
 93: 18,
 94: 24,
 95: 14,
 96: 13,
 97: 13,
 98: 14,
 99: 13,
 100: 12,
 101: 9,
 102: 13,
 103: 11,
 104: 14,
 105: 7,
 106: 14,
 107: 6

In [60]:
# Group nodes by degree: maps a degree value to the list of nodes having that degree.
degrees = {}
for node in G.nodes():
    degree = G.degree(node)
    degrees.setdefault(degree, []).append(node)
degrees

{0: ['Operatic scenes',
  'Lakes--China',
  'Parks--China',
  'Gutters (Streets)',
  'Hotels--China',
  'Lakes--Sri Lanka',
  'Villages--China',
  'Stūpas--China',
  'Latvian fiction',
  'Printing--Periodicals',
  'Religious literature',
  'Ave Maria (Choreographic work : Duncan, Isadora)',
  'Academic freedom',
  'Farewell address',
  'Mangrove swamps',
  'Television composers',
  'Women violinists',
  'Rock music fans',
  'Impresarios',
  'Independence movements',
  'Origins',
  'Clothing and dress--China',
  'Faust (Legendary character)',
  'Road maps',
  'Egba (African people)',
  'Cryptography',
  'Supermarkets',
  "Artist's studios",
  'Statue of Liberty (New York, N.Y.) -- 1890-1899',
  'Niblo’s Garden Theatre',
  'Houses -- New York (State) -- New York -- 1900-1909',
  'Arbors (Bowers) -- New York (State) -- New York -- 1800-1899',
  'Galleries & museums -- New York (State) -- New York -- 1860-1869',
  'Manhattan (New York, N.Y.)',
  'Vessels -- New York (State) -- New York -- 