In [1]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [2]:
from bertopic import BERTopic


In [3]:
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression

In [4]:
import glob
import json
import os.path

In [5]:
from nltk.stem import WordNetLemmatizer

In [6]:
from nltk import pos_tag
from nltk.corpus import wordnet

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
import pandas as pd

In [9]:
from sklearn.feature_extraction.text import CountVectorizer


In [10]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [11]:
def get_sim(u,v):

    emb1 = embedder.encode(u)
    if isinstance(v, list):
        v = ' '.join(v)
    emb2 = embedder.encode(v)
    
    return float(util.cos_sim(emb1, emb2)[0][0])

In [12]:
def concatenate_chunks(string_list, chunk_size):
    if chunk_size > len(string_list):
        return [" ".join(string_list)]
    else:
        chunks = []
        for i in range(0, len(string_list), chunk_size):
            chunk = "".join(string_list[i:i + chunk_size])
            chunks.append(chunk)
        return chunks

In [13]:
not_stemmed_word = pd.read_csv("../text_processing/ped_not_stemmed_word_based_wiki_score.csv")
corpus = not_stemmed_word["word"].tolist()

In [14]:
with open('../text_processing/data/wiki_l1_ped_link_pt.json', 'r') as fp:
    all_links = json.load(fp)

In [15]:
corpus = list(all_links.keys())

In [16]:
len(set(list(all_links.keys())))

737

In [17]:
len(corpus)

737

In [18]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [22]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/xtest/nltk_data...


True

In [23]:
lem_words = []
for el in corpus:
    w = lemmatizer.lemmatize(el, get_wordnet_pos(el))
    lem_words.append(w)
lem_words = list(set(lem_words))

In [24]:
sim_s = []
for el in lem_words:
    sim_s.append(get_sim(el, 'pedestrian'))

In [25]:
df = pd.DataFrame({'words':lem_words, 'score':sim_s})
df = df.sort_values(by=['score'], ascending=False)
df

Unnamed: 0,words,score
159,pedestrian,1.000000
489,pedestrianize,0.807013
476,pedestrianism,0.792180
605,pedestriantraffic,0.781348
426,pedestrian crossing,0.756751
...,...,...
522,prose,0.079213
19,fundraise,0.065307
495,auto-free zones,0.052380
478,shared use path,0.052221


In [26]:
df = df[df['score'] > df['score'].quantile(0.25)]
df

Unnamed: 0,words,score
159,pedestrian,1.000000
489,pedestrianize,0.807013
476,pedestrianism,0.792180
605,pedestriantraffic,0.781348
426,pedestrian crossing,0.756751
...,...,...
481,british english,0.194685
635,unbelted,0.194659
420,gateless,0.194155
312,auto,0.194069


In [27]:
corpus = lem_words

In [28]:
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""

corpus_embeddings = embedder.encode(corpus)

# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform clustering
# clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=2) #, affinity='cosine', linkage='average', distance_threshold=0.4)

clustering_model = AgglomerativeClustering(n_clusters=None, metric='cosine', linkage='average', distance_threshold=0.75) #, affinity='cosine', linkage='average', distance_threshold=0.4)


clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])



In [29]:
for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  6
['floating bridge', 'cobblestone', 'walkway', 'neighborhood', 'car', 'oneway', 'public transportation', 'motorist', 'trafficcircle', 'urban sprawl', 'bosporus bridge', 'bridle path', 'urban', 'alley', 'toll plaza', 'scenic', 'highway', 'route', 'promenade', 'pedestrian bridge', 'peterborough', 'go someplace', 'hiker', 'thoroughfare', 'journey', 'town', 'eastbound', 'rightofway', 'north-south', 'railroadtrack', 'hike', 'motorize', 'motor scooter', 'stonehaven', 'vehicular traffic', 'residential', 'southbound', 'canoe', 'mobility scooter', 'area', 'permanentway', 'crossing', 'europe', 'landscape', 'vehicular', 'pedestrian', 'urban renewal', 'york', 'westbound', 'boardwalk', 'downtown', 'lane', 'transportation', 'facility', 'spaghetti junction', 'travel', 'pavement', 'transportationsystem', 'crossstreet', 'pathway', 'overbridge', 'multistorey', 'tunnel', 'door', 'road surface', 'one-lane', 'side', 'two-lane', 'one-way', 'bus-only', 'corridor', 'trekker', 'maze', 'entrance', 'tr

In [30]:
len(clustered_sentences.items())

45

In [31]:
len(corpus)

670

In [32]:
from sklearn.cluster import KMeans


In [33]:

"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then k-mean clustering is applied.
"""

corpus_embeddings = embedder.encode(corpus)

# Perform kmean clustering
num_clusters = 30
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    print(f's-id: {sentence_id}, c-id: {cluster_id}')
    clustered_sentences[cluster_id].append(corpus[sentence_id])



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
s-id: 0, c-id: 29
s-id: 1, c-id: 19
s-id: 2, c-id: 0
s-id: 3, c-id: 26
s-id: 4, c-id: 11
s-id: 5, c-id: 26
s-id: 6, c-id: 0
s-id: 7, c-id: 15
s-id: 8, c-id: 7
s-id: 9, c-id: 13
s-id: 10, c-id: 19
s-id: 11, c-id: 13
s-id: 12, c-id: 5
s-id: 13, c-id: 21
s-id: 14, c-id: 28
s-id: 15, c-id: 25
s-id: 16, c-id: 24
s-id: 17, c-id: 8
s-id: 18, c-id: 10
s-id: 19, c-id: 7
s-id: 20, c-id: 25
s-id: 21, c-id: 11
s-id: 22, c-id: 27
s-id: 23, c-id: 14
s-id: 24, c-id: 21
s-id: 25, c-id: 8
s-id: 26, c-id: 15
s-id: 27, c-id: 6
s-id: 28, c-id: 5
s-id: 29, c-id: 0
s-id: 30, c-id: 15
s-id: 31, c-id: 14
s-id: 32, c-id: 21
s-id: 33, c-id: 13
s-id: 34, c-id: 25
s-id: 35, c-id: 6
s-id: 36, c-id: 3
s-id: 37, c-id: 3
s-id: 38, c-id: 24

In [34]:
len(corpus_embeddings)

670

In [35]:
type(corpus_embeddings)

numpy.ndarray

In [36]:
clustering_model.score(embedder.encode(["pedestrian"]))

-0.2785797119140625

In [37]:
len(clustering_model.cluster_centers_)

30

In [38]:
c_id = []
labels = []
avg_score = []

for i, cluster in enumerate(clustered_sentences):
    c_sim = 0
    for el in cluster:
        c_sim += get_sim(el, 'pedestrian')
    print(f"average sim cluster {i} to pedestrian is: {c_sim/len(cluster)}")
    
    print(cluster)
    c_id.append(i)
    labels.append(cluster)
    avg_score.append(c_sim/len(cluster))
    print("")

average sim cluster 0 to pedestrian is: 0.20563234695616892
['cobblestone', 'floozie', 'shank nag', 'playmate', 'shank mare', 'sashay', 'ian botham', 'rambler', 'taligrade', 'ambler', 'bunion', 'toddle', 'hamble', 'palmigrade', 'robert barclay allardice', 'roundhouse kick', 'patten']

average sim cluster 1 to pedestrian is: 0.27524380683898925
['accident', 'plaintiff', 'handicapped', 'injury', 'robinson', 'at bat', 'violation', 'advocacy', 'protect', 'safety', 'obstruction', 'negligence', 'collision', 'fatality', 'lack', 'interference', 'interurban', 'gephyrophobia', 'unbelted', 'bark']

average sim cluster 2 to pedestrian is: 0.2556475274264812
['pace', 'stalker', 'distance', 'mile', 'last mile', 'space', 'ward', 'twelve inch', 'protégé', 'wade']

average sim cluster 3 to pedestrian is: 0.32445978709295686
['trafficcircle', 'interchange', 'bridle path', 'highway', 'route', 'thoroughfare', 'eastbound', 'drivearound', 'southbound', 'westbound', 'boardwalk', 'lane', 'pavement', 'road sur

average sim cluster 24 to pedestrian is: 0.2592041633766273
['neighborhood', 'urban sprawl', 'urban', 'peterborough', 'plaza', 'town', 'stonehaven', 'residential', 'area', 'urban renewal', 'york', 'downtown', 'facility', 'multistorey', 'tram', 'district', 'shopping', 'london', 'localroad', 'precinct', 'change location', 'city', 'bedroomcommunity', 'auto-free zones', 'interior', 'amenity', 'zone', 'urbanist', 'mall']

average sim cluster 25 to pedestrian is: 0.30682008102050295
['speed', 'car', 'motorist', 'motorize', 'motor scooter', 'movement', 'canoe', 'mobility scooter', 'roller skates', 'wheelchair', 'stroller', 'carpooler', 'skateboard', 'auto', 'vehicle', 'unmotorized', 'nondriver', 'locomotion', 'driver', 'automobile', 'jaywalker', 'motor', 'internal combustion engine']

average sim cluster 26 to pedestrian is: 0.22477058297203434
['drunk', 'headon', 'man', 'character', 'world', 'verse', 'hour', 'night', 'time', 'day', 'statue', 'purpose', 'bustling', 'bed', 'heart', 'mountain l

In [39]:
labels[3]

['trafficcircle',
 'interchange',
 'bridle path',
 'highway',
 'route',
 'thoroughfare',
 'eastbound',
 'drivearound',
 'southbound',
 'westbound',
 'boardwalk',
 'lane',
 'pavement',
 'road surface',
 'yellowline',
 'one-lane',
 'two-lane',
 'trafficcop',
 'roundabout',
 'pound pavement',
 'street',
 'pavementartist',
 'roadsurface',
 'trafficwarden',
 'frontage road',
 'intersection',
 'asphalt',
 'directional',
 'freeway',
 'trail',
 'three-lane',
 'streetsign',
 'curbless',
 'taxistand',
 'trafficisland',
 'trafficlane',
 'four-lane',
 'roadway',
 'earthbound',
 'interstate',
 'roadhog',
 'road rage',
 'runway',
 'northbound',
 'scenicroute',
 'geometrid',
 'checkpoint',
 'the highway code',
 'multilane',
 'road',
 'ramp']

In [40]:
for i, cls in enumerate(labels):
    if 'walk frame' in cls:
        print(cls)

['baby walker', 'walk through', 'walkathon', 'warwalking', 'forwalk', 'hike', 'whistle walk', 'footwalk', 'walker', 'firewalk', 'racewalking', 'walk cane', 'crosswalk', 'stroll', 'walk', 'walkman', 'catwalk', 'knucklewalk', 'walk beat', 'random walk', 'walkable', 'stop walk', 'jaywalk', 'bushwalk', 'walk frame', 'walkabout', 'intentional walk', 'by walk', 'spacewalk', 'walk around']


In [41]:
def find_closest_cluster(word, labels):
    sim_score = []
    for label in labels:
        max_sim = 0
        for w in label:
            sim = get_sim(w, word)
            if sim > max_sim:
                max_sim = sim
        sim_score.append(max_sim)
    max_value = max(sim_score)
    max_index = sim_score.index(max_value)
    return max_index

In [42]:
len(corpus)

670

In [43]:
key_cid = dict()
for el in all_links:
    search_keys_cls = []
    for i, cls in enumerate(labels):
        if el in cls:
            search_keys_cls.append(i)
            key_cid[el] = i

    if not search_keys_cls:
        # find closest cluster
        print(el)
        key_cid[el] = find_closest_cluster(el, labels)
        print(key_cid[el])
#         break

dogs
26
checkpoints
3
plazas
24
excursions
18
mornings
26
skateboards
25
connecting
19
ramps
3
alleys
11
levels
5
lives
26
corridors
11
wards
2
factors
5
roads
3
pattens
0
crowded
21
connections
19
walking
8
speeding
25
types
21
footpaths
10
facilities
24
zones
24
underpasses
9
crosswalks
8
fatalities
1
congested
21
collisions
1
crossings
22
vehicles
25
legs
4
escalators
23
asphalted
3
boots
4
powers
5
womens
26
roadways
3
chances
21
lines
7
overpasses
9
guides
7
stairs
11
childs
21
neighborhoods
24
nights
26
ways
21
entrances
11
amenities
24
landscaped
13
sidewalks
12
protected
1
rails
23
parks
12
forces
5
signals
5
areas
24
paths
11
rights
11
running
4
tours
18
persons
21
motorized
25
movements
25
activities
13
trips
18
citys
24
highways
3
buildings
29
intoxicated
22
miles
2
exercises
4
pathways
11
precincts
24
thoroughfares
3
streets
3
bridges
29
students
21
interchanges
3
spaces
2
feats
21
lanes
3
junctions
29
enclosed
24
fundraising
7
adults
21
yorks
24
deaths
26
purposes
26
shelt

In [44]:
key_cid

{'cross street': 22,
 'stop walk': 8,
 'pavement': 3,
 'boardwalk': 3,
 'dogs': 26,
 'midfoot': 10,
 'roller shoe': 10,
 'pattern': 7,
 'totterer': 6,
 'motor': 25,
 'checkpoints': 3,
 'merit': 5,
 'approach': 7,
 'rambler': 0,
 'pedestrian crossing': 12,
 'plazas': 24,
 'europe': 17,
 'tour': 18,
 'staggerer': 6,
 'arterial': 4,
 'excursions': 18,
 'interstate': 3,
 'tread': 6,
 'bicycle': 15,
 'your foot': 20,
 'scenicroute': 3,
 'mile': 2,
 'unmotorized': 25,
 'use your leg': 4,
 'blindcurve': 6,
 'runway': 3,
 'mornings': 26,
 'death': 26,
 'limited-access': 19,
 'footbreadth': 10,
 'skateboards': 25,
 'canoe': 25,
 'obstruction': 1,
 'kilofoot': 10,
 'five toe': 20,
 'vehicular': 27,
 'floating bridge': 29,
 'deambulatory': 14,
 'quadrupedally': 14,
 'step in': 7,
 'nightrider': 6,
 'soul': 26,
 'undergroundrailway': 11,
 'bicyclerack': 15,
 'connecting': 19,
 'tube foot': 10,
 'obesity': 4,
 'trip': 18,
 'parader': 6,
 'control': 5,
 'nondriver': 25,
 'downtown': 24,
 'tollcollec

In [45]:
values = {i for i in key_cid if key_cid[i]==26}
values

{'air',
 'bed',
 'bustling',
 'character',
 'childhood',
 'day',
 'death',
 'deaths',
 'dogs',
 'drunk',
 'experience',
 'headon',
 'heart',
 'hour',
 'hours',
 'hov',
 'lameness',
 'life',
 'lives',
 'man',
 'morning',
 'mornings',
 'mountain lion',
 'night',
 'nights',
 'purposes',
 'quality',
 'rush hour',
 'solace',
 'soul',
 'statue',
 'time',
 'times',
 'verse',
 'woman',
 'women',
 'womens',
 'world',
 'year'}

In [46]:
org_keys = []
for ids, cls in enumerate(labels):
    values = list({i for i in key_cid if key_cid[i]==ids})
    org_keys.append(values)

In [47]:
labels[7]

['development',
 'fundraise',
 'copenhagenization',
 'guide',
 'dress',
 'fashion',
 'take step',
 'look both way',
 'separation',
 'take photo',
 'businessdistrict',
 'analysis',
 'at-grade',
 'line',
 'frontage',
 'reduction',
 'inclined plane',
 'emphasis',
 'approach',
 'relation',
 'enclose',
 'design',
 'step forward',
 'style',
 'step',
 'stepwise',
 'form',
 'passage',
 'step in',
 'pattern',
 'provision',
 'manner',
 'prose',
 'loop',
 'behavior',
 'flow',
 'study',
 'realism',
 'comparison',
 'survey']

In [48]:
org_keys[7]

['behavior',
 'patterns',
 'approach',
 'frontage',
 'step',
 'step forward',
 'development',
 'relation',
 'provision',
 'flow',
 'stepwise',
 'study',
 'dress',
 'fashion',
 'pattern',
 'realism',
 'comparison',
 'at-grade',
 'lines',
 'form',
 'look both way',
 'style',
 'passage',
 'reduction',
 'line',
 'analyses',
 'manner',
 'step in',
 'guide',
 'inclined plane',
 'separation',
 'prose',
 'take photo',
 'analysis',
 'businessdistrict',
 'loop',
 'emphasis',
 'design',
 'take step',
 'guides',
 'copenhagenization',
 'survey',
 'fundraising']

In [49]:
def df2_df1(df1, df2):
    cond = df2['f_name'].isin(df1['f_name'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [50]:
dict_of_files = dict()
dict_of_idx =  dict()
root_dir = '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt'
for idxx, keys in enumerate(org_keys):
#     print(idxx, keys)
#     dict_of_files[idx]= []
    f_dict = dict()
    for k in keys:
        f_names = [f.rsplit('/', 1)[1] for f in all_links[k]]
        f_names = [f'{root_dir}/{f}.txt' for f in f_names if os.path.isfile(f'{root_dir}/{f}.txt')]
        f_dict[k] = pd.DataFrame({'f_name': f_names})
    
    sorted_key = []
    for k in sorted(f_dict, key=lambda k: len(f_dict[k])):
#         print(k, len(f_dict[k]))
        sorted_key.append(k)
    
    for idx, k in enumerate(sorted_key):
#         print(sorted_key[idx:], k)
        for el in sorted_key[idx:]:
            if el == k:
                continue
#             print(f'we are going to calc: {el} - {k}')
            f_dict[el] = df2_df1(f_dict[k], f_dict[el])
           
    
    # create id for each catagory
    dict_of_idx[idxx] = dict()
    # create dict of dict
    dict_of_files[idxx] = dict()
    for i, el in enumerate(f_dict):
        dict_of_idx[idxx][i] = el
        dict_of_files[idxx][i] = list(f_dict[el]['f_name'].tolist())      
        
#     break

In [51]:
dict_of_idx[1]

{0: 'gephyrophobia',
 1: 'injury',
 2: 'interurban',
 3: 'accident',
 4: 'fatalities',
 5: 'accidents',
 6: 'robinson',
 7: 'unbelted',
 8: 'violation',
 9: 'plaintiff',
 10: 'protected',
 11: 'handicapped',
 12: 'at bat',
 13: 'injuries',
 14: 'safety',
 15: 'advocacy',
 16: 'collision',
 17: 'obstruction',
 18: 'bark',
 19: 'negligence',
 20: 'collisions',
 21: 'lack',
 22: 'interference'}

In [52]:
labels[1]

['accident',
 'plaintiff',
 'handicapped',
 'injury',
 'robinson',
 'at bat',
 'violation',
 'advocacy',
 'protect',
 'safety',
 'obstruction',
 'negligence',
 'collision',
 'fatality',
 'lack',
 'interference',
 'interurban',
 'gephyrophobia',
 'unbelted',
 'bark']

In [53]:
len(dict_of_files)

30

In [54]:
len(dict_of_files[0])

17

In [55]:
dict_of_files[0]

{0: ['/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Robert_Barclay_Allardice.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Robert_Barclay_Allardice#Family.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Robert_Barclay_Allardice#Feats_of_Pedestrianism.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Robert_Barclay_Allardice#The_thousand-hour_walk.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Robert_Barclay_Allardice#Descent_of_the_title.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Pedestrianism.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Emma_Sharp.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Ury_House.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/James_Webster-Wedderburn.txt',
  '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Fritwell_Manor.txt',
  '/home/xtest/projects/text_proc

In [56]:
len(dict_of_files[0][11])

29

In [57]:
dict_of_files[0][11]

['/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Cat%27s_eye_(road).txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Philadelphia_Main_Line.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Retroreflector.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Horsham_Township,_Montgomery_County,_Pennsylvania.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Planned_unit_development.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Licking_Run_(Beaver_Run_tributary).txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Mont_Clare_Bridge.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/A_Catalogue_of_Crime.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Cleveland_Public_Parks_District.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/Zoning.txt',
 '/home/xtest/projects/text_processing/data/wiki_texts/ped_pt/History_of_ur

In [58]:
edit_line = "Our editors will review what you’ve submitted and determine whether to revise the article"

In [59]:
cluster_data = dict()
for cluster in dict_of_files:
#     if cluster == 10:
    cluster_data[cluster] = dict()
    docs = []
    categories = []

    s = 0
    for idx in dict_of_files[cluster]:

        s += len(dict_of_files[cluster][idx])

        for f_path in dict_of_files[cluster][idx]:
            with open(f_path, "r") as rf:
                fl = rf.read().splitlines()
                fl = [l.replace(edit_line, " ") if edit_line in l else l for l in fl]
                fl = [l.strip() for l in fl  if l != '']
                nl = concatenate_chunks(fl, 5)
                docs.extend(nl)
                cat = [idx] * len(nl)
                categories.extend(cat)
    cluster_data[cluster]['data'] = docs
    cluster_data[cluster]['target'] = categories
#     break

In [60]:
len(cluster_data[10]['data'])

11872

In [61]:
s = 0
for el in dict_of_files[10]:
    print(len(dict_of_files[10][el]))
    s += len(dict_of_files[10][el])

10
99
94
53
1
1
2
0
79
2
0
0
9
1
0
81
4
83
0
0
35
86
0
4
0
0
78
0
0
3
0
1
41


In [62]:
s

767

In [63]:
dict_of_idx[10]

{0: 'flatfoot',
 1: 'foot soldier',
 2: 'tube foot',
 3: 'roller shoe',
 4: 'clawfoot',
 5: 'footsore',
 6: 'footswitch',
 7: 'footstrike',
 8: 'dry foot',
 9: 'footpeg',
 10: 'kilofoot',
 11: 'megafoot',
 12: 'cleave foot',
 13: 'footwrap',
 14: 'foothalt',
 15: 'little toe',
 16: 'clubfoot',
 17: 'footpaths',
 18: 'footbreadth',
 19: 'footmeal',
 20: 'big toe',
 21: 'footstep',
 22: 'midfoot',
 23: 'hindfoot',
 24: 'clawfooted',
 25: 'footrail',
 26: 'footer',
 27: 'webfoot',
 28: 'bumblefoot',
 29: 'stairfoot',
 30: 'splayfoot',
 31: 'footboy',
 32: 'footpath'}

In [64]:
list(set(list(dict_of_idx[3].values())))

['intersection',
 'thoroughfare',
 'paved',
 'trafficisland',
 'roads',
 'the highway code',
 'earthbound',
 'trail',
 'four-lane',
 'roadway',
 'bridle path',
 'street',
 'multilane',
 'road',
 'scenicroute',
 'interchanges',
 'freeways',
 'southbound',
 'lanes',
 'eastbound',
 'checkpoints',
 'westbound',
 'roundabout',
 'streets',
 'road surface',
 'geometrid',
 'trafficlane',
 'drivearound',
 'highway',
 'three-lane',
 'pavementartist',
 'interstate',
 'lane',
 'boardwalk',
 'roadhog',
 'highways',
 'ramp',
 'route',
 'thoroughfares',
 'trafficcop',
 'roadways',
 'directional',
 'ramps',
 'trafficcircle',
 'interchange',
 'pavement',
 'road rage',
 'runway',
 'yellowline',
 'roadsurface',
 'streetsign',
 'one-lane',
 'pound pavement',
 'curbless',
 'trafficwarden',
 'northbound',
 'taxistand',
 'intersections',
 'routes',
 'frontage road',
 'asphalted',
 'two-lane',
 'freeway']

In [65]:
names = list(set(list(dict_of_idx[3].values())))
names

['intersection',
 'thoroughfare',
 'paved',
 'trafficisland',
 'roads',
 'the highway code',
 'earthbound',
 'trail',
 'four-lane',
 'roadway',
 'bridle path',
 'street',
 'multilane',
 'road',
 'scenicroute',
 'interchanges',
 'freeways',
 'southbound',
 'lanes',
 'eastbound',
 'checkpoints',
 'westbound',
 'roundabout',
 'streets',
 'road surface',
 'geometrid',
 'trafficlane',
 'drivearound',
 'highway',
 'three-lane',
 'pavementartist',
 'interstate',
 'lane',
 'boardwalk',
 'roadhog',
 'highways',
 'ramp',
 'route',
 'thoroughfares',
 'trafficcop',
 'roadways',
 'directional',
 'ramps',
 'trafficcircle',
 'interchange',
 'pavement',
 'road rage',
 'runway',
 'yellowline',
 'roadsurface',
 'streetsign',
 'one-lane',
 'pound pavement',
 'curbless',
 'trafficwarden',
 'northbound',
 'taxistand',
 'intersections',
 'routes',
 'frontage road',
 'asphalted',
 'two-lane',
 'freeway']

In [66]:
len(cluster_data[10]['data'])

11872

In [67]:
len(cluster_data[10]['target'])

11872

In [68]:
classes = [names[i] for i in cluster_data[10]["target"]]

In [69]:
len(classes)

11872

### Supervised

In [138]:
def get_supervised_topics():
    topics_dic = dict()
    for cid in range(num_clusters):
        print(cid)
        empty_dimensionality_model = BaseDimensionalityReduction()
        clf = LogisticRegression()
        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
        vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
        topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, umap_model=empty_dimensionality_model,
        hdbscan_model=clf, ctfidf_model=ctfidf_model)
        topics, probs = topic_model.fit_transform(cluster_data[cid]['data'], y=np.array(cluster_data[cid]['target']))
        for label in labels[cid]:
            ped_similar_topics_a, ped_similarity_a = topic_model.find_topics(label + " pedestrian", top_n=10)
            l = []
            for t in ped_similar_topics_a:
                for el in topic_model.get_topic(t):
                    l.append(el[0])
        topics_dic[cid] = list(set(l))
    return topics_dic

In [139]:
supervised_topics = get_supervised_topics()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [144]:
labels[0]

['skulk',
 'non',
 'tittup',
 'merit',
 'raik',
 'violation',
 'unpaved',
 'stalker',
 'sage',
 'pawn',
 'hov',
 'warn',
 'shamble',
 'shank nag',
 'waulk',
 'shank mare',
 'ward']

In [140]:
supervised_topics

{0: ['level',
  'johnson',
  'lakeview',
  'tunnel',
  'used',
  'crossing',
  'islands',
  'lafayette',
  'bryant park',
  'alcohol',
  'points',
  'opened',
  'us80',
  'wards',
  'warhol',
  'register historic places',
  'rack',
  'sirens',
  'signals',
  'installed',
  'zone',
  'clarita',
  'asphalt',
  'railway',
  'albany',
  'phoenix',
  'center',
  'north',
  'person',
  'rails',
  'drivers',
  '2007',
  'tokyo',
  'selma',
  'river',
  'public',
  'skunk ape',
  'social credit',
  'toro',
  'speed limits',
  'red light',
  'national',
  'washington heights',
  'merit',
  'joyce',
  'united',
  'broadcast',
  'left',
  'sr',
  'stop',
  'network',
  'speed',
  'boardwalk',
  'london',
  'bayona',
  'barnaby',
  'st',
  'limit',
  'triborough bridge',
  'miles',
  'laws',
  'areas',
  'hospital',
  'lincoln highway',
  'reported',
  'park',
  'road signs',
  'orphanage',
  'traffic',
  'sounds',
  'hov',
  'hexham',
  'thai',
  'tactile',
  'raitt',
  'county',
  'property',
  

In [125]:
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))

In [126]:
topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model)

In [127]:
topics, probs = topic_model.fit_transform(cluster_data[3]['data'], y=np.array(cluster_data[3]['target']))

In [128]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1487,0_talk_march 2019_persistent_vandalism
1,1,1471,1_talk_february 2008_georgia tech_article prom...
2,2,1250,2_buses_use_crossing_seattle
3,3,1152,3_carnival_havana_pierce_used
4,4,1128,4_putin_series_brando_butler
5,5,1101,5_acid_tesla_accused_panhandling
6,6,849,6_article_march 2007_utcreplyreply_promoted
7,7,675,7_swansea_canal_glenrothes_mcnair
8,8,661,8_new_bridges_theater_port authority
9,9,518,9_rights way_footpaths_segway_culdesac


In [134]:
labels[3]

['walkman',
 'walk frame',
 'knucklewalk',
 'intentional walk',
 'baby walker',
 'spacewalk',
 'catwalk',
 'footwalk',
 'walker',
 'by walk',
 'walk around',
 'forwalk',
 'walk through',
 'stroll',
 'walkathon',
 'bushwalk',
 'stop walk',
 'random walk',
 'walk in park',
 'firewalk',
 'walkable',
 'walkabout',
 'whistle walk',
 'walk beat',
 'warwalking',
 'jaywalk',
 'walk',
 'walk cane']

In [136]:
for label in labels[3]:
    ped_similar_topics_a, ped_similarity_a = topic_model.find_topics(label + " pedestrian", top_n=10)
    l = []
    for t in ped_similar_topics_a:
        for el in topic_model.get_topic(t):
            l.append(el[0])
print(set(l))

{'buses', '2016 201729in', 'footpath', 'confessions', 'seattle', 'agents', 'used', 'crossing', 'plate indicating', 'transport hubs', 'ampelmnnchen', 'crowd', 'grid', 'reduce', 'steps', 'mutcd', 'calming', 'arrow', 'semesters', 'red', 'signals', 'green', 'traffic lights', 'safety', 'pedestrian fatalities', 'introduced', 'naparima college', 'venice', 'oneway', 'washington park', 'tractor', 'obesity', 'pedestrians', 'jogger', 'ahead', 'feat', 'buttons', 'river', 'mad hatter', 'pedestrian crossing', 'footpaths', 'stretch', 'chinese', 'flashing', 'pontoon', 'bay run', 'covered states standards', '84', 'headphone', 'situations covered', 'ontario', 'meili', 'decreased 18 semesters', 'speed', 'crashes', '2008', 'viaduct', 'shared', 'car', 'use', 'carfree', 'riverside park', 'aps', 'law', 'traffic', 'alleys', 'gaits', 'high line', 'bicycle', 'tactile', 'reckoning', 'millennium', 'bridge', 'states', 'talk', 'street', 'shibuya', 'design', 'signs', 'bridges', 'form level', 'new', 'tetch', 'marina'

In [129]:
len(cluster_data[11]['data'])

22222

In [111]:
ped_similar_topics_a, ped_similarity_a = topic_model.find_topics("walking", top_n=38)
print(ped_similar_topics_a)

[1, 0, -1, 7, 11, 8, 9, 3, 4, 10, 5, 6, 2]


In [112]:
for t in ped_similar_topics_a:
    l = []
    print(t)
    for el in topic_model.get_topic(t):
        l.append(el[0])
    print(l)
#     print(ped_auto_model_a.get_topic(t))
    print('\n')

1
['displaystyle', 'tends', 'serverside', 'stressors', 'transportation encourages walking', 'general availability public', 'climate change internal', 'environment frequent exercise', 'obesity related medical', 'minutes operation engine']


0
['article', 'used', 'bridge', 'pedestrian', 'comments', '2019', 'public', 'park', 'including', 'south']


-1
['new', 'street', 'talk', 'article', 'used', 'time', 'people', 'construction', 'march', 'including']


7
['ahasuerus', 'eternal', 'writer', 'figure', 'cartaphilus', 'man', 'wandering jew appears', 'short story', 'appeared', 'marko']


11
['city', 'private', 'bahria', 'new', 'amenities', 'largest', 'people', 'communities mexico', 'number gated', 'middle class']


8
['total', 'following', 'following subcategories total', 'reflect recent changes', 'list reflect', 'changes category following', 'recent', 'total list', 'pages', 'switch games']


9
['pierces', 'new hampshire', 'new', 'buchanan', 'compromise', 'southern', 'administration', 'support'

In [105]:
hierarchical_topics = topic_model.hierarchical_topics(cluster_data[3]['data'])


100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:05<00:00,  5.09it/s]


In [107]:
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─pedestrian fatalities_calming_semesters_speed_carfree
│    ├─■──pedestrian fatalities_calming_carfree_speed_semesters ── Topic: 19
│    └─■──pedestrian fatalities_footpath_transport hubs_infrastructures_feat ── Topic: 25
└─talk_bridge_article_used_park
     ├─talk_new_article_march_used
     │    ├─talk_article_used_road_2019
     │    │    ├─march 2019_article_persistent_vandalism_semiprotection
     │    │    │    ├─■──talk_march 2019_persistent_vandalism_semiprotection ── Topic: 0
     │    │    │    └─■──article_march 2007_utcreplyreply_promoted_think ── Topic: 6
     │    │    └─new_park_used_time_traffic
     │    │         ├─new_street_time_people_article
     │    │         │    ├─new_park_used_article_including
     │    │         │    │    ├─■──song_jew_frontage_said_tomlinson ── Topic: 11
     │    │         │    │    └─new_park_used_article_people
     │    │         │    │         ├─used_article_people_park_years
     │    │         │    │         │    ├─new_baltimore_

## Guided Topic Modeling

In [141]:
def get_guided_topics():
    topics_dic = dict()
    for cid in range(num_clusters):
        print(cid)
        
        cluster_seed = list(set(labels[cid] + ['pedestrian']))
        
        
        vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
        topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, 
                       seed_topic_list=[cluster_seed], nr_topics='auto')
        
        topics, probs = topic_model.fit_transform(cluster_data[cid]['data'])
        
        for label in labels[cid]:
            ped_similar_topics_a, ped_similarity_a = topic_model.find_topics(label + " pedestrian", top_n=10)
            l = []
            for t in ped_similar_topics_a:
                for el in topic_model.get_topic(t):
                    l.append(el[0])
        topics_dic[cid] = list(set(l))
    return topics_dic

In [142]:
guided_topics = get_guided_topics()

0
1
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	

In [145]:
labels[1]

['crowd',
 'scale',
 'power',
 'natural environment',
 'support',
 'creation',
 'control',
 'deambulatory',
 'speed',
 'density',
 'effect',
 'air',
 'study',
 'emphasis',
 'activity',
 'relation',
 'signal',
 'fashion',
 'energy',
 'design',
 'force',
 'building',
 'factor',
 'uninteresting',
 'system',
 'obesity',
 'pointduty',
 'stream',
 'unmotorized',
 'oncoming',
 'pattern',
 'environment',
 'naturestrip',
 'duty',
 'reduction',
 'behavior',
 'construction',
 'purpose',
 'geometrid',
 'circulation',
 'statue',
 'olympic games',
 'sight',
 'survey',
 'analysis',
 'task',
 'take photo',
 'enjoy nature',
 'busy',
 'view',
 'elevate',
 'style',
 'dress',
 'skill',
 'property',
 'at-grade',
 'line',
 'shopping',
 'cycle',
 'arterial',
 'base on ball',
 'variety',
 'guide',
 'recreational',
 'attention',
 'nature',
 'wind',
 'development',
 'type',
 'form']

In [143]:
guided_topics

{0: ['occupants',
  'astronaut',
  'used',
  'bump',
  'narrows sidesroad narrows',
  'rutherglen',
  'lane vehicles carrying',
  'cross',
  'wearing',
  'deflect',
  'harper',
  'fatality',
  'tim',
  'directional',
  'seat belt legislation',
  'payette',
  'median',
  'heavy vehicles exceeding',
  'concrete barrier',
  'tactile paving',
  'mandatory',
  'children',
  'lanes',
  'cambuslang',
  'farmeloan',
  'roadside',
  'straight',
  'pedestrian',
  'used traffic bottleneck',
  'constituency',
  'road dangerous',
  'vehicles',
  'ahead',
  'built',
  'cycling talk',
  'tenements',
  'edge',
  'blocks installed',
  'buttons',
  'timer',
  'cyclists',
  'graph',
  'exceeding 3500',
  'indicating actual',
  'national',
  'hill',
  'helmet law',
  'left',
  'oneway traffic pointing',
  'australia',
  'towns',
  'change road direction',
  'course priority road',
  'bhrf',
  'limit',
  'uneven',
  'use',
  'dalmarnock',
  'pedestrian signals',
  'march 2013',
  'indicating distance',
  '

In [146]:
augmented_topics = dict()
for el in supervised_topics:
    topics = list(set(supervised_topics[el] + guided_topics[el]))
    lem_words = []
    for ew in topics:
        w = lemmatizer.lemmatize(ew, get_wordnet_pos(ew))
        lem_words.append(w)
    augmented_topics[el] = lem_words

In [147]:
augmented_topics

{0: ['occupant',
  'astronaut',
  'level',
  'johnson',
  'injury',
  'lakeview',
  'tunnel',
  'use',
  'bump',
  'narrows sidesroad narrows',
  'cross',
  'island',
  'lafayette',
  'bryant park',
  'rutherglen',
  'lane vehicles carrying',
  'cross',
  'wear',
  'alcohol',
  'point',
  'deflect',
  'open',
  'harper',
  'us80',
  'fatality',
  'ward',
  'warhol',
  'tim',
  'directional',
  'register historic places',
  'seat belt legislation',
  'rack',
  'project',
  'payette',
  'median',
  'heavy vehicles exceeding',
  'concrete barrier',
  'siren',
  'signal',
  'tactile paving',
  'mandatory',
  'instal',
  'zone',
  'clarita',
  'asphalt',
  'railway',
  'child',
  'lane',
  'cambuslang',
  'albany',
  'farmeloan',
  'phoenix',
  'roadside',
  'straight',
  'center',
  'north',
  'pedestrian',
  'person',
  'used traffic bottleneck',
  'rail',
  'constituency',
  'driver',
  'road dangerous',
  'vehicle',
  '2007',
  'ahead',
  'built',
  'cycling talk',
  'tenement',
  'edge

In [161]:
def cluster_nodes(list_of_topics, num_clusters = 5):
    corpus_embeddings = embedder.encode(list_of_topics)

    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    clustered_sentences = [[] for i in range(num_clusters)]
    for sentence_id, cluster_id in enumerate(cluster_assignment):
    #     print(f's-id: {sentence_id}, c-id: {cluster_id}')
        clustered_sentences[cluster_id].append(corpus[sentence_id])


    c_id = []
    cluster_elements = []
    # avg_score = []

    for i, cluster in enumerate(clustered_sentences):
#         print(cluster)
        c_id.append(i)
        cluster_elements.append(cluster)
    return cluster_elements

In [163]:
level2_dict = dict()
for idx in augmented_topics:
    cls = cluster_nodes(augmented_topics[idx])
    level2_dict[idx] = cls

In [164]:
level2_dict

{0: [['vagrant',
   'injury',
   'passenger',
   'tunnel',
   'landscape',
   'tittup',
   'knucklewalk',
   'protect',
   'corridor',
   'safety',
   'travel',
   'bipedalism',
   'crossstreet',
   'control',
   'intersection',
   'rightofway',
   'walkway',
   'distance',
   'businessdistrict',
   'car',
   'transportationsystem',
   'air',
   'enjoy scenery',
   'passable',
   'your foot',
   'signal',
   'runway',
   'student',
   'three-lane',
   'intoxicate',
   'carson',
   'design',
   'egress',
   'solace',
   'unpaved',
   'character'],
  ['approach',
   'peripatetic',
   'alleyway',
   'yellowline',
   'auto-free zones',
   'look both way',
   'crowd',
   'passer',
   'cyclist',
   'canoe',
   'bunion',
   'power',
   'amount',
   'natural environment',
   'passersby',
   'feetless',
   'creation',
   'wayfarer',
   'deambulatory',
   'traveler',
   'tramper',
   'maindrag',
   'step in',
   'safetyisland',
   'northbound',
   'equestrianism',
   'traffic',
   'bus-only',
  

In [167]:
list(set(list(dict_of_idx[3].values())))

['random walk',
 'firewalk',
 'walk in park',
 'footwalk',
 'spacewalk',
 'walkabout',
 'stroll',
 'bushwalk',
 'baby walker',
 'catwalk',
 'walker',
 'intentional walk',
 'walkman',
 'walk beat',
 'walkathon',
 'walk frame',
 'warwalking',
 'jaywalk',
 'walking',
 'by walk',
 'walk around',
 'forwalk',
 'walk',
 'walk cane',
 'whistle walk',
 'walkable',
 'walk through',
 'knucklewalk',
 'stop walk']

In [172]:
def get_id_for_child(idx, col):
    # col from 0 to 4
    # idx from 0 to 29
    cluster_len = 5
    return 30 + idx * cluster_len + col

## Force Directed data

In [173]:
get_id_for_child(0, 0)

30

In [178]:
nodes = []
links = []
nodes.append({'id': 'Pedestrian', 'group': 0, 'labels': 'Pedestrian'})
for idx in dict_of_idx:
    links.append({'source': 'Pedestrian', 'target': idx, 'value': 5})
    nodes.append({'id': idx, 'group': idx, 'labels': list(set(list(dict_of_idx[idx].values())))})
    for i, el in enumerate(level2_dict[idx]):
        nodes.append({'id': get_id_for_child(idx, i), 'group': idx, 'labels': el})
        links.append({'source': idx, 'target': get_id_for_child(idx, i), 'value': 3})

In [179]:
sample_data = dict()
sample_data['nodes'] = nodes
sample_data['links'] = links

In [181]:
json_object = json.dumps(sample_data, indent=4)


In [182]:
with open("sample.json", "w") as outfile:
    outfile.write(json_object)

## Tree Data

In [197]:
import random


In [198]:
clusters_labels = []
leaf_data = []
for idx in dict_of_idx:
    
    clusters_labels.append({'id': idx, 'cid': idx, 'labels': list(set(list(dict_of_idx[idx].values())))})
    for i, el in enumerate(level2_dict[idx]):
        
        leaf_data.append({'id': get_id_for_child(idx, i), 'cid': idx, 'labels': el, 'tid': i, 'coverage': random.random()})


In [199]:
sample_data = dict()
sample_data['clusters'] = clusters_labels
sample_data['leaves'] = leaf_data
json_object = json.dumps(sample_data, indent=4)


In [200]:
with open("tree_example.json", "w") as outfile:
    outfile.write(json_object)

## Flare data

In [201]:
flare_dict = dict()
flare_dict['name'] = 'Pedestrian'
flare_dict['children'] = []

In [203]:
for idx in dict_of_idx:
    
    childs = {'id': idx, 'cid': idx, 'name': idx, 'children': [], 'labels': list(set(list(dict_of_idx[idx].values())))}
    for i, el in enumerate(level2_dict[idx]):
    
        childs['children'].append({'id': get_id_for_child(idx, i), 'cid': idx, 'name': get_id_for_child(idx, i), 'labels': el, 'tid': i, 'size': random.random()})
    
    flare_dict['children'].append(childs)

In [204]:
json_object = json.dumps(flare_dict, indent=4)
with open("flare_example.json", "w") as outfile:
    outfile.write(json_object)

In [108]:
# get cluster seed
cluster_seed = list(set(list(dict_of_idx[3].values()) + ['pedestrian']))

In [109]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, 
                       seed_topic_list=[cluster_seed], nr_topics='auto')
topics, probs = topic_model.fit_transform(cluster_data[3]['data'])

In [110]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4085,-1_new_street_talk_article
1,0,9580,0_article_used_bridge_pedestrian
2,1,103,1_displaystyle_tends_serverside_stressors
3,2,88,2_candidate_candidate templateitn_candidatetem...
4,3,87,3_environmentinternational_armed conflicts_cri...
5,4,36,4_0999_talk_just_realm
6,5,28,5_listed_articles_listed project_project7 arti...
7,6,26,6_original_datetime view_view file appeared_pi...
8,7,25,7_ahasuerus_eternal_writer_figure
9,8,22,8_total_following_following subcategories tota...


In [113]:
ped_similar_topics_a, ped_similarity_a = topic_model.find_topics("walking", top_n=10)
print(ped_similar_topics_a)

[1, 0, -1, 7, 11, 8, 9, 3, 4, 10]


In [114]:
for t in ped_similar_topics_a:
    l = []
    print(t)
    for el in topic_model.get_topic(t):
        l.append(el[0])
    print(l)
#     print(ped_auto_model_a.get_topic(t))
    print('\n')

1
['displaystyle', 'tends', 'serverside', 'stressors', 'transportation encourages walking', 'general availability public', 'climate change internal', 'environment frequent exercise', 'obesity related medical', 'minutes operation engine']


0
['article', 'used', 'bridge', 'pedestrian', 'comments', '2019', 'public', 'park', 'including', 'south']


-1
['new', 'street', 'talk', 'article', 'used', 'time', 'people', 'construction', 'march', 'including']


7
['ahasuerus', 'eternal', 'writer', 'figure', 'cartaphilus', 'man', 'wandering jew appears', 'short story', 'appeared', 'marko']


11
['city', 'private', 'bahria', 'new', 'amenities', 'largest', 'people', 'communities mexico', 'number gated', 'middle class']


8
['total', 'following', 'following subcategories total', 'reflect recent changes', 'list reflect', 'changes category following', 'recent', 'total list', 'pages', 'switch games']


9
['pierces', 'new hampshire', 'new', 'buchanan', 'compromise', 'southern', 'administration', 'support'

# KeyBert

In [491]:
from keybert import KeyBERT

In [492]:
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(cluster_data[3]['data'])

In [493]:
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))

In [494]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), vocabulary=vocabulary)


In [495]:
topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, 
                       nr_topics='auto')

In [496]:
topics, probs = topic_model.fit_transform(docs)



divide by zero encountered in divide


divide by zero encountered in divide



In [497]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,3891,-1_new_park_used_pedestrian
1,0,8096,0_road_new_used_station
2,1,115,1_displaystyle_equilibrium_paradox_tfrac
3,2,52,2_emissions_reduce_cooling_policies
4,3,46,3_velocity_acda_formula_stop
5,4,33,4_coordinates_folkestedetcoordinates_34_dutch
6,5,28,5_airbag_crash_occupants_safety
7,6,27,6_disabilities_access_election_devices
8,7,26,7_markerstate_highway_marker_state
9,8,22,8_audi_tdi_optional_20


In [499]:
ped_similar_topics_a, ped_similarity_a = topic_model.find_topics("walking", top_n=11)
print(ped_similar_topics_a)

[8, 5, -1, 14, 3, 0, 11, 2, 13, 6, 15]


In [500]:
ped_similar_topics_a.remove(-1)

In [501]:
for t in ped_similar_topics_a:
    l = []
    print(t)
    for el in topic_model.get_topic(t):
        l.append(el[0])
    print(l)
#     print(ped_auto_model_a.get_topic(t))
    print('\n')

8
['audi', 'tdi', 'optional', '20', 'new', 'torque', 'turbocharged', 'seats', 'transmission', 'manual']


5
['airbag', 'crash', 'occupants', 'safety', 'collisions', 'sensors', 'standard', 'volvo', 'states', 'designed']


14
['parking', 'priority', 'dangers', 'perpendicular', 'toll', 'variant', 'prohibition', 'end', 'deformations', 'pollute']


3
['velocity', 'acda', 'formula', 'stop', 'driver', 'curve', 'maximum', 'average', 'headlight', 'intersection']


0
['road', 'new', 'used', 'station', 'south', 'national', 'united', 'crossing', 'time', 'tunnel']


11
['asphalt', 'used', 'deposits', 'patent', 'production', 'temperature', 'canada', 'method', 'ancient', 'million']


2
['emissions', 'reduce', 'cooling', 'policies', 'costs', 'trees', '2021', 'eu', 'wind', 'technologies']


13
['chevron', 'bend', 'rightsharp', 'marker', 'postreflexion', 'yellowblack', 'roadschevron', 'way', 'danger', 'plate']


6
['disabilities', 'access', 'election', 'devices', 'united', 'students', 'guidelines', 'ada

In [None]:
new_f = list(set(files[10]))
docs_p = []
for fn in new_f:
    with open(fn, "r") as rf:
        fl = rf.read().splitlines()
        fl = [l.replace(edit_line, " ") if edit_line in l else l for l in fl]
        fl = [l.strip() for l in fl  if l != '']
        nl = concatenate_chunks(fl, 5)
        docs_p.extend(nl)

In [317]:
unsorted_key = org_keys[7]
root_dir = '/home/xtest/text_processing/data/wiki_texts/ped_pt'
f_dict = dict()
for k in unsorted_key:
    f_names = [f.rsplit('/', 1)[1] for f in all_links[k]]
    f_names = [f'{root_dir}/{f}.txt' for f in f_names if os.path.isfile(f'{root_dir}/{f}.txt')]
    f_dict[k] = pd.DataFrame({'f_name': f_names})
    print(k, len(f_names))


bed 100
zones 100
west 100
side 100
zone 100
south 100
westbound 100
southbound 100
east-west 100
mornings 100
northbound 100
carson 100
morning 100
auto-free zones 100
eastbound 100
north-south 100


In [318]:
sorted_key = []
for k in sorted(f_dict, key=lambda k: len(f_dict[k])):
    print(k, len(f_dict[k]))
    sorted_key.append(k)

bed 100
zones 100
west 100
side 100
zone 100
south 100
westbound 100
southbound 100
east-west 100
mornings 100
northbound 100
carson 100
morning 100
auto-free zones 100
eastbound 100
north-south 100


In [319]:
for idx, k in enumerate(sorted_key):
    print(sorted_key[idx:], k)
    for el in sorted_key[idx:]:
        if el == k:
            continue
        print(f'we are going to calc: {el} - {k}')
        f_dict[el] = df2_df1(f_dict[k], f_dict[el])
#     break

['bed', 'zones', 'west', 'side', 'zone', 'south', 'westbound', 'southbound', 'east-west', 'mornings', 'northbound', 'carson', 'morning', 'auto-free zones', 'eastbound', 'north-south'] bed
we are going to calc: zones - bed
we are going to calc: west - bed
we are going to calc: side - bed
we are going to calc: zone - bed
we are going to calc: south - bed
we are going to calc: westbound - bed
we are going to calc: southbound - bed
we are going to calc: east-west - bed
we are going to calc: mornings - bed
we are going to calc: northbound - bed
we are going to calc: carson - bed
we are going to calc: morning - bed
we are going to calc: auto-free zones - bed
we are going to calc: eastbound - bed
we are going to calc: north-south - bed
['zones', 'west', 'side', 'zone', 'south', 'westbound', 'southbound', 'east-west', 'mornings', 'northbound', 'carson', 'morning', 'auto-free zones', 'eastbound', 'north-south'] zones
we are going to calc: west - zones
we are going to calc: side - zones
we are g

In [320]:
sorted_key = []
for k in sorted(f_dict, key=lambda k: len(f_dict[k])):
    print(k, len(f_dict[k]))
    sorted_key.append(k)

zone 13
morning 13
eastbound 29
northbound 37
auto-free zones 40
east-west 42
north-south 43
mornings 62
south 66
southbound 69
side 74
west 84
westbound 86
carson 97
zones 98
bed 100


In [321]:
f_dict['bed']['f_name'].tolist()

['/home/xtest/text_processing/data/wiki_texts/ped_pt/Arroyo_(creek).txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Tyne_cyclist_and_pedestrian_tunnels.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Hostile_architecture.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Rumble_strip.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Culvert.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Ada_Anderson.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Glossary_of_road_transport_terms.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Road_surface_marking.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Airbag.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Bumper_(car).txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Smartphone_zombie.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Fullpower_Technologies.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Volume_One:_UnIndi

In [195]:
df_1 = pd.DataFrame({'c_id': c_id, 'cluster':labels, 'json_key':org_keys , 'score':avg_score})
df_1 = df_1.sort_values(by=['score'], ascending=False)
df_1

Unnamed: 0,c_id,cluster,json_key,score
4,4,"[promenade, pedestrianize, thoroughfare, pedes...","[pedestrians, pedestriantraffic, pedestrianism...",0.708186
16,16,"[walk frame, stroll, knucklewalk, forwalk, foo...","[walkabout, walk cane, stroll, intentional wal...",0.407751
8,8,"[pelican crossing, cross, crossstreet, interfe...","[crosswalks, pelican crossing, crossing, cross...",0.400158
26,26,"[rightofway, trafficwarden, dual carriageway, ...","[road rage, sidewalks, roadsurface, trafficcop...",0.349862
9,9,"[travel, travel to destination, subway, bus, c...","[transportationsystem, bus-only, commuter trai...",0.338677
10,10,"[trolleycar, car, vehicular traffic, seat belt...","[vehiculartraffic, car, trolleycar, seat belt,...",0.332366
20,20,"[horse gait, bipedal, bicycle, trek, dave kuns...","[dave kunst, bimble, horse gait, bipedalism, t...",0.310068
17,17,"[handicapped, advocacy, fatality, accident, ch...","[plaintiff, injuries, accident, charitable org...",0.29737
12,12,"[motorist, driver, auto, passenger, carpooler,...","[drivearound, mobility scooter, nondriver, mot...",0.286285
27,27,"[stomper, staggerer, wayfarer, stamper, hardsh...","[plodder, hiker, stomp, stomper, staggerer, tr...",0.284224


In [199]:
len(org_keys)

30

In [None]:

file_names = [f.rsplit('/', 1)[1] for f in f_names]
l_split, r_split = l.rsplit('/', 1)
if r_split == '':
    file_name = l_split.rsplit('/', 1)[1]
else:
    file_name = l.rsplit('/', 1)[1]
if file_name + '.txt' not in file_names:
    print(f"retrive {l, file_name}")
    retrive(l, file_name, dataset=dataset)

In [208]:
file_name = 'Museum_Campus'
os.path.isfile(f'/home/xtest/text_processing/data/wiki_texts/ped_pt/{file_name}.txt')

True

In [405]:
root_dir = '/home/xtest/text_processing/data/wiki_texts/ped_pt'
files = []
for cls in org_keys:
    f_names = []
    for fn in cls:
        f_names.extend(all_links[fn])
    f_names = [f.rsplit('/', 1)[1] for f in f_names]
    f_names = [f'{root_dir}/{f}.txt' for f in f_names if os.path.isfile(f'{root_dir}/{f}.txt')]
    files.append(list(set(f_names)))

In [406]:
files[10]

['/home/xtest/text_processing/data/wiki_texts/ped_pt/Braess%27s_paradox.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Trolleybus.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Quinebaug_River.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Footbridge.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Road_signs_in_Russia.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Segregated%20cycle%20facilities.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Defensive_driving.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Risk_compensation.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Personal_rapid_transit.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Car.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/Epidemiology_of_motor_vehicle_collisions.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/501_Queen.txt',
 '/home/xtest/text_processing/data/wiki_texts/ped_pt/List_of_countri

In [213]:
for f in files:
    print(len(f))

725
1580
1261
1986
410
1442
1598
953
437
1086
474
1948
864
1044
1066
1311
1465
572
1222
1823
584
1599
806
953
398
1115
1792
1471
553
1584


In [214]:
df_1 = pd.DataFrame({'c_id': c_id, 'cluster':labels, 'json_key':org_keys, 'files':files, 'score':avg_score})
df_1 = df_1.sort_values(by=['score'], ascending=False)
df_1

Unnamed: 0,c_id,cluster,json_key,files,score
4,4,"[promenade, pedestrianize, thoroughfare, pedes...","[pedestrians, pedestriantraffic, pedestrianism...",[/home/xtest/text_processing/data/wiki_tex...,0.708186
16,16,"[walk frame, stroll, knucklewalk, forwalk, foo...","[walkabout, walk cane, stroll, intentional wal...",[/home/xtest/text_processing/data/wiki_tex...,0.407751
8,8,"[pelican crossing, cross, crossstreet, interfe...","[crosswalks, pelican crossing, crossing, cross...",[/home/xtest/text_processing/data/wiki_tex...,0.400158
26,26,"[rightofway, trafficwarden, dual carriageway, ...","[road rage, sidewalks, roadsurface, trafficcop...",[/home/xtest/text_processing/data/wiki_tex...,0.349862
9,9,"[travel, travel to destination, subway, bus, c...","[transportationsystem, bus-only, commuter trai...",[/home/xtest/text_processing/data/wiki_tex...,0.338677
10,10,"[trolleycar, car, vehicular traffic, seat belt...","[vehiculartraffic, car, trolleycar, seat belt,...",[/home/xtest/text_processing/data/wiki_tex...,0.332366
20,20,"[horse gait, bipedal, bicycle, trek, dave kuns...","[dave kunst, bimble, horse gait, bipedalism, t...",[/home/xtest/text_processing/data/wiki_tex...,0.310068
17,17,"[handicapped, advocacy, fatality, accident, ch...","[plaintiff, injuries, accident, charitable org...",[/home/xtest/text_processing/data/wiki_tex...,0.29737
12,12,"[motorist, driver, auto, passenger, carpooler,...","[drivearound, mobility scooter, nondriver, mot...",[/home/xtest/text_processing/data/wiki_tex...,0.286285
27,27,"[stomper, staggerer, wayfarer, stamper, hardsh...","[plodder, hiker, stomp, stomper, staggerer, tr...",[/home/xtest/text_processing/data/wiki_tex...,0.284224


In [215]:
df_1.to_pickle(f'ped_cluster.pk')

In [218]:
edit_line = "Our editors will review what you’ve submitted and determine whether to revise the article"

In [418]:
len(new_f)

474

In [407]:
new_f = list(set(files[10]))
docs_p = []
for fn in new_f:
    with open(fn, "r") as rf:
        fl = rf.read().splitlines()
        fl = [l.replace(edit_line, " ") if edit_line in l else l for l in fl]
        fl = [l.strip() for l in fl  if l != '']
        nl = concatenate_chunks(fl, 5)
        docs_p.extend(nl)

In [423]:
len(docs_p)

4634

In [408]:
print([labels[10] ])

[['trolleycar', 'car', 'vehicular traffic', 'seat belt', 'vehicle', 'vehiculartraffic', 'vehicular', 'canoe', 'travel by car', 'vehiculation', 'automobile']]


In [409]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))

In [410]:
topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, 
                       seed_topic_list=[['trolleycar', 'car', 'vehicular traffic', 'seat belt', 
                                         'vehicle', 'vehiculartraffic', 'vehicular', 'canoe', 
                                         'travel by car', 'vehiculation', 'automobile']], nr_topics='auto')

In [412]:
topics, probs = topic_model.fit_transform(docs_p)

In [413]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1505,-1_used_new_safety_park
1,0,406,0_transit_avenue_new_station
2,1,132,1_left_yellow_stop_straight
3,2,128,2_ford_trim_available_xlt
4,3,94,3_weight_entry_prohibition_used
...,...,...,...
88,87,11,87_alaskan_ngi_potlatch_lower
89,88,11,88_zoning_private_dependency_american
90,89,10,89_path_compulsory track pedestrians_und_used ...
91,90,10,90_isa_vehicle_systems_beacons


In [452]:
ped_similar_topics_a, ped_similarity_a = topic_model.find_topics("vehicle", top_n=10)
print(ped_similar_topics_a)

[7, 1, 6, 10, 4, 5, 0, 3, 2, 8]


In [453]:
for t in ped_similar_topics_a:
    l = []
    print(t)
    for el in topic_model.get_topic(t):
        l.append(el[0])
    print(l)
#     print(ped_auto_model_a.get_topic(t))
    print('\n')

7
['suv', 'articles', 'july 2008', 'bridges', 'university', 'headlamps', 'unibody', 'infobox', 'selfdriving', 'fourwheel']


1
['safety', 'standard', 'lights', 'available', 'volvo', 'models', 'elevators', 'bus', 'expedition', 'states']


6
['acda', 'selfdriving', 'vehicles', 'safe speed', 'rule', 'rojas', 'clear distance', 'automation', 'assured', 'sight']


10
['s1 gard', 'right wheels', 'pts', 'bernoullis principle', 'object greater', 'pedestrians killed motor', 'involving rear right', 'wheels transit buses', 'curved', 'category']


4


5
['transport', 'prt', 'ring', 'cities', 'frontage roads', 'hierarchy', 'active', 'emissions', 'cyclists', 'public']


0
['used', 'pedestrian', 'sign', 'roads', 'limit', 'right', 'red', 'safety', 'united', 'signals']


3
['streetcars', 'bridge', 'light rail', 'new', 'tram', 'ttc', 'tracks', 'san', 'began', 'systems']


2
['water', 'duluth', 'trails', 'built', 'tanana', 'pontoon', 'canal', 'centre', 'airport', 'point']


8
['toll roads', 'layer', 'maca

In [285]:
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))
docs = data["data"]
categories = data["target"]
category_names = data["target_names"]

In [292]:
type(data["data"])

list

In [293]:
data["data"][0]

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [294]:
category_names[0]

'alt.atheism'

In [287]:
len(categories)

18846

In [288]:
categories

array([10,  3, 17, ...,  3,  1,  7])

In [286]:
len(docs)

18846

In [283]:
topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8
                       , nr_topics='auto')

In [284]:
topics, probs = topic_model.fit_transform(docs_p, y=labels[0])

ValueError: Length of x = 5917, length of y = 12, while it must be equal.