In [1]:
import glob
import pandas as pd
import json
# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# import these modules
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# from bertopic import BERTopic
import numpy as np

lemmatizer = WordNetLemmatizer()

In [2]:
import pickle

In [3]:
from bertopic import BERTopic

In [4]:
from sentence_transformers import SentenceTransformer


In [5]:
import os
from tqdm.notebook import tqdm, trange

In [6]:
def get_avg(file_names):
    count_f = 0
    sum_length = 0
    for fn in file_names:
        ln = get_length(fn)
        sum_length = sum_length + ln
        if ln > 0:
            count_f = count_f + 1
    print(f"your avg is: {sum_length/count_f}")

In [7]:
def get_length(fn):
    ln = 0
    with open(fn, "r") as rf:
        r_text = rf.readlines()
        ln = len(r_text)
    return ln

In [8]:
! ls -1 ~/text_processing/data/wiki_texts/ped_pt | wc -l

13615


In [9]:
! ls -1 ~/text_processing/data/britanica_texts/ped_pt | wc -l

1003


In [10]:
file_names_wi_p = [f for f in glob.glob("../text_processing/data/wiki_texts/ped_pt/*.txt")]
print(len(file_names_wi_p))

13615


In [11]:
file_names_br_p = [f for f in glob.glob("../text_processing/data/britanica_texts/ped_pt/*.txt")]
print(len(file_names_br_p))

1003


In [12]:
edit_line = "Our editors will review what you’ve submitted and determine whether to revise the article"

In [None]:
# list comprehension joining every two elements together in a list
# [ ''.join(x) for x in zip(lst[0::2], lst[1::2]) ]

In [114]:
not_stemmed_word = pd.read_csv("../text_processing/ped_not_stemmed_word_based_wiki_score.csv")
check_for = not_stemmed_word["word"].tolist()

In [117]:
def check_sentence(lines):
    fl = []
    fl.extend(lines[0:25])
    for l in lines[25:]:
        if any(ele in l for ele in check_for):
            fl.append(l)
    return fl

In [160]:
def concatenate_chunks(string_list, chunk_size):
    if chunk_size > len(string_list):
        return [" ".join(string_list)]
    else:
        chunks = []
        for i in range(0, len(string_list), chunk_size):
            chunk = "".join(string_list[i:i + chunk_size])
            chunks.append(chunk)
        return chunks

In [193]:
docs_p = []
for fn in file_names_wi_p:
    with open(fn, "r") as rf:
        fl = rf.read().splitlines()
        fl = [l.replace(edit_line, " ").strip() if edit_line in l else l for l in fl]
        fl = [l.strip() for l in fl  if l != '']
    docs_p.extend(fl)

In [194]:
len(docs_p)

490129

In [195]:
for fn in file_names_br_p:
    with open(fn, "r") as rf:
        fl = rf.read().splitlines()
        fl = [l.replace(edit_line, " ") if edit_line in l else l for l in fl]
        fl = [l.strip() for l in fl  if l != '']
        nl = concatenate_chunks(fl, 75)
        docs_p.extend(nl)

In [196]:
len(docs_p)

869061

In [198]:
docs_p[490120]

'The hands on experience at Safetyville includes pushing buttons, and learning to look both ways before crossing the street while using the working street lights and crosswalk lights. Before watching a video at the "fire station" the tour guide engages the children by asking questions and there is a "quiz" after the video. There are many opportunities for questions and answers and for children volunteers to help throughout the tour. Additionally, the children are taught how to make a real 911 call, and they practice dialing the correct number and how they would talk to a 911 operator. They are taught how to "stop, drop, and roll" during the fire safety portion of the session, and they practice the technique on the spot.[8]'

In [192]:
len(docs_p)

490129

In [199]:
from sklearn.feature_extraction.text import CountVectorizer


In [200]:
vectorizer_model = CountVectorizer(stop_words="english")

In [201]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

In [202]:
embeddings = sentence_model.encode(docs_p)

In [203]:
type(embeddings)

numpy.ndarray

In [204]:
with open('/raid/AISSEL/xtest/datasets/ped_wiki_embeddings.pkl', "wb") as fOut:
    pickle.dump({'sentences': docs_p, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [206]:
!du -sh /raid/AISSEL/xtest/datasets/ped_wiki_embeddings.pkl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
3.0G	/raid/AISSEL/xtest/datasets/ped_wiki_embeddings.pkl


In [219]:
topic_model = BERTopic(vectorizer_model=vectorizer_model, diversity=0.8, nr_topics=20)

In [None]:
topics, probs = topic_model.fit_transform(docs_p, embeddings)

In [221]:
hierarchical_topics = topic_model.hierarchical_topics(docs_p)


100%|███████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 43.50it/s]


In [222]:
hierarchical_topics

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
18,38,said_mr_little_did_man,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",36,utcreplyreply_bridge_talk_new_film,37,said_mr_little_did_man,1.591881
17,37,said_mr_little_did_man,"[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 14, 15]",35,said_mr_did_little_man,23,winchelsea_miss_fanny_snooks_rome,1.080854
16,36,utcreplyreply_bridge_talk_new_film,"[5, 12, 13, 16, 17, 18, 19]",33,bridge_film_new_team_city,32,utcreplyreply_talk_promoted_file_modified,1.05648
15,35,said_mr_did_little_man,"[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 14]",31,heathcliff_linton_catherine_said_ill,34,said_man_little_like_christian,1.018936
14,34,said_man_little_like_christian,"[1, 2, 3, 7, 9, 10, 11]",30,said_man_little_like_christian,9,evesham_lady_man_said_went,0.888457
13,33,bridge_film_new_team_city,"[12, 13, 17, 18, 19]",29,bridge_bicycle_bike_new_city,28,film_team_league_cape_season,0.853451
12,32,utcreplyreply_talk_promoted_file_modified,"[5, 16]",5,utcreplyreply_promoted_talk_voting_comments,16,file_modified_page_links_external,0.819635
11,31,heathcliff_linton_catherine_said_ill,"[0, 4, 8, 14]",14,ay_linton_yah_talk_asked,27,heathcliff_linton_catherine_said_ill,0.817342
10,30,said_man_little_like_christian,"[1, 2, 3, 7, 10, 11]",21,went_like_ulrich_old_did,26,said_man_little_christian_mr,0.76846
9,29,bridge_bicycle_bike_new_city,"[13, 17, 19]",19,bridge_bridges_new_construction_span,25,bicycle_bike_border_bicycles_city,0.746237


In [223]:
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─utcreplyreply_bridge_talk_new_film
│    ├─bridge_film_new_team_city
│    │    ├─bridge_bicycle_bike_new_city
│    │    │    ├─■──bridge_bridges_new_construction_span ── Topic: 19
│    │    │    └─bicycle_bike_border_bicycles_city
│    │    │         ├─■──border_visa_countries_entry_russian ── Topic: 13
│    │    │         └─■──bicycle_bike_bicycles_bikes_cycling ── Topic: 17
│    │    └─film_team_league_cape_season
│    │         ├─■──team_league_cape_stadium_teams ── Topic: 12
│    │         └─■──film_series_episode_films_batman ── Topic: 18
│    └─utcreplyreply_talk_promoted_file_modified
│         ├─■──utcreplyreply_promoted_talk_voting_comments ── Topic: 5
│         └─■──file_modified_page_links_external ── Topic: 16
└─said_mr_little_did_man
     ├─said_mr_did_little_man
     │    ├─heathcliff_linton_catherine_said_ill
     │    │    ├─■──ay_linton_yah_talk_asked ── Topic: 14
     │    │    └─heathcliff_linton_catherine_said_ill
     │    │         ├─heathcliff_linton_catherine

In [212]:
print(tree)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

