In [1]:
from keybert import KeyBERT
import whisper_timestamped as whisper
from nltk import download
from nltk.corpus import stopwords

download("stopwords")
YT_LINK = "/content/vid test.weba"

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Hardcoded text

In [2]:
with open("content/oleg.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# NLP

In [3]:
from transformers import BertModel, BertTokenizerFast
from datasets import Dataset
import torch
import torch.nn.functional as F
import pandas as pd

In [4]:
model_checkpoint = 'bert-base-multilingual-cased'

tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)
model = BertModel.from_pretrained(model_checkpoint)
model = model.eval()


def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.pooler_output
    normalized_embeddings = F.normalize(embeddings, p=2)

    return normalized_embeddings

In [44]:
kw_model = KeyBERT()

tokens = txt.split(" ")
indexed_keywords = {}
df = pd.DataFrame(columns=["index", "keywords"])
for i in range(200, len(tokens), 200):
    frag = tokens[i - 200: i]
    frag = " ".join(frag)

    doc_embeddings, word_embeddings = kw_model.extract_embeddings(frag)
    keywords = kw_model.extract_keywords(frag,
                                         doc_embeddings=doc_embeddings,
                                         word_embeddings=word_embeddings)
    indexed_keywords[i] = keywords

In [45]:
values_dict = {"index": [], "frags": []}
for i, keywords in indexed_keywords.items():
    words = " ".join([k[0] for k in keywords])

    values_dict["index"].append(i)
    values_dict["frags"].append(words)

In [46]:
df = pd.DataFrame(values_dict)
df.head(5)

Unnamed: 0,index,frags
0,200,покидаю алгоритм годы комьюнити приятно
1,400,передовой школой сотрудничества подготовки сейчас
2,600,экспертизы экспертиза монокорпоративную формат...
3,800,формируется алгоритм компания продуктовыми про...
4,1000,инженерами доследовательской командной возможн...


In [47]:
words_dataset = Dataset.from_pandas(df)
words_dataset = words_dataset.map(
    lambda x: {'text_embeddings': get_embeddings(x["frags"])[0]}
)

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [48]:
words_dataset.to_pandas().head()

Unnamed: 0,index,frags,text_embeddings
0,200,покидаю алгоритм годы комьюнити приятно,"[0.028846072, -0.022257075, 0.025537428, -0.01..."
1,400,передовой школой сотрудничества подготовки сейчас,"[0.035737332, -0.02563613, 0.025720645, -0.023..."
2,600,экспертизы экспертиза монокорпоративную формат...,"[0.032574058, -0.0206053, 0.020763565, -0.0272..."
3,800,формируется алгоритм компания продуктовыми про...,"[0.023455705, -0.005366963, 0.022595434, -0.01..."
4,1000,инженерами доследовательской командной возможн...,"[0.026208678, -0.0130467275, 0.014915336, -0.0..."


In [49]:
words_dataset = words_dataset.add_faiss_index(column='text_embeddings')

  0%|          | 0/1 [00:00<?, ?it/s]

In [55]:
test_frag_1 = words_dataset.to_pandas().head()['text_embeddings'][1]
words_dataset.to_pandas().head()

Unnamed: 0,index,frags,text_embeddings
0,200,покидаю алгоритм годы комьюнити приятно,"[0.028846072, -0.022257075, 0.025537428, -0.01..."
1,400,передовой школой сотрудничества подготовки сейчас,"[0.035737332, -0.02563613, 0.025720645, -0.023..."
2,600,экспертизы экспертиза монокорпоративную формат...,"[0.032574058, -0.0206053, 0.020763565, -0.0272..."
3,800,формируется алгоритм компания продуктовыми про...,"[0.023455705, -0.005366963, 0.022595434, -0.01..."
4,1000,инженерами доследовательской командной возможн...,"[0.026208678, -0.0130467275, 0.014915336, -0.0..."


In [63]:
indexed_keywords[200]

[('покидаю', 0.3988),
 ('алгоритм', 0.3596),
 ('годы', 0.3579),
 ('комьюнити', 0.3568),
 ('приятно', 0.3564)]

In [64]:
scores, samples = words_dataset.get_nearest_examples('text_embeddings', test_frag_1, k=15)
demo_df = pd.DataFrame({"index": samples['index']})
demo_df["score_to_1"] = scores
demo_df["keywords"] = demo_df["index"].apply(lambda x: [w[0] for w in indexed_keywords[x]])

In [65]:
demo_df

Unnamed: 0,index,score_to_5,keywords
0,400,0.0,"[передовой, школой, сотрудничества, подготовки..."
1,3000,0.043668,"[профилю, роли, образовательный, хотели, биоте..."
2,3800,0.043825,"[взаимодействия, разной, конверсия, поисковике..."
3,2200,0.045507,"[нанимающими, какой, требования, компаниях, ко..."
4,600,0.046161,"[экспертизы, экспертиза, монокорпоративную, фо..."
5,1600,0.049893,"[взаимодействия, который, эксперты, исходим, с..."
6,1000,0.057495,"[инженерами, доследовательской, командной, воз..."
7,800,0.062978,"[формируется, алгоритм, компания, продуктовыми..."
8,1400,0.063792,"[новосибирск, инструментария, бизнес, бизнесом..."
9,1800,0.066138,"[помогают, программировали, приходит, возможно..."


## Sentence transformers

In [96]:
with open("content/test_text.txt", "r", encoding="utf-8") as f:
    txt = f.read()

In [97]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

sentences = txt.split(".")
embeddings = model.encode(sentences)

In [98]:
def get_sentence_embeddings(sentence):
    embeddings = model.encode(sentence)
    return embeddings

In [99]:
values_dict = {"index":[], "sentence": [], "embedding": []}
for i, sentence in enumerate(sentences):
    values_dict["index"].append(i)
    values_dict["sentence"].append(sentence.strip())
    values_dict["embedding"].append(get_sentence_embeddings(sentence))


In [100]:
df = pd.DataFrame(values_dict)
df.head(5)

Unnamed: 0,index,sentence,embedding
0,0,"Once upon a time, in a faraway kingdom, there ...","[-0.041774653, 0.0841553, -0.008771699, 0.0614..."
1,1,"He was known for his bravery and kindness, tra...","[0.038933642, 0.11603221, -0.0348304, 0.029169..."
2,2,"But little did he know, his life was about to ...","[0.07921023, 0.07200449, 0.012013881, 0.044593..."
3,3,As Alexander rode through the forest on his tr...,"[0.06117729, 0.020715937, -0.016838152, 0.0684..."
4,4,"Curiosity piqued, he dismounted and approached...","[0.019852942, 0.08398955, -0.010187379, 0.0162..."


In [101]:
sentence_dataset = Dataset.from_pandas(df)
sentence_dataset = sentence_dataset.add_faiss_index(column='embedding')

  0%|          | 0/1 [00:00<?, ?it/s]

In [102]:
test_frag_1 = sentence_dataset.to_pandas().head()['embedding'][1]
sentence_dataset.to_pandas().head()

Unnamed: 0,index,sentence,embedding
0,0,"Once upon a time, in a faraway kingdom, there ...","[-0.041774653, 0.0841553, -0.008771699, 0.0614..."
1,1,"He was known for his bravery and kindness, tra...","[0.038933642, 0.11603221, -0.0348304, 0.029169..."
2,2,"But little did he know, his life was about to ...","[0.07921023, 0.07200449, 0.012013881, 0.044593..."
3,3,As Alexander rode through the forest on his tr...,"[0.06117729, 0.020715937, -0.016838152, 0.0684..."
4,4,"Curiosity piqued, he dismounted and approached...","[0.019852942, 0.08398955, -0.010187379, 0.0162..."


In [103]:
scores, samples = sentence_dataset.get_nearest_examples('embedding', test_frag_1, k=100)
demo_df = pd.DataFrame({"index": samples['index']})
demo_df["score_to_1"] = scores

In [104]:
demo_df

Unnamed: 0,index,score_to_1
0,1,0.0
1,12,0.734333
2,18,0.877443
3,14,1.026736
4,2,1.137291
5,11,1.312388
6,17,1.351491
7,13,1.392522
8,16,1.398336
9,6,1.407156


## Sentence transformers

In [117]:
with open("content/test_text.txt", "r", encoding="utf-8") as f:
    txt = f.read()

In [118]:
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

In [119]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

sentences = txt.split(".")
sentences = [s.strip() for s in sentences if len(s) > 0]
embeddings = model.encode(sentences)

In [120]:
num_clusters = 4
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

In [121]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(sentences[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['Inside the cottage stood a beautiful enchantress, her eyes sparkling with magic', 'She introduced herself as Elara and revealed that she had been expecting him']

Cluster  2
['Once upon a time, in a faraway kingdom, there lived a young prince named Alexander', 'As Alexander rode through the forest on his trusty steed, he stumbled upon a mysterious old cottage hidden amongst the trees', 'Curiosity piqued, he dismounted and approached cautiously', 'The cottage seemed abandoned, yet an eerie aura surrounded it, sending shivers down his spine', 'As Alexander pushed open the creaky door, he was greeted by a sight that left him speechless', 'Elara explained that Alexander was the chosen one, destined to embark on a quest to save the kingdom from an ancient evil that threatened to engulf it', 'As he traversed mountains and crossed treacherous seas, Alexander faced countless trials and tribulations', 'At long last, Alexander reached the heart of darkness, where the malevolent forc

In [122]:
values_dict = {"index":[], "sentence": []}
for i, sentence in enumerate(sentences):
    values_dict["index"].append(i)
    values_dict["sentence"].append(sentence.strip())

df = pd.DataFrame(values_dict)

In [123]:
for i, cluster in enumerate(clustered_sentences):
    df[f"cluster_{i}"] = df["sentence"].apply(lambda x: 1 if x in cluster else 0)


In [125]:
df.head()

Unnamed: 0,index,sentence,cluster_0,cluster_1,cluster_2,cluster_3
0,0,"Once upon a time, in a faraway kingdom, there ...",0,1,0,0
1,1,"He was known for his bravery and kindness, tra...",0,0,1,0
2,2,"But little did he know, his life was about to ...",0,0,1,0
3,3,As Alexander rode through the forest on his tr...,0,1,0,0
4,4,"Curiosity piqued, he dismounted and approached...",0,1,0,0
