In [1]:
from keybert import KeyBERT
import whisper_timestamped as whisper
from nltk import download
from nltk.corpus import stopwords

download("stopwords")
YT_LINK = "/content/vid test.weba"

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Hardcoded text

In [2]:
with open("content/oleg.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# NLP

In [3]:
from transformers import BertModel, BertTokenizerFast
from datasets import Dataset
import torch
import torch.nn.functional as F
import pandas as pd
from transformers.pipelines import pipeline

In [4]:
russian_stopwords = set(stopwords.words("russian"))

In [5]:
model_checkpoint = 'bert-base-multilingual-cased'
hf_model = pipeline("feature-extraction", model="bert-base-multilingual-cased")

tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)
model = BertModel.from_pretrained(model_checkpoint)
model = model.eval()


def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.pooler_output
    normalized_embeddings = F.normalize(embeddings, p=2)

    return normalized_embeddings

In [6]:
from pymystem3 import mystem


def preprocess_text(text):
    m = mystem.Mystem()
    lemmas = m.lemmatize(text)
    lemmas = [l for l in lemmas if l.isalpha()]
    return " ".join(lemmas)

In [7]:
kw_model = KeyBERT(model=hf_model)
window_size = 200
stride = 50

txt = preprocess_text(txt)
txt = txt.replace("\n", " ").replace("\r", " ")
txt = txt.replace(".", "").replace(",", "")
txt = txt.lower()
tokens = txt.split()
tokens = [t for t in tokens if t not in russian_stopwords]

indexed_keywords = {}
frags = []
df = pd.DataFrame(columns=["index", "keywords", "fragments"])
for i in range(0, len(tokens) - window_size, stride):
    frag = tokens[i: i + window_size]
    frag = " ".join([f.lower() for f in frag])
    frags.append(frag)
    keywords = kw_model.extract_keywords(frag, keyphrase_ngram_range=(1, 1), top_n=10, use_mmr=True)
    indexed_keywords[i] = keywords

In [8]:
values_dict = {"index": [], "keywords": []}
for i, keywords in indexed_keywords.items():
    words = " ".join([k[0] for k in keywords])

    values_dict["index"].append(i)
    values_dict["keywords"].append(words)

In [9]:
df = pd.DataFrame(values_dict)
df["fragments"] = frags
df.head(3)

Unnamed: 0,index,keywords,fragments
0,0,приглашать торжественный оцениваться этмошник ...,очень рад представлять олег шатов который студ...
1,50,являться вдвойне кадры спецолист выделять веду...,поэтому даша оставлять олег покидать помучать ...
2,100,практиковать приглашать эксперт взаимодействие...,глаз это действительно хватать зачастую онлайн...


In [10]:
import faiss
import numpy as np

# Retrieve embeddings from the DataFrame
TARGET_COL = 'fragments'

df['embedding'] = df[TARGET_COL].apply(lambda x: get_embeddings(x).numpy()[0])
embeddings = df['embedding'].tolist()

dimension = len(embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype('float32'))

# Calculate the distance between every row and the next one
distances = []
for i in range(len(embeddings) - 1):
    distance = index.search(np.array([embeddings[i]]).astype('float32'), 2)[0][0][1]
    distances.append(distance)

# Append the distances to the DataFrame
df['distance_to_next'] = distances + [None]  # The last row


In [11]:
df.head(50)

Unnamed: 0,index,keywords,fragments,embedding,distance_to_next
0,0,приглашать торжественный оцениваться этмошник ...,очень рад представлять олег шатов который студ...,"[0.027370164, -0.01719859, 0.018644359, -0.040...",0.027479
1,50,являться вдвойне кадры спецолист выделять веду...,поэтому даша оставлять олег покидать помучать ...,"[0.027277341, -0.017641788, 0.029549293, -0.03...",0.021638
2,100,практиковать приглашать эксперт взаимодействие...,глаз это действительно хватать зачастую онлайн...,"[0.020744605, -0.011988726, 0.033831585, -0.03...",0.002658
3,150,монокорпоративный оцениваться приглашать ведущ...,исключительно научный задача задача который де...,"[0.029628243, -0.021332772, 0.02368174, -0.036...",0.015811
4,200,монокорпоративный ведущая взаимодействие итмо ...,большой ключевой звено наш комьюнити являться ...,"[0.022199735, -0.012648165, 0.03196414, -0.031...",0.00465
5,250,монокорпоративный ведущая включаться учиться п...,это большой плюс ребята включаться процесс соз...,"[0.011073448, -0.017381683, 0.035386708, -0.03...",0.02357
6,300,монокорпоративный сгруппироваться познакомитьс...,инженер компания самый приносить разный корпор...,"[0.019774305, -0.013836842, 0.03394907, -0.028...",0.004628
7,350,сгруппироваться определенный взаимодействие ди...,разный действительно опыт профиль поэтому инте...,"[0.017279206, -0.013521058, 0.029546486, -0.03...",0.020136
8,400,сгруппироваться заинтересованный познакомиться...,который исключительно академический среда полу...,"[0.013859925, -0.014138191, 0.035146367, -0.03...",0.005505
9,450,сгруппироваться услышать очевидный дисциплина ...,например откуда поступать задача типовой приме...,"[0.026072308, -0.018348016, 0.02771687, -0.045...",0.018265


In [12]:
df["distance_to_next"].mean()

0.03356079586266883

In [None]:
words_dataset = Dataset.from_pandas(df)
words_dataset = words_dataset.map(
    lambda x: {'text_embeddings': get_embeddings(x["keywords"])[0]}
)

In [None]:
words_dataset.to_pandas().head()

In [None]:
scores, samples = words_dataset.get_nearest_examples('text_embeddings', test_frag_1, k=words_dataset.shape[0])
demo_df = pd.DataFrame({"index": samples['index']})
demo_df["score_to_1"] = scores
demo_df["keywords"] = demo_df["index"].apply(lambda x: [w[0] for w in indexed_keywords[x]])

In [None]:
df

In [None]:
demo_df["score/dist"] = demo_df["score_to_1"] / demo_df["index"]

In [36]:
demo_df

Unnamed: 0,index,score_to_1,keywords,score/dist
0,50,0.0,"[обучаться, практиковать, собеседник, приглаша...",0.0
1,1600,0.041696,"[заинтересовывать, встречаться, заинтересованн...",2.6e-05
2,1650,0.041696,"[заинтересовывать, встречаться, заинтересованн...",2.5e-05
3,100,0.044245,"[практиковать, взаимодействовать, обучаться, р...",0.000442
4,1550,0.046017,"[встречаться, заинтересовывать, увлекать, заин...",3e-05
5,1500,0.05199,"[поработать, заинтересованность, увлекать, инт...",3.5e-05
6,1400,0.053479,"[интердисциплинарный, поработать, сформировыва...",3.8e-05
7,200,0.054806,"[монокорпоративный, ведущая, преподаватель, пр...",0.000274
8,900,0.054976,"[являться, заинтересовывать, пользователь, пос...",6.1e-05
9,1050,0.05551,"[заинтересовывать, проговаривать, встречаться,...",5.3e-05


## Sentence transformers

In [None]:
with open("content/oleg.txt", "r", encoding="utf-8") as f:
    txt = f.read()

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

sentences = txt.split(".")
embeddings = model.encode(sentences)

In [None]:
def get_sentence_embeddings(sentence):
    embeddings = model.encode(sentence)
    return embeddings

In [None]:
values_dict = {"index":[], "sentence": [], "embedding": []}
for i, sentence in enumerate(sentences):
    values_dict["index"].append(i)
    values_dict["sentence"].append(sentence.strip())
    values_dict["embedding"].append(get_sentence_embeddings(sentence))


In [None]:
df = pd.DataFrame(values_dict)
df.head(5)

In [None]:
sentence_dataset = Dataset.from_pandas(df)
sentence_dataset = sentence_dataset.add_faiss_index(column='embedding')

In [None]:
test_frag_1 = sentence_dataset.to_pandas().head()['embedding'][1]
sentence_dataset.to_pandas().head()

In [None]:
scores, samples = sentence_dataset.get_nearest_examples('embedding', test_frag_1, k=len(sentence_dataset))
demo_df = pd.DataFrame({"index": samples['index'], "fragment": values_dict["sentence"]})
demo_df["score_to_1"] = scores / scores.mean()

In [None]:
demo_df

## Sentence transformers

In [None]:
with open("content/test_text.txt", "r", encoding="utf-8") as f:
    txt = f.read()

In [None]:
from sklearn.cluster import KMeans, HDBSCAN
from sentence_transformers import SentenceTransformer

In [None]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

sentences = txt.split(".")
sentences = [s.strip() for s in sentences if len(s) > 0]
embeddings = model.encode(sentences)

In [None]:
num_clusters = 4
clustering_model = HDBSCAN(min_cluster_size=num_clusters, max_cluster_size=4)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

In [None]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(sentences[sentence_id])


In [None]:
values_dict = {"index":[], "sentence": []}
for i, sentence in enumerate(sentences):
    values_dict["index"].append(i)
    values_dict["sentence"].append(sentence.strip())

df = pd.DataFrame(values_dict)

In [None]:
for i, cluster in enumerate(clustered_sentences):
    df[f"cluster_{i}"] = df["sentence"].apply(lambda x: 1 if x in cluster else 0)


In [None]:
df