<a href="https://colab.research.google.com/github/Cinnamy/DocumentsClassification/blob/main/4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Подключаем модули

In [None]:
import sklearn
import numpy as np
import nltk
import re
import pandas as pd
import torch

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, DistilBertTokenizer, DistilBertModel

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#Загружаем и предобрабатываем данные

In [None]:
# English stop words, stored as a set because the preprocessing loops test
# membership once per token (O(1) lookup instead of scanning a list).
stopwords_en = set(stopwords.words('english'))
# WordNet lemmatizer shared by both preprocessing loops.
lemmatizer = WordNetLemmatizer()

Датасет с хорошо разделёнными тематиками:

In [None]:
# Corpus with clearly distinct topics: four 20 Newsgroups categories, fetched
# without headers, footers and quoted replies.
well_separated = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=['comp.graphics', 'rec.autos', 'sci.med', 'talk.politics.mideast'])
well_separated_full = []
# One label list per text variant (full / nouns / nouns+adjectives): a document
# that ends up empty is dropped per variant, so the label lists can differ.
well_separated_target = [[], [], []]
well_separated_nouns = []
well_separated_nouns_and_adjs = []
for i in range(300):
  text = well_separated.data[i]
  # Remove punctuation, digits, parentheses and runs of two or more hyphens.
  text = re.sub(r'[^(\w\s\-)]|[\d]|[()]|-{2,}', '', text)
  # Collapse whitespace runs and newlines into single spaces.
  text = re.sub(r'\s{2,}|[\n]', ' ', text)
  # Lower-case BEFORE the stop-word test: the stop-word list is lower-case, so
  # testing the raw token would let capitalised stop words ("The") through.
  tokens = [word.lower() for word in word_tokenize(text)]
  text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stopwords_en])
  if text != '':
    well_separated_full.append(text)
    well_separated_target[0].append(well_separated.target[i])
  # Tag once and reuse the result for both part-of-speech filtered variants.
  tagged = nltk.pos_tag(word_tokenize(text))
  nouns = ' '.join([word for word, tag in tagged if tag == "NN"])
  if nouns != '':
    well_separated_nouns.append(nouns)
    well_separated_target[1].append(well_separated.target[i])
  nouns_and_adjs = ' '.join([word for word, tag in tagged if tag == "NN" or tag == "JJ"])
  if nouns_and_adjs != '':
    well_separated_nouns_and_adjs.append(nouns_and_adjs)
    well_separated_target[2].append(well_separated.target[i])

Датасет с плохо разделёнными тематиками:

In [None]:
# Corpus with overlapping topics: three "talk.politics.*" 20 Newsgroups
# categories, fetched without headers, footers and quoted replies.
weak_separated = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc'])
weak_separated_full = []
# One label list per text variant (full / nouns / nouns+adjectives): a document
# that ends up empty is dropped per variant, so the label lists can differ.
weak_separated_target = [[], [], []]
weak_separated_nouns = []
weak_separated_nouns_and_adjs = []
for i in range(300):
  text = weak_separated.data[i]
  # Remove punctuation, digits, parentheses and runs of two or more hyphens.
  # The `[()]` alternative keeps this cleaning identical to the one applied to
  # the well-separated corpus; without it parentheses survive here only.
  text = re.sub(r'[^(\w\s\-)]|[\d]|[()]|-{2,}', '', text)
  # Collapse whitespace runs and newlines into single spaces.
  text = re.sub(r'\s{2,}|[\n]', ' ', text)
  # Lower-case BEFORE the stop-word test: the stop-word list is lower-case, so
  # testing the raw token would let capitalised stop words ("The") through.
  tokens = [word.lower() for word in word_tokenize(text)]
  text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stopwords_en])
  if text != '':
    weak_separated_full.append(text)
    weak_separated_target[0].append(weak_separated.target[i])
  # Tag once and reuse the result for both part-of-speech filtered variants.
  tagged = nltk.pos_tag(word_tokenize(text))
  nouns = ' '.join([word for word, tag in tagged if tag == "NN"])
  if nouns != '':
    weak_separated_nouns.append(nouns)
    weak_separated_target[1].append(weak_separated.target[i])
  nouns_and_adjs = ' '.join([word for word, tag in tagged if tag == "NN" or tag == "JJ"])
  if nouns_and_adjs != '':
    weak_separated_nouns_and_adjs.append(nouns_and_adjs)
    weak_separated_target[2].append(weak_separated.target[i])

#4 задание

##BERT

###Хорошо разделённые тематики

####Делаем векторизацию

In [None]:
# Load the pretrained BERT checkpoint. `tokenizer` and `model` are module-level
# names that the vectorization cells below read directly.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [None]:
# Encode every document separately (one tokenizer call per text) for each of
# the three text variants of the well-separated corpus.
encode_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
well_separated_full_tokens = [tokenizer(doc, **encode_options) for doc in well_separated_full]
well_separated_nouns_tokens = [tokenizer(doc, **encode_options) for doc in well_separated_nouns]
well_separated_nouns_adjs_tokens = [tokenizer(doc, **encode_options) for doc in well_separated_nouns_and_adjs]

# A document vector is the mean of the last hidden state over dim=1, flattened
# and converted to a NumPy array. Gradients are not needed for inference.
with torch.no_grad():
  X_full_well = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in well_separated_full_tokens
  ]
  X_nouns_well = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in well_separated_nouns_tokens
  ]
  X_nouns_and_adjs_well = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in well_separated_nouns_adjs_tokens
  ]

####Выводим метрики

In [None]:
# KMeans quality on the well-separated corpus, one row per text variant.
# No random_state is passed and n_init=1, so fits can differ between runs;
# each metric is therefore averaged over 50 independent fits.
# NOTE(review): the well-separated corpus is fetched with four categories but
# n_clusters is 3 - confirm this is intended.
datasets = [X_full_well, X_nouns_well, X_nouns_and_adjs_well]
v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

for j, X in enumerate(datasets):
  sum_v_measure = sum_silhouette = sum_calinski_harabasz = sum_davies_bouldin = 0
  for _ in range(50):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    labels = kmeans.labels_
    sum_silhouette += metrics.silhouette_score(X, labels, metric='euclidean')
    sum_calinski_harabasz += metrics.calinski_harabasz_score(X, labels)
    sum_davies_bouldin += metrics.davies_bouldin_score(X, labels)
    sum_v_measure += metrics.v_measure_score(well_separated_target[j], labels)
  v_measure.append(sum_v_measure / 50)
  silhouette.append(sum_silhouette / 50)
  calinski_harabasz.append(sum_calinski_harabasz / 50)
  davies_bouldin.append(sum_davies_bouldin / 50)

kmeans_metrics = pd.DataFrame(
    {
        "V_measure": v_measure,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin,
    },
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
)

# Same border/font styling for data and header cells, larger font for the caption.
cell_props = [('font-size', '12pt'), ('border-style', 'solid'), ('border-width', '1px')]
table_styles = [
    {'selector': 'td', 'props': list(cell_props)},
    {'selector': 'th', 'props': list(cell_props)},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]
kmeans_metrics.style.set_table_styles(table_styles).set_caption("KMeans, well separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.247136,0.061353,21.453229,3.070557
Nouns,0.19232,0.067065,18.716815,3.217681
Nouns and adjs,0.266683,0.0645,19.926451,3.092692


In [None]:
# Agglomerative clustering (default Euclidean distance) on the well-separated
# corpus: one row per text variant, four metrics per linkage.
# NOTE(review): the well-separated corpus is fetched with four categories but
# n_clusters is 3 - confirm this is intended.
aggl_euc_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["ward", "complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_euc_metrics[i].append(metrics.v_measure_score(well_separated_target[i], aggl.labels_))
    aggl_euc_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_euc_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_euc_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_euc_metrics_table = pd.DataFrame(aggl_euc_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["ward", "complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean, well separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.287907,0.038028,18.488886,2.968934,0.033168,0.082761,13.543703,2.339179,0.012396,0.285816,3.83155,0.530792,0.012396,0.271609,3.733544,0.543868
Nouns,0.210952,0.114938,16.901124,2.634632,0.113298,0.123248,12.412386,2.322584,0.013818,0.234478,5.836692,0.993792,0.012536,0.204796,2.359479,0.639421
Nouns and adjs,0.192814,0.061433,17.297244,2.380739,0.012444,0.080107,10.157908,2.498224,0.024813,0.210602,7.044077,1.76069,0.013264,0.246,2.654116,0.6024


In [None]:
# Agglomerative clustering with cosine distance on the well-separated corpus:
# one row per text variant, four metrics per linkage ("ward" is not in the list).
# NOTE(review): the well-separated corpus is fetched with four categories but
# n_clusters is 3 - confirm this is intended.
aggl_cos_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_cos_metrics[i].append(metrics.v_measure_score(well_separated_target[i], aggl.labels_))
    # NOTE(review): silhouette is scored with Euclidean distance although the
    # clustering itself used cosine - confirm this is intended.
    aggl_cos_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_cos_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_cos_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_cos_metrics_table = pd.DataFrame(aggl_cos_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine, well separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.020669,0.224118,6.270458,1.629805,0.013548,0.258013,3.666542,0.553852,0.012396,0.271609,3.733544,0.543868
Nouns,0.015728,0.051722,14.681247,2.138396,0.006858,0.154863,8.580985,1.603221,0.013264,0.156612,1.89537,0.717735
Nouns and adjs,0.037671,0.206118,6.569171,1.838261,0.025076,0.238491,4.755612,1.346285,0.014124,0.196078,2.365559,0.655773


###Плохо разделённые тематики

####Делаем векторизацию

In [None]:
# Encode every document separately (one tokenizer call per text) for each of
# the three text variants of the weakly separated corpus.
encode_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
weak_separated_full_tokens = [tokenizer(doc, **encode_options) for doc in weak_separated_full]
weak_separated_nouns_tokens = [tokenizer(doc, **encode_options) for doc in weak_separated_nouns]
weak_separated_nouns_adjs_tokens = [tokenizer(doc, **encode_options) for doc in weak_separated_nouns_and_adjs]

# A document vector is the mean of the last hidden state over dim=1, flattened
# and converted to a NumPy array. Gradients are not needed for inference.
with torch.no_grad():
  X_full_weak = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in weak_separated_full_tokens
  ]
  X_nouns_weak = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in weak_separated_nouns_tokens
  ]
  X_nouns_and_adjs_weak = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in weak_separated_nouns_adjs_tokens
  ]

####Выводим метрики

In [None]:
# KMeans quality on the weakly separated corpus, one row per text variant.
# No random_state is passed and n_init=1, so fits can differ between runs;
# each metric is therefore averaged over 50 independent fits.
datasets = [X_full_weak, X_nouns_weak, X_nouns_and_adjs_weak]
v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

for j, X in enumerate(datasets):
  sum_v_measure = sum_silhouette = sum_calinski_harabasz = sum_davies_bouldin = 0
  for _ in range(50):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    labels = kmeans.labels_
    sum_silhouette += metrics.silhouette_score(X, labels, metric='euclidean')
    sum_calinski_harabasz += metrics.calinski_harabasz_score(X, labels)
    sum_davies_bouldin += metrics.davies_bouldin_score(X, labels)
    sum_v_measure += metrics.v_measure_score(weak_separated_target[j], labels)
  v_measure.append(sum_v_measure / 50)
  silhouette.append(sum_silhouette / 50)
  calinski_harabasz.append(sum_calinski_harabasz / 50)
  davies_bouldin.append(sum_davies_bouldin / 50)

kmeans_metrics = pd.DataFrame(
    {
        "V_measure": v_measure,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin,
    },
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
)

# Same border/font styling for data and header cells, larger font for the caption.
cell_props = [('font-size', '12pt'), ('border-style', 'solid'), ('border-width', '1px')]
table_styles = [
    {'selector': 'td', 'props': list(cell_props)},
    {'selector': 'th', 'props': list(cell_props)},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]
kmeans_metrics.style.set_table_styles(table_styles).set_caption("KMeans, weak separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.04672,0.063066,23.509752,2.931805
Nouns,0.032603,0.064984,23.770536,3.015925
Nouns and adjs,0.053694,0.067649,24.625041,2.635518


In [None]:
# Agglomerative clustering (default Euclidean distance) on the weakly separated
# corpus: one row per text variant, four metrics per linkage.
aggl_euc_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["ward", "complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_euc_metrics[i].append(metrics.v_measure_score(weak_separated_target[i], aggl.labels_))
    aggl_euc_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_euc_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_euc_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_euc_metrics_table = pd.DataFrame(aggl_euc_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["ward", "complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean, weak separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.0894,0.06041,20.931864,2.835211,0.002765,0.250095,8.868868,2.234374,0.007218,0.310258,8.133661,1.413722,0.014155,0.301467,3.409223,0.53751
Nouns,0.072657,0.027231,20.426752,2.842755,0.008945,0.27333,13.585758,1.336123,0.011643,0.288713,3.707144,1.055427,0.013948,0.301255,3.289511,0.537467
Nouns and adjs,0.053239,0.053479,21.712891,2.089478,0.089323,0.059828,24.355854,2.21662,0.012867,0.240742,2.561103,0.610718,0.012867,0.225287,2.403991,0.631635


In [None]:
# Agglomerative clustering with cosine distance on the weakly separated corpus:
# one row per text variant, four metrics per linkage ("ward" is not in the list).
aggl_cos_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_cos_metrics[i].append(metrics.v_measure_score(weak_separated_target[i], aggl.labels_))
    # NOTE(review): silhouette is scored with Euclidean distance although the
    # clustering itself used cosine - confirm this is intended.
    aggl_cos_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_cos_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_cos_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_cos_metrics_table = pd.DataFrame(aggl_cos_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine, weak separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.001566,0.090695,12.194661,3.505121,0.00957,0.314346,6.642195,1.229732,0.013746,0.350455,3.772466,0.49938
Nouns,0.009835,0.217308,15.437492,2.574859,0.006352,0.268251,12.726769,1.218232,0.014246,0.256867,2.941278,0.583272
Nouns and adjs,0.018319,0.146329,17.240633,2.458554,0.006583,0.229127,15.193251,1.736,0.012867,0.155742,2.138488,0.69045


##RoBERTa

###Хорошо разделённые тематики

####Делаем векторизацию

In [None]:
# Rebind the module-level `tokenizer` and `model` to the pretrained RoBERTa
# checkpoint; the vectorization cells below read these same names.
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Encode every document separately (one tokenizer call per text) for each of
# the three text variants of the well-separated corpus.
encode_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
well_separated_full_tokens = [tokenizer(doc, **encode_options) for doc in well_separated_full]
well_separated_nouns_tokens = [tokenizer(doc, **encode_options) for doc in well_separated_nouns]
well_separated_nouns_adjs_tokens = [tokenizer(doc, **encode_options) for doc in well_separated_nouns_and_adjs]

# A document vector is the mean of the last hidden state over dim=1, flattened
# and converted to a NumPy array. Gradients are not needed for inference.
with torch.no_grad():
  X_full_well = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in well_separated_full_tokens
  ]
  X_nouns_well = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in well_separated_nouns_tokens
  ]
  X_nouns_and_adjs_well = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in well_separated_nouns_adjs_tokens
  ]

####Выводим метрики

In [None]:
# KMeans quality on the well-separated corpus, one row per text variant.
# No random_state is passed and n_init=1, so fits can differ between runs;
# each metric is therefore averaged over 50 independent fits.
# NOTE(review): the well-separated corpus is fetched with four categories but
# n_clusters is 3 - confirm this is intended.
datasets = [X_full_well, X_nouns_well, X_nouns_and_adjs_well]
v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

for j, X in enumerate(datasets):
  sum_v_measure = sum_silhouette = sum_calinski_harabasz = sum_davies_bouldin = 0
  for _ in range(50):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    labels = kmeans.labels_
    sum_silhouette += metrics.silhouette_score(X, labels, metric='euclidean')
    sum_calinski_harabasz += metrics.calinski_harabasz_score(X, labels)
    sum_davies_bouldin += metrics.davies_bouldin_score(X, labels)
    sum_v_measure += metrics.v_measure_score(well_separated_target[j], labels)
  v_measure.append(sum_v_measure / 50)
  silhouette.append(sum_silhouette / 50)
  calinski_harabasz.append(sum_calinski_harabasz / 50)
  davies_bouldin.append(sum_davies_bouldin / 50)

kmeans_metrics = pd.DataFrame(
    {
        "V_measure": v_measure,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin,
    },
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
)

# Same border/font styling for data and header cells, larger font for the caption.
cell_props = [('font-size', '12pt'), ('border-style', 'solid'), ('border-width', '1px')]
table_styles = [
    {'selector': 'td', 'props': list(cell_props)},
    {'selector': 'th', 'props': list(cell_props)},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]
kmeans_metrics.style.set_table_styles(table_styles).set_caption("KMeans, well separated")



Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.062791,0.125175,69.642895,2.258108
Nouns,0.016397,0.13672,54.802881,1.973646
Nouns and adjs,0.054631,0.11951,55.163932,2.254793


In [None]:
# Agglomerative clustering (default Euclidean distance) on the well-separated
# corpus: one row per text variant, four metrics per linkage.
# NOTE(review): the well-separated corpus is fetched with four categories but
# n_clusters is 3 - confirm this is intended.
aggl_euc_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["ward", "complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_euc_metrics[i].append(metrics.v_measure_score(well_separated_target[i], aggl.labels_))
    aggl_euc_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_euc_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_euc_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_euc_metrics_table = pd.DataFrame(aggl_euc_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["ward", "complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean, well separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.039681,0.184472,65.04058,1.731216,0.044958,0.195206,43.506908,1.586287,0.025758,0.295148,34.486372,0.945874,0.012396,0.229277,2.883834,0.598856
Nouns,0.063196,0.13281,50.317953,2.059782,0.027537,0.205036,39.165887,1.411206,0.021529,0.290421,29.121846,0.98484,0.012536,0.22966,2.97347,0.579321
Nouns and adjs,0.130881,0.098147,50.029804,2.516463,0.018403,0.238756,35.572589,1.269654,0.015613,0.237825,35.687975,1.280899,0.013712,0.152571,2.084991,0.673302


In [None]:
# Agglomerative clustering with cosine distance on the well-separated corpus:
# one row per text variant, four metrics per linkage ("ward" is not in the list).
# NOTE(review): the well-separated corpus is fetched with four categories but
# n_clusters is 3 - confirm this is intended.
aggl_cos_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_cos_metrics[i].append(metrics.v_measure_score(well_separated_target[i], aggl.labels_))
    # NOTE(review): silhouette is scored with Euclidean distance although the
    # clustering itself used cosine - confirm this is intended.
    aggl_cos_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_cos_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_cos_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_cos_metrics_table = pd.DataFrame(aggl_cos_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine, well separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.043104,0.145074,60.892883,2.052876,0.035967,0.304551,54.389742,1.027524,0.012396,0.198926,2.89913,0.59712
Nouns,0.025371,0.207371,40.739193,1.428614,0.018967,0.289841,29.088982,1.008944,0.012536,0.22966,2.97347,0.579321
Nouns and adjs,0.063259,0.236024,37.969002,1.32212,0.015613,0.237825,35.687975,1.280899,0.012536,0.166977,2.56422,0.636425


###Плохо разделённые тематики

####Делаем векторизацию

In [None]:
# Encode every document separately (one tokenizer call per text) for each of
# the three text variants of the weakly separated corpus.
encode_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
weak_separated_full_tokens = [tokenizer(doc, **encode_options) for doc in weak_separated_full]
weak_separated_nouns_tokens = [tokenizer(doc, **encode_options) for doc in weak_separated_nouns]
weak_separated_nouns_adjs_tokens = [tokenizer(doc, **encode_options) for doc in weak_separated_nouns_and_adjs]

# A document vector is the mean of the last hidden state over dim=1, flattened
# and converted to a NumPy array. Gradients are not needed for inference.
with torch.no_grad():
  X_full_weak = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in weak_separated_full_tokens
  ]
  X_nouns_weak = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in weak_separated_nouns_tokens
  ]
  X_nouns_and_adjs_weak = [
    np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
    for encoded in weak_separated_nouns_adjs_tokens
  ]

####Выводим метрики

In [None]:
# KMeans quality on the weakly separated corpus, one row per text variant.
# No random_state is passed and n_init=1, so fits can differ between runs;
# each metric is therefore averaged over 50 independent fits.
datasets = [X_full_weak, X_nouns_weak, X_nouns_and_adjs_weak]
v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

for j, X in enumerate(datasets):
  sum_v_measure = sum_silhouette = sum_calinski_harabasz = sum_davies_bouldin = 0
  for _ in range(50):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    labels = kmeans.labels_
    sum_silhouette += metrics.silhouette_score(X, labels, metric='euclidean')
    sum_calinski_harabasz += metrics.calinski_harabasz_score(X, labels)
    sum_davies_bouldin += metrics.davies_bouldin_score(X, labels)
    sum_v_measure += metrics.v_measure_score(weak_separated_target[j], labels)
  v_measure.append(sum_v_measure / 50)
  silhouette.append(sum_silhouette / 50)
  calinski_harabasz.append(sum_calinski_harabasz / 50)
  davies_bouldin.append(sum_davies_bouldin / 50)

kmeans_metrics = pd.DataFrame(
    {
        "V_measure": v_measure,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin,
    },
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
)

# Same border/font styling for data and header cells, larger font for the caption.
cell_props = [('font-size', '12pt'), ('border-style', 'solid'), ('border-width', '1px')]
table_styles = [
    {'selector': 'td', 'props': list(cell_props)},
    {'selector': 'th', 'props': list(cell_props)},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]
kmeans_metrics.style.set_table_styles(table_styles).set_caption("KMeans, weak separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.049485,0.159869,90.770572,1.919464
Nouns,0.046845,0.167849,71.555307,1.744215
Nouns and adjs,0.061668,0.167758,81.826118,1.818351


In [None]:
# Agglomerative clustering (default Euclidean distance) on the weakly separated
# corpus: one row per text variant, four metrics per linkage.
aggl_euc_metrics = [[], [], []]

for i in range(3):
  X = datasets[i]
  for link in ["ward", "complete", "average", "single"]:
    # AgglomerativeClustering has no random component, so one fit per
    # configuration is enough; repeating it would only average identical values.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    # Append order must match the metric level of the column MultiIndex below.
    aggl_euc_metrics[i].append(metrics.v_measure_score(weak_separated_target[i], aggl.labels_))
    aggl_euc_metrics[i].append(metrics.silhouette_score(X, aggl.labels_, metric='euclidean'))
    aggl_euc_metrics[i].append(metrics.calinski_harabasz_score(X, aggl.labels_))
    aggl_euc_metrics[i].append(metrics.davies_bouldin_score(X, aggl.labels_))

# Level 0 of the columns is the linkage and level 1 is the metric, so the
# level names are given in that order.
aggl_euc_metrics_table = pd.DataFrame(aggl_euc_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["ward", "complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Linkage', 'Metric']))

aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean, weak separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.045629,0.140681,83.812075,1.959175,0.054387,0.23644,74.198965,1.528806,0.043888,0.318723,66.760486,0.977743,0.015576,0.219905,2.748535,0.587326
Nouns,0.049313,0.162293,70.105783,1.775887,0.044287,0.151937,61.846122,1.766516,0.013371,0.233352,35.167297,1.173314,0.012513,0.077765,1.659233,0.754325
Nouns and adjs,0.062725,0.159678,78.461635,1.836918,0.070802,0.255318,73.974145,1.362717,0.006012,0.249325,11.200894,0.92031,0.012867,0.206742,2.628006,0.597025


In [None]:
# Agglomerative clustering with cosine distance on the three feature sets
# currently held in `datasets`, scored against `weak_separated_target`.
# "ward" is not in the list: scikit-learn only supports it with Euclidean
# distance.
linkages = ["complete", "average", "single"]
score_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

aggl_cos_metrics = [[], [], []]

for i, row in enumerate(aggl_cos_metrics):
  X = datasets[i]
  for link in linkages:
    # AgglomerativeClustering has no random component, so one fit gives the
    # same scores as averaging 50 identical refits.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    # NOTE(review): the silhouette is still computed with Euclidean distance
    # although the clustering itself uses cosine - confirm this is intended.
    # The order of the scores must match `score_names`.
    row.extend([
        metrics.v_measure_score(weak_separated_target[i], aggl.labels_),
        metrics.silhouette_score(X, aggl.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, aggl.labels_),
        metrics.davies_bouldin_score(X, aggl.labels_),
    ])

# The outer column level is the linkage and the inner level is the score, so
# the level names are given in that order.
aggl_cos_metrics_table = pd.DataFrame(
    aggl_cos_metrics,
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
    columns=pd.MultiIndex.from_product([linkages, score_names], names=['Linkage', 'Metric']))

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
aggl_cos_metrics_table.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("Agglomerative clustering, cosine, weak separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.032939,0.219086,71.459663,1.648562,0.055574,0.32814,65.781956,0.948083,0.015576,0.219905,2.748535,0.587326
Nouns,0.054249,0.191925,62.221498,1.585865,0.048193,0.250068,46.380764,1.180863,0.012274,0.073111,1.655702,0.755082
Nouns and adjs,0.065732,0.162405,74.087367,1.904038,0.083353,0.293788,64.957514,1.158352,0.012453,0.165287,2.347093,0.627951


## DistilBERT

### Хорошо разделённые тематики

#### Делаем векторизацию

In [None]:
# Load the pretrained DistilBERT tokenizer and encoder by checkpoint name.
# `tokenizer` and `model` are (re)bound here and used by the vectorization
# cells that follow.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Encode each document of the well-separated corpus separately; every call
# returns PyTorch tensors for one text.
encode_kwargs = dict(padding=True, truncation=True, return_tensors="pt")
well_separated_full_tokens = [tokenizer(doc, **encode_kwargs) for doc in well_separated_full]
well_separated_nouns_tokens = [tokenizer(doc, **encode_kwargs) for doc in well_separated_nouns]
well_separated_nouns_adjs_tokens = [tokenizer(doc, **encode_kwargs) for doc in well_separated_nouns_and_adjs]


def _embed_documents(encoded_docs):
  """Return one flat NumPy vector per encoded document.

  Each document is passed through `model` without gradient tracking; its
  `last_hidden_state` is averaged over dim=1 (presumably the token axis -
  confirm against the transformers docs) and flattened.
  """
  with torch.no_grad():
    return [
        np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
        for encoded in encoded_docs
    ]


X_full_well = _embed_documents(well_separated_full_tokens)
X_nouns_well = _embed_documents(well_separated_nouns_tokens)
X_nouns_and_adjs_well = _embed_documents(well_separated_nouns_adjs_tokens)

#### Выводим метрики

In [None]:
# K-means on the DistilBERT embeddings of the well-separated corpus.
# K-means is initialised randomly (n_init=1, no fixed seed), so every score is
# averaged over `n_runs` independent fits.
# NOTE(review): the well-separated corpus is fetched with four newsgroup
# categories, yet n_clusters is 3 - confirm this is intended.
datasets = [X_full_well, X_nouns_well, X_nouns_and_adjs_well]
n_runs = 50
v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

for j, X in enumerate(datasets):
  runs = []
  for _ in range(n_runs):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    runs.append((
        metrics.v_measure_score(well_separated_target[j], kmeans.labels_),
        metrics.silhouette_score(X, kmeans.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, kmeans.labels_),
        metrics.davies_bouldin_score(X, kmeans.labels_),
    ))
  # Transpose the per-run tuples into one column per score and average each.
  mean_v, mean_sil, mean_ch, mean_db = (sum(column) / n_runs for column in zip(*runs))
  v_measure.append(mean_v)
  silhouette.append(mean_sil)
  calinski_harabasz.append(mean_ch)
  davies_bouldin.append(mean_db)

row_labels = pd.Index(["Full", "Nouns", "Nouns and adjs"])
kmeans_metrics = pd.DataFrame(
    {
        "V_measure": v_measure,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin,
    },
    index=row_labels,
)

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
kmeans_metrics.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("KMeans, well separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.134095,0.102595,33.360894,2.660068
Nouns,0.122737,0.083129,26.866516,2.732706
Nouns and adjs,0.221745,0.086243,28.525019,2.846022


In [None]:
# Agglomerative clustering with the default (Euclidean) distance on the three
# feature sets in `datasets`, scored against `well_separated_target`.
# Produces one table row per feature set and one group of four scores per
# linkage criterion.
# NOTE(review): the well-separated corpus is fetched with four newsgroup
# categories, yet n_clusters is 3 - confirm this is intended.
linkages = ["ward", "complete", "average", "single"]
score_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

aggl_euc_metrics = [[], [], []]

for i, row in enumerate(aggl_euc_metrics):
  X = datasets[i]
  for link in linkages:
    # AgglomerativeClustering has no random component, so one fit gives the
    # same scores as averaging 50 identical refits.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    # The order of the scores must match `score_names`.
    row.extend([
        metrics.v_measure_score(well_separated_target[i], aggl.labels_),
        metrics.silhouette_score(X, aggl.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, aggl.labels_),
        metrics.davies_bouldin_score(X, aggl.labels_),
    ])

# The outer column level is the linkage and the inner level is the score, so
# the level names are given in that order.
aggl_euc_metrics_table = pd.DataFrame(
    aggl_euc_metrics,
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
    columns=pd.MultiIndex.from_product([linkages, score_names], names=['Linkage', 'Metric']))

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
aggl_euc_metrics_table.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("Agglomerative clustering, Euclidean, well separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.120673,0.067915,29.247231,3.09685,0.042059,0.097741,22.368832,1.812486,0.012396,0.287246,4.163835,0.519149,0.013548,0.24184,3.899022,0.557587
Nouns,0.022333,0.074808,24.785455,2.445053,0.014126,0.148696,23.678414,1.681527,0.013818,0.267915,8.094917,0.893094,0.018573,0.268573,7.525078,0.765658
Nouns and adjs,0.277325,0.08598,27.682545,2.740531,0.02393,0.164019,16.907004,1.332521,0.020974,0.218379,4.889799,0.791648,0.013712,0.218212,3.706454,0.572978


In [None]:
# Agglomerative clustering with cosine distance on the three feature sets in
# `datasets`, scored against `well_separated_target`. "ward" is not in the
# list: scikit-learn only supports it with Euclidean distance.
linkages = ["complete", "average", "single"]
score_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

aggl_cos_metrics = [[], [], []]

for i, row in enumerate(aggl_cos_metrics):
  X = datasets[i]
  for link in linkages:
    # AgglomerativeClustering has no random component, so one fit gives the
    # same scores as averaging 50 identical refits.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    # NOTE(review): the silhouette is still computed with Euclidean distance
    # although the clustering itself uses cosine - confirm this is intended.
    # The order of the scores must match `score_names`.
    row.extend([
        metrics.v_measure_score(well_separated_target[i], aggl.labels_),
        metrics.silhouette_score(X, aggl.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, aggl.labels_),
        metrics.davies_bouldin_score(X, aggl.labels_),
    ])

# The outer column level is the linkage and the inner level is the score, so
# the level names are given in that order.
aggl_cos_metrics_table = pd.DataFrame(
    aggl_cos_metrics,
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
    columns=pd.MultiIndex.from_product([linkages, score_names], names=['Linkage', 'Metric']))

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
aggl_cos_metrics_table.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("Agglomerative clustering, cosine, well separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.025558,0.238175,7.094429,1.789613,0.022882,0.245412,6.565457,1.634578,0.012396,0.262578,4.013781,0.539391
Nouns,0.032744,0.086948,25.419494,2.249514,0.040461,0.194193,14.725393,1.827156,0.013712,0.120437,2.052509,0.723914
Nouns and adjs,0.005836,0.040424,9.727449,2.512699,0.023032,0.170096,7.120083,1.558173,0.021753,0.154052,4.593829,0.823747


### Плохо разделённые тематики

#### Делаем векторизацию

In [None]:
# Encode each document of the weakly separated corpus separately; every call
# returns PyTorch tensors for one text.
encode_kwargs = dict(padding=True, truncation=True, return_tensors="pt")
weak_separated_full_tokens = [tokenizer(doc, **encode_kwargs) for doc in weak_separated_full]
weak_separated_nouns_tokens = [tokenizer(doc, **encode_kwargs) for doc in weak_separated_nouns]
weak_separated_nouns_adjs_tokens = [tokenizer(doc, **encode_kwargs) for doc in weak_separated_nouns_and_adjs]


def _embed_documents(encoded_docs):
  """Return one flat NumPy vector per encoded document.

  Each document is passed through `model` without gradient tracking; its
  `last_hidden_state` is averaged over dim=1 (presumably the token axis -
  confirm against the transformers docs) and flattened.
  """
  with torch.no_grad():
    return [
        np.array(model(**encoded).last_hidden_state.mean(dim=1).flatten())
        for encoded in encoded_docs
    ]


X_full_weak = _embed_documents(weak_separated_full_tokens)
X_nouns_weak = _embed_documents(weak_separated_nouns_tokens)
X_nouns_and_adjs_weak = _embed_documents(weak_separated_nouns_adjs_tokens)

#### Выводим метрики

In [None]:
# K-means on the DistilBERT embeddings of the weakly separated corpus.
# K-means is initialised randomly (n_init=1, no fixed seed), so every score is
# averaged over `n_runs` independent fits.
datasets = [X_full_weak, X_nouns_weak, X_nouns_and_adjs_weak]
n_runs = 50
v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

for j, X in enumerate(datasets):
  runs = []
  for _ in range(n_runs):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    runs.append((
        metrics.v_measure_score(weak_separated_target[j], kmeans.labels_),
        metrics.silhouette_score(X, kmeans.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, kmeans.labels_),
        metrics.davies_bouldin_score(X, kmeans.labels_),
    ))
  # Transpose the per-run tuples into one column per score and average each.
  mean_v, mean_sil, mean_ch, mean_db = (sum(column) / n_runs for column in zip(*runs))
  v_measure.append(mean_v)
  silhouette.append(mean_sil)
  calinski_harabasz.append(mean_ch)
  davies_bouldin.append(mean_db)

row_labels = pd.Index(["Full", "Nouns", "Nouns and adjs"])
kmeans_metrics = pd.DataFrame(
    {
        "V_measure": v_measure,
        "Silhouette": silhouette,
        "Calinski-Harabasz": calinski_harabasz,
        "Davies-Bouldin": davies_bouldin,
    },
    index=row_labels,
)

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
kmeans_metrics.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("KMeans, weak separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.041329,0.127267,40.58471,2.446486
Nouns,0.053887,0.121227,40.811004,1.973898
Nouns and adjs,0.042323,0.138006,43.66118,1.844568


In [None]:
# Agglomerative clustering with the default (Euclidean) distance on the three
# feature sets in `datasets`, scored against `weak_separated_target`.
# Produces one table row per feature set and one group of four scores per
# linkage criterion.
linkages = ["ward", "complete", "average", "single"]
score_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

aggl_euc_metrics = [[], [], []]

for i, row in enumerate(aggl_euc_metrics):
  X = datasets[i]
  for link in linkages:
    # AgglomerativeClustering has no random component, so one fit gives the
    # same scores as averaging 50 identical refits.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    # The order of the scores must match `score_names`.
    row.extend([
        metrics.v_measure_score(weak_separated_target[i], aggl.labels_),
        metrics.silhouette_score(X, aggl.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, aggl.labels_),
        metrics.davies_bouldin_score(X, aggl.labels_),
    ])

# The outer column level is the linkage and the inner level is the score, so
# the level names are given in that order.
aggl_euc_metrics_table = pd.DataFrame(
    aggl_euc_metrics,
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
    columns=pd.MultiIndex.from_product([linkages, score_names], names=['Linkage', 'Metric']))

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
aggl_euc_metrics_table.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("Agglomerative clustering, Euclidean, weak separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.068237,0.143816,37.616277,1.922882,0.027323,0.123176,34.92764,2.026378,0.012928,0.283719,4.33245,0.912973,0.012928,0.283719,4.33245,0.912973
Nouns,0.081034,0.121899,39.297021,1.945657,0.053165,0.128069,35.626465,1.620649,0.006084,0.231015,15.095973,0.98659,0.014246,0.196961,2.629472,0.631712
Nouns and adjs,0.076578,0.139377,40.987696,1.915494,0.003319,0.101165,35.787083,2.310798,0.013661,0.230513,14.469371,0.96484,0.014155,0.200266,2.777454,0.617401


In [None]:
# Agglomerative clustering with cosine distance on the three feature sets in
# `datasets`, scored against `weak_separated_target`. "ward" is not in the
# list: scikit-learn only supports it with Euclidean distance.
linkages = ["complete", "average", "single"]
score_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

aggl_cos_metrics = [[], [], []]

for i, row in enumerate(aggl_cos_metrics):
  X = datasets[i]
  for link in linkages:
    # AgglomerativeClustering has no random component, so one fit gives the
    # same scores as averaging 50 identical refits.
    aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    # NOTE(review): the silhouette is still computed with Euclidean distance
    # although the clustering itself uses cosine - confirm this is intended.
    # The order of the scores must match `score_names`.
    row.extend([
        metrics.v_measure_score(weak_separated_target[i], aggl.labels_),
        metrics.silhouette_score(X, aggl.labels_, metric='euclidean'),
        metrics.calinski_harabasz_score(X, aggl.labels_),
        metrics.davies_bouldin_score(X, aggl.labels_),
    ])

# The outer column level is the linkage and the inner level is the score, so
# the level names are given in that order.
aggl_cos_metrics_table = pd.DataFrame(
    aggl_cos_metrics,
    index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
    columns=pd.MultiIndex.from_product([linkages, score_names], names=['Linkage', 'Metric']))

# The styled table is the last expression so the notebook displays it.
cell_props = [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]
aggl_cos_metrics_table.style.set_table_styles([
    {'selector': 'td', 'props': cell_props},
    {'selector': 'th', 'props': cell_props},
    {'selector': 'caption', 'props': [('font-size', '24pt')]},
]).set_caption("Agglomerative clustering, cosine, weak separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.030732,0.122599,33.053984,1.933882,0.009327,0.268289,11.425187,1.435139,0.012928,0.340198,5.066864,0.815644
Nouns,0.003964,0.092216,29.263178,2.7218,0.008095,0.3095,18.862751,1.133293,0.014246,0.196961,2.629472,0.631712
Nouns and adjs,0.099677,0.136706,40.92181,1.738892,0.007273,0.2162,20.184184,1.899616,0.012158,0.164437,2.150485,0.683092
