<a href="https://colab.research.google.com/github/Cinnamy/DocumentsClassification/blob/main/3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Подключаем модули

In [None]:
import sklearn
import numpy as np
import nltk
import re
import pandas as pd
import torch

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


#Загружаем и предобрабатываем данные

In [None]:
stopwords_en = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

Датасет с хорошо разделёнными тематиками:

In [None]:
well_separated = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=['comp.graphics', 'rec.autos', 'sci.med', 'talk.politics.mideast'])
well_separated_full = []
well_separated_target = [[], [], []]
well_separated_nouns = []
well_separated_nouns_and_adjs = []
for i in range(300):
  text = well_separated.data[i]
  text = re.sub(r'[^(\w\s\-)]|[\d]|[()]|-{2,}', '', text)
  text = re.sub(r'\s{2,}|[\n]', ' ', text)
  text = ' '.join([lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text) if word not in stopwords_en])
  if text != '':
    well_separated_full.append(text)
    well_separated_target[0].append(well_separated.target[i])
  nouns = ' '.join([word[0] for word in nltk.pos_tag(word_tokenize(text)) if word[1] == "NN"])
  if nouns != '':
    well_separated_nouns.append(nouns)
    well_separated_target[1].append(well_separated.target[i])
  nouns_and_adjs = ' '.join([word[0] for word in nltk.pos_tag(word_tokenize(text)) if word[1] == "NN" or word[1] == "JJ"])
  if nouns_and_adjs != '':
    well_separated_nouns_and_adjs.append(nouns_and_adjs)
    well_separated_target[2].append(well_separated.target[i])

Датасет с плохо разделёнными тематиками:

In [None]:
weak_separated = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc'])
weak_separated_full = []
weak_separated_target = [[], [], []]
weak_separated_nouns = []
weak_separated_nouns_and_adjs = []
for i in range(300):
  text = weak_separated.data[i]
  text = re.sub(r'[^(\w\s\-)]|[\d]|-{2,}', '', text)
  text = re.sub(r'\s{2,}|[\n]', ' ', text)
  text = ' '.join([lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text) if word not in stopwords_en])
  if text != '':
    weak_separated_full.append(text)
    weak_separated_target[0].append(weak_separated.target[i])
  nouns = ' '.join([word[0] for word in nltk.pos_tag(word_tokenize(text)) if word[1] == "NN"])
  if nouns != '':
    weak_separated_nouns.append(nouns)
    weak_separated_target[1].append(weak_separated.target[i])
  nouns_and_adjs = ' '.join([word[0] for word in nltk.pos_tag(word_tokenize(text)) if word[1] == "NN" or word[1] == "JJ"])
  if nouns_and_adjs != '':
    weak_separated_nouns_and_adjs.append(nouns_and_adjs)
    weak_separated_target[2].append(weak_separated.target[i])

#3 задание

##Хорошо разделённые тематики

In [None]:
vectorizer = TfidfVectorizer()
X_full = vectorizer.fit_transform(well_separated_full)
X_nouns = vectorizer.fit_transform(well_separated_nouns)
X_nouns_and_adjs = vectorizer.fit_transform(well_separated_nouns_and_adjs)

###KMeans

In [None]:
silhouette = []
calinski_harabasz = []
davies_bouldin = []
v_measure = []
datasets = [X_full, X_nouns, X_nouns_and_adjs]

for j in range(3):
  X = datasets[j]
  sum_silhouette = 0
  sum_calinski_harabasz = 0
  sum_davies_bouldin = 0
  sum_v_measure = 0
  for i in range(50):
    kmeans = KMeans(n_clusters=3, n_init=1).fit(X)
    sum_silhouette += metrics.silhouette_score(X, kmeans.labels_, metric='euclidean')
    sum_calinski_harabasz += metrics.calinski_harabasz_score(X.toarray(), kmeans.labels_)
    sum_davies_bouldin += metrics.davies_bouldin_score(X.toarray(), kmeans.labels_)
    sum_v_measure += metrics.v_measure_score(well_separated_target[j], kmeans.labels_)
  v_measure.append(sum_v_measure / 50)
  silhouette.append(sum_silhouette / 50)
  calinski_harabasz.append(sum_calinski_harabasz / 50)
  davies_bouldin.append(sum_davies_bouldin / 50)



In [None]:
kmeans_metrics = pd.DataFrame({
    "V_measure": v_measure,
    "Silhouette": silhouette,
    "Calinski-Harabasz": calinski_harabasz,
    "Davies-Bouldin": davies_bouldin
}, index=pd.Index(["Full", "Nouns", "Nouns and adjs"]))

kmeans_metrics.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("KMeans, well separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.16205,0.002448,1.695053,10.597641
Nouns,0.166836,0.002396,1.790666,10.325917
Nouns and adjs,0.216322,0.003357,1.936155,10.420666


###Agglomerative, Euclidean

In [None]:
aggl_euc_metrics = [[], [], []]

for X in datasets:
  i = datasets.index(X)
  for link in ["ward", "complete", "average", "single"]:
    sum_silhouette = 0
    sum_calinski_harabasz = 0
    sum_davies_bouldin = 0
    sum_v_measure = 0
    for j in range(50):
      aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X.toarray())
      sum_silhouette += metrics.silhouette_score(X, aggl.labels_, metric='euclidean')
      sum_calinski_harabasz += metrics.calinski_harabasz_score(X.toarray(), aggl.labels_)
      sum_davies_bouldin += metrics.davies_bouldin_score(X.toarray(), aggl.labels_)
      sum_v_measure += metrics.v_measure_score(well_separated_target[i], aggl.labels_)
    aggl_euc_metrics[i].append(sum_v_measure / 50)
    aggl_euc_metrics[i].append(sum_silhouette / 50)
    aggl_euc_metrics[i].append(sum_calinski_harabasz / 50)
    aggl_euc_metrics[i].append(sum_davies_bouldin / 50)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
aggl_euc_metrics_table = pd.DataFrame(aggl_euc_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["ward", "complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Metric', 'Linkage']))

aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean, well separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.163598,0.001746,2.35315,4.236398,0.124409,0.001299,1.62104,11.915837,0.012396,0.008243,1.033547,0.980244,0.012396,0.007158,1.031269,0.981689
Nouns,0.085983,0.001231,2.185313,3.127392,0.016099,0.000227,1.295397,10.921508,0.013712,0.005454,1.021989,0.985778,0.013712,0.004358,1.019779,0.987204
Nouns and adjs,0.138949,0.002949,2.664564,3.581609,0.08257,0.001399,1.615935,11.139637,0.013712,0.005788,1.023329,0.985124,0.013712,0.004029,1.01976,0.987427


###Agglomerative, cosine

In [None]:
aggl_cos_metrics = [[], [], []]

datasets = [X_full, X_nouns, X_nouns_and_adjs]
for X in datasets:
  i = datasets.index(X)
  for link in ["complete", "average", "single"]:
    sum_silhouette = 0
    sum_calinski_harabasz = 0
    sum_davies_bouldin = 0
    sum_v_measure = 0
    for j in range(50):
      aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X.toarray())
      sum_silhouette += metrics.silhouette_score(X, aggl.labels_, metric='euclidean')
      sum_calinski_harabasz += metrics.calinski_harabasz_score(X.toarray(), aggl.labels_)
      sum_davies_bouldin += metrics.davies_bouldin_score(X.toarray(), aggl.labels_)
      sum_v_measure += metrics.v_measure_score(well_separated_target[i], aggl.labels_)
    aggl_cos_metrics[i].append(sum_v_measure / 50)
    aggl_cos_metrics[i].append(sum_silhouette / 50)
    aggl_cos_metrics[i].append(sum_calinski_harabasz / 50)
    aggl_cos_metrics[i].append(sum_davies_bouldin / 50)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
aggl_cos_metrics_table = pd.DataFrame(aggl_cos_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Metric', 'Linkage']))

aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine, well separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.038878,-0.000699,1.172793,4.187604,0.012396,0.008243,1.033547,0.980244,0.012396,0.007158,1.031269,0.981689
Nouns,0.13517,6.3e-05,1.579574,5.338903,0.013712,0.005454,1.021989,0.985778,0.013712,0.004358,1.019779,0.987204
Nouns and adjs,0.02,0.001259,1.162173,2.871755,0.013712,0.005788,1.023329,0.985124,0.013712,0.004029,1.01976,0.987427


##Плохо разделённые тематики

In [None]:
vectorizer = TfidfVectorizer()
X_full = vectorizer.fit_transform(weak_separated_full)
X_nouns = vectorizer.fit_transform(weak_separated_nouns)
X_nouns_and_adjs = vectorizer.fit_transform(weak_separated_nouns_and_adjs)

###KMeans

In [None]:
silhouette = []
calinski_harabasz = []
davies_bouldin = []
v_measure = []
datasets = [X_full, X_nouns, X_nouns_and_adjs]

for j in range(3):
  X = datasets[j]
  sum_silhouette = 0
  sum_calinski_harabasz = 0
  sum_davies_bouldin = 0
  sum_v_measure = 0
  for i in range(50):
    kmeans = KMeans(n_clusters=3).fit(X)
    sum_silhouette += metrics.silhouette_score(X, kmeans.labels_, metric='euclidean')
    sum_calinski_harabasz += metrics.calinski_harabasz_score(X.toarray(), kmeans.labels_)
    sum_davies_bouldin += metrics.davies_bouldin_score(X.toarray(), kmeans.labels_)
    sum_v_measure += metrics.v_measure_score(weak_separated_target[j], kmeans.labels_)
  v_measure.append(sum_v_measure / 50)
  silhouette.append(sum_silhouette / 50)
  calinski_harabasz.append(sum_calinski_harabasz / 50)
  davies_bouldin.append(sum_davies_bouldin / 50)



In [None]:
kmeans_metrics = pd.DataFrame({
    "V_measure": v_measure,
    "Silhouette": silhouette,
    "Calinski-Harabasz": calinski_harabasz,
    "Davies-Bouldin": davies_bouldin
}, index=pd.Index(["Full", "Nouns", "Nouns and adjs"]))

kmeans_metrics.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("KMeans, weak separated")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.080921,0.002618,1.791327,10.330237
Nouns,0.02929,0.00134,1.656025,10.508624
Nouns and adjs,0.137873,0.003782,2.230398,9.549711


###Agglomerative, Euclidean

In [None]:
aggl_euc_metrics = [[], [], []]

for X in datasets:
  i = datasets.index(X)
  for link in ["ward", "complete", "average", "single"]:
    sum_silhouette = 0
    sum_calinski_harabasz = 0
    sum_davies_bouldin = 0
    sum_v_measure = 0
    for j in range(50):
      aggl = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X.toarray())
      sum_silhouette += metrics.silhouette_score(X, aggl.labels_, metric='euclidean')
      sum_calinski_harabasz += metrics.calinski_harabasz_score(X.toarray(), aggl.labels_)
      sum_davies_bouldin += metrics.davies_bouldin_score(X.toarray(), aggl.labels_)
      sum_v_measure += metrics.v_measure_score(weak_separated_target[i], aggl.labels_)
    aggl_euc_metrics[i].append(sum_v_measure / 50)
    aggl_euc_metrics[i].append(sum_silhouette / 50)
    aggl_euc_metrics[i].append(sum_calinski_harabasz / 50)
    aggl_euc_metrics[i].append(sum_davies_bouldin / 50)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
aggl_euc_metrics_table = pd.DataFrame(aggl_euc_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["ward", "complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Metric', 'Linkage']))

aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean, weak separated")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.191361,0.000289,2.626318,5.752207,0.034348,0.003324,1.46384,9.477136,0.010899,0.006929,1.217078,3.13996,0.013746,0.004022,1.025744,0.984608
Nouns,0.112773,0.002637,2.021913,7.50451,0.009048,0.000867,1.508047,11.44,0.006292,0.007047,1.156322,2.775121,0.012274,0.004303,1.02359,0.985937
Nouns and adjs,0.202842,0.003468,2.980201,5.532816,0.021357,5.5e-05,1.60632,9.978927,0.024165,0.005665,1.116973,2.267708,0.013803,0.003511,1.022193,0.986683


###Agglomerative, cosine

In [None]:
aggl_cos_metrics = [[], [], []]

datasets = [X_full, X_nouns, X_nouns_and_adjs]
for X in datasets:
  i = datasets.index(X)
  for link in ["complete", "average", "single"]:
    sum_silhouette = 0
    sum_calinski_harabasz = 0
    sum_davies_bouldin = 0
    sum_v_measure = 0
    for j in range(50):
      aggl = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X.toarray())
      sum_silhouette += metrics.silhouette_score(X, aggl.labels_, metric='euclidean')
      sum_calinski_harabasz += metrics.calinski_harabasz_score(X.toarray(), aggl.labels_)
      sum_davies_bouldin += metrics.davies_bouldin_score(X.toarray(), aggl.labels_)
      sum_v_measure += metrics.v_measure_score(weak_separated_target[i], aggl.labels_)
    aggl_cos_metrics[i].append(sum_v_measure / 50)
    aggl_cos_metrics[i].append(sum_silhouette / 50)
    aggl_cos_metrics[i].append(sum_calinski_harabasz / 50)
    aggl_cos_metrics[i].append(sum_davies_bouldin / 50)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
aggl_cos_metrics_table = pd.DataFrame(aggl_cos_metrics,
                                index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
                                columns=pd.MultiIndex.from_product([["complete", "average", "single"],['V', 'Sil', 'Cal-Har', 'Dav-Boul']], names=['Metric', 'Linkage']))

aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine, weak separated")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.022965,0.000195,1.156828,3.548974,0.006693,0.007498,1.20357,2.923586,0.013746,0.004022,1.025744,0.984608
Nouns,0.017833,0.000387,1.273451,1.899233,0.006292,0.007047,1.156322,2.775121,0.012274,0.004303,1.02359,0.985937
Nouns and adjs,0.02217,0.002012,1.144545,2.640206,0.024165,0.005665,1.116973,2.267708,0.013803,0.003511,1.022193,0.986683
