<a href="https://colab.research.google.com/github/Cinnamy/DocumentsClassification/blob/main/5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Подключаем модули

In [None]:
import sklearn
import numpy as np
import nltk
import re
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.utils import simple_preprocess

# Fetch the NLTK resources this notebook relies on: the stop-word lists,
# the Punkt tokenizer models, WordNet and the perceptron POS tagger.
for resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
  nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Загружаем и предобрабатываем данные

In [None]:
# Shared preprocessing resources: a WordNet lemmatizer and the English
# stop-word list from the NLTK stopwords corpus.
lemmatizer = WordNetLemmatizer()
stopwords_en = stopwords.words('english')

In [None]:
# Load four newsgroup categories from the training split; headers, footers
# and quoted replies are removed so only the message body is used.
well_separated = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=['comp.graphics', 'rec.autos', 'sci.med', 'talk.politics.mideast'])

# Three corpora are derived from the same documents:
#   well_separated_full            - all lemmatized non-stop-word tokens
#   well_separated_nouns           - tokens tagged "NN" only
#   well_separated_nouns_and_adjs  - tokens tagged "NN" or "JJ"
# A document that ends up empty in a given corpus is dropped from that corpus,
# so each corpus keeps its own label list: well_separated_target[k] is aligned
# with corpus k in the order listed above.
well_separated_full = []
well_separated_nouns = []
well_separated_nouns_and_adjs = []
well_separated_target = [[], [], []]

# Set membership is O(1); the stop-word list itself is left untouched.
stopword_set = set(stopwords_en)

# Only the first 300 documents are processed.
for raw_text, label in zip(well_separated.data[:300], well_separated.target[:300]):
  # Drop every character that is not a word character, whitespace or a hyphen,
  # then digits, parentheses and runs of two or more hyphens.
  cleaned = re.sub(r'[^(\w\s\-)]|[\d]|[()]|-{2,}', '', raw_text)
  # Collapse whitespace runs and newlines into a single space.
  cleaned = re.sub(r'\s{2,}|[\n]', ' ', cleaned)

  # Lowercase BEFORE the stop-word test: the stop-word list is lowercase, so
  # testing the original-case token would let "The", "It", ... slip through.
  lowered = (token.lower() for token in word_tokenize(cleaned))
  tokens = [lemmatizer.lemmatize(token) for token in lowered if token not in stopword_set]
  if not tokens:
    # Nothing left after cleaning: the document is absent from all corpora.
    continue

  well_separated_full.append(tokens)
  well_separated_target[0].append(label)

  # Tag once and reuse the result for both part-of-speech filters.
  tagged = nltk.pos_tag(tokens)

  nouns = [token for token, tag in tagged if tag == "NN"]
  if nouns:
    well_separated_nouns.append(nouns)
    well_separated_target[1].append(label)

  nouns_and_adjs = [token for token, tag in tagged if tag in ("NN", "JJ")]
  if nouns_and_adjs:
    well_separated_nouns_and_adjs.append(nouns_and_adjs)
    well_separated_target[2].append(label)

# Задание 5

## Bag of Words

In [None]:
# For every corpus variant: build a gensim Dictionary over its documents and
# convert each document to its bag-of-words form with doc2bow.

# All tokens.
dictionary_full = corpora.Dictionary(well_separated_full)
bow_full = list(map(dictionary_full.doc2bow, well_separated_full))

# Nouns only.
dictionary_nouns = corpora.Dictionary(well_separated_nouns)
bow_nouns = list(map(dictionary_nouns.doc2bow, well_separated_nouns))

# Nouns and adjectives.
dictionary_nouns_and_adjs = corpora.Dictionary(well_separated_nouns_and_adjs)
bow_nouns_and_adjs = list(map(dictionary_nouns_and_adjs.doc2bow, well_separated_nouns_and_adjs))

## LDA

### Делаем векторизацию

In [None]:
# Train one 4-topic LDA model (15 passes over the corpus) per corpus variant.
lda_model_full = models.LdaModel(
  corpus=bow_full,
  num_topics=4,
  id2word=dictionary_full,
  passes=15,
)
lda_model_nouns = models.LdaModel(
  corpus=bow_nouns,
  num_topics=4,
  id2word=dictionary_nouns,
  passes=15,
)
lda_model_nouns_and_adjs = models.LdaModel(
  corpus=bow_nouns_and_adjs,
  num_topics=4,
  id2word=dictionary_nouns_and_adjs,
  passes=15,
)

In [None]:
# Represent every document by its LDA topic-probability vector.
# minimum_probability=0.0 is passed so that low-probability topics are not
# filtered out of the returned (topic_id, probability) pairs.
X_full_LDA = [
  [prob for _, prob in lda_model_full.get_document_topics(bow, minimum_probability=0.0)]
  for bow in bow_full
]

X_nouns_LDA = [
  [prob for _, prob in lda_model_nouns.get_document_topics(bow, minimum_probability=0.0)]
  for bow in bow_nouns
]

X_nouns_and_adjs_LDA = [
  [prob for _, prob in lda_model_nouns_and_adjs.get_document_topics(bow, minimum_probability=0.0)]
  for bow in bow_nouns_and_adjs
]

### Выводим метрики

In [None]:
# KMeans (4 clusters) on each LDA representation. Every configuration is
# fitted 50 times with a single initialisation (n_init=1) and each score is
# averaged over those runs.
datasets = [X_full_LDA, X_nouns_LDA, X_nouns_and_adjs_LDA]
n_runs = 50

v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

# well_separated_target[k] holds the true labels aligned with datasets[k].
for X, true_labels in zip(datasets, well_separated_target):
  total_v = total_sil = total_ch = total_db = 0
  for _ in range(n_runs):
    labels = KMeans(n_clusters=4, n_init=1).fit(X).labels_
    total_sil += metrics.silhouette_score(X, labels, metric='euclidean')
    total_ch += metrics.calinski_harabasz_score(X, labels)
    total_db += metrics.davies_bouldin_score(X, labels)
    total_v += metrics.v_measure_score(true_labels, labels)
  v_measure.append(total_v / n_runs)
  silhouette.append(total_sil / n_runs)
  calinski_harabasz.append(total_ch / n_runs)
  davies_bouldin.append(total_db / n_runs)

# One row per corpus variant, one column per metric.
kmeans_metrics = pd.DataFrame(
  {
    "V_measure": v_measure,
    "Silhouette": silhouette,
    "Calinski-Harabasz": calinski_harabasz,
    "Davies-Bouldin": davies_bouldin,
  },
  index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
)

# Bordered 12pt cells with a large caption; the Styler is the cell's output.
cell_props = [('font-size', '12pt'), ('border-style', 'solid'), ('border-width', '1px')]
table_styles = [
  {'selector': 'td', 'props': list(cell_props)},
  {'selector': 'th', 'props': list(cell_props)},
  {'selector': 'caption', 'props': [('font-size', '24pt')]},
]
kmeans_metrics.style.set_table_styles(table_styles).set_caption("KMeans")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.009418,0.75098,745.676923,0.443138
Nouns,0.054086,0.838996,1614.425892,0.23809
Nouns and adjs,0.096974,0.810963,1370.151647,0.276075


In [None]:
# Agglomerative clustering (4 clusters, default Euclidean distance) on each
# dataset, for every linkage strategy.
#
# Agglomerative clustering has no random component, so one fit per
# (dataset, linkage) pair is enough: repeating the fit and averaging would
# only reproduce the same scores at many times the cost.
linkages = ["ward", "complete", "average", "single"]
metric_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

# aggl_euc_metrics[k] is the table row for datasets[k]: for each linkage, in
# order, the four scores listed in metric_names.
aggl_euc_metrics = []

# well_separated_target[k] holds the true labels aligned with datasets[k].
for X, true_labels in zip(datasets, well_separated_target):
  row = []
  for link in linkages:
    labels = AgglomerativeClustering(n_clusters=4, linkage=link).fit(X).labels_
    row.extend([
      metrics.v_measure_score(true_labels, labels),
      metrics.silhouette_score(X, labels, metric='euclidean'),
      metrics.calinski_harabasz_score(X, labels),
      metrics.davies_bouldin_score(X, labels),
    ])
  aggl_euc_metrics.append(row)

# The outer column level is the linkage and the inner level is the metric;
# the level names are given in that same order.
aggl_euc_metrics_table = pd.DataFrame(
  aggl_euc_metrics,
  index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
  columns=pd.MultiIndex.from_product([linkages, metric_names], names=['Linkage', 'Metric']),
)

# Bordered 12pt cells with a large caption; the Styler is the cell's output.
aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.00685,0.671901,425.913979,0.676081,0.011834,0.688285,493.160219,0.419778,0.009984,0.730499,625.262729,0.528365,0.021479,0.267249,39.881644,0.718685
Nouns,0.053035,0.834951,1542.95312,0.248116,0.0539,0.834629,1528.754959,0.250621,0.052574,0.8309,1458.819012,0.256405,0.040505,0.0644,54.283056,1.61393
Nouns and adjs,0.095046,0.798903,1207.242773,0.32052,0.100364,0.79537,1176.996307,0.295934,0.105327,0.78056,1012.702849,0.356388,0.087438,-0.016364,44.122641,1.643303


In [None]:
# Agglomerative clustering (4 clusters, cosine distance) on each dataset.
# "ward" is not in the linkage list: scikit-learn only supports it with
# Euclidean distance.
#
# Agglomerative clustering has no random component, so one fit per
# (dataset, linkage) pair is enough: repeating the fit and averaging would
# only reproduce the same scores at many times the cost.
linkages = ["complete", "average", "single"]
metric_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

# aggl_cos_metrics[k] is the table row for datasets[k]: for each linkage, in
# order, the four scores listed in metric_names.
aggl_cos_metrics = []

# well_separated_target[k] holds the true labels aligned with datasets[k].
for X, true_labels in zip(datasets, well_separated_target):
  row = []
  for link in linkages:
    labels = AgglomerativeClustering(n_clusters=4, linkage=link, metric='cosine').fit(X).labels_
    row.extend([
      metrics.v_measure_score(true_labels, labels),
      # NOTE(review): clusters are built with cosine distance but the
      # silhouette is still measured with Euclidean distance, as before;
      # confirm this is the intended comparison.
      metrics.silhouette_score(X, labels, metric='euclidean'),
      metrics.calinski_harabasz_score(X, labels),
      metrics.davies_bouldin_score(X, labels),
    ])
  aggl_cos_metrics.append(row)

# The outer column level is the linkage and the inner level is the metric;
# the level names are given in that same order.
aggl_cos_metrics_table = pd.DataFrame(
  aggl_cos_metrics,
  index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
  columns=pd.MultiIndex.from_product([linkages, metric_names], names=['Linkage', 'Metric']),
)

# Bordered 12pt cells with a large caption; the Styler is the cell's output.
aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.011049,0.726509,642.211394,0.382163,0.014271,0.675978,444.80451,0.409936,0.027682,-0.121269,1.66058,1.319971
Nouns,0.057221,0.83614,1564.368269,0.235445,0.056141,0.835806,1561.255163,0.246606,0.040505,0.0644,54.283056,1.61393
Nouns and adjs,0.096744,0.798844,1218.530066,0.311769,0.097382,0.791056,1121.342457,0.339414,0.085435,0.486068,202.487003,0.945896


## LSI

### Делаем векторизацию

In [None]:
# Train one 4-topic LSI model per corpus variant.
lsi_model_full = models.LsiModel(
  corpus=bow_full,
  num_topics=4,
  id2word=dictionary_full,
)
lsi_model_nouns = models.LsiModel(
  corpus=bow_nouns,
  num_topics=4,
  id2word=dictionary_nouns,
)
lsi_model_nouns_and_adjs = models.LsiModel(
  corpus=bow_nouns_and_adjs,
  num_topics=4,
  id2word=dictionary_nouns_and_adjs,
)

In [None]:
def _lsi_vector(model, doc_bow):
  """Return the dense LSI topic-weight vector of one bag-of-words document.

  Indexing an LsiModel with a single document yields sparse
  (topic_id, weight) pairs in which near-zero weights may be omitted, so the
  result is expanded to a fixed-length list with 0.0 for missing topics.
  """
  weights = dict(model[doc_bow])
  return [weights.get(topic_id, 0.0) for topic_id in range(model.num_topics)]


# Project every document with the LSI model trained on its own corpus.
# (The vectors must come from the lsi_model_* objects; taking them from the
# LDA models would make this section evaluate LDA a second time.)
X_full_LSI = [_lsi_vector(lsi_model_full, doc_bow) for doc_bow in bow_full]

X_nouns_LSI = [_lsi_vector(lsi_model_nouns, doc_bow) for doc_bow in bow_nouns]

X_nouns_and_adjs_LSI = [_lsi_vector(lsi_model_nouns_and_adjs, doc_bow) for doc_bow in bow_nouns_and_adjs]

### Выводим метрики

In [None]:
# KMeans (4 clusters) on each *_LSI representation. Every configuration is
# fitted 50 times with a single initialisation (n_init=1) and each score is
# averaged over those runs.
datasets = [X_full_LSI, X_nouns_LSI, X_nouns_and_adjs_LSI]
n_runs = 50

v_measure, silhouette, calinski_harabasz, davies_bouldin = [], [], [], []

# well_separated_target[k] holds the true labels aligned with datasets[k].
for X, true_labels in zip(datasets, well_separated_target):
  total_v = total_sil = total_ch = total_db = 0
  for _ in range(n_runs):
    labels = KMeans(n_clusters=4, n_init=1).fit(X).labels_
    total_sil += metrics.silhouette_score(X, labels, metric='euclidean')
    total_ch += metrics.calinski_harabasz_score(X, labels)
    total_db += metrics.davies_bouldin_score(X, labels)
    total_v += metrics.v_measure_score(true_labels, labels)
  v_measure.append(total_v / n_runs)
  silhouette.append(total_sil / n_runs)
  calinski_harabasz.append(total_ch / n_runs)
  davies_bouldin.append(total_db / n_runs)

# One row per corpus variant, one column per metric.
kmeans_metrics = pd.DataFrame(
  {
    "V_measure": v_measure,
    "Silhouette": silhouette,
    "Calinski-Harabasz": calinski_harabasz,
    "Davies-Bouldin": davies_bouldin,
  },
  index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
)

# Bordered 12pt cells with a large caption; the Styler is the cell's output.
cell_props = [('font-size', '12pt'), ('border-style', 'solid'), ('border-width', '1px')]
table_styles = [
  {'selector': 'td', 'props': list(cell_props)},
  {'selector': 'th', 'props': list(cell_props)},
  {'selector': 'caption', 'props': [('font-size', '24pt')]},
]
kmeans_metrics.style.set_table_styles(table_styles).set_caption("KMeans")

Unnamed: 0,V_measure,Silhouette,Calinski-Harabasz,Davies-Bouldin
Full,0.0095,0.748056,734.852419,0.451009
Nouns,0.054086,0.838998,1614.614142,0.238088
Nouns and adjs,0.096974,0.810926,1370.266178,0.276074


In [None]:
# Agglomerative clustering (4 clusters, default Euclidean distance) on each
# dataset, for every linkage strategy.
#
# Agglomerative clustering has no random component, so one fit per
# (dataset, linkage) pair is enough: repeating the fit and averaging would
# only reproduce the same scores at many times the cost.
linkages = ["ward", "complete", "average", "single"]
metric_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

# aggl_euc_metrics[k] is the table row for datasets[k]: for each linkage, in
# order, the four scores listed in metric_names.
aggl_euc_metrics = []

# well_separated_target[k] holds the true labels aligned with datasets[k].
for X, true_labels in zip(datasets, well_separated_target):
  row = []
  for link in linkages:
    labels = AgglomerativeClustering(n_clusters=4, linkage=link).fit(X).labels_
    row.extend([
      metrics.v_measure_score(true_labels, labels),
      metrics.silhouette_score(X, labels, metric='euclidean'),
      metrics.calinski_harabasz_score(X, labels),
      metrics.davies_bouldin_score(X, labels),
    ])
  aggl_euc_metrics.append(row)

# The outer column level is the linkage and the inner level is the metric;
# the level names are given in that same order.
aggl_euc_metrics_table = pd.DataFrame(
  aggl_euc_metrics,
  index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
  columns=pd.MultiIndex.from_product([linkages, metric_names], names=['Linkage', 'Metric']),
)

# Bordered 12pt cells with a large caption; the Styler is the cell's output.
aggl_euc_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, Euclidean")

Metric,ward,ward,ward,ward,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.008196,0.696252,511.446715,0.517872,0.011834,0.68828,493.175796,0.419726,0.009984,0.730508,625.341221,0.528303,0.021479,0.267259,39.882657,0.718678
Nouns,0.053035,0.83494,1542.901355,0.248136,0.0539,0.834615,1528.586974,0.250644,0.052574,0.830896,1458.885492,0.256422,0.040505,0.064454,54.282405,1.614622
Nouns and adjs,0.095046,0.798858,1207.219601,0.32054,0.100364,0.795331,1177.046335,0.29594,0.105327,0.780506,1012.642995,0.356407,0.087438,-0.016364,44.131398,1.643579


In [None]:
# Agglomerative clustering (4 clusters, cosine distance) on each dataset.
# "ward" is not in the linkage list: scikit-learn only supports it with
# Euclidean distance.
#
# Agglomerative clustering has no random component, so one fit per
# (dataset, linkage) pair is enough: repeating the fit and averaging would
# only reproduce the same scores at many times the cost.
linkages = ["complete", "average", "single"]
metric_names = ['V', 'Sil', 'Cal-Har', 'Dav-Boul']

# aggl_cos_metrics[k] is the table row for datasets[k]: for each linkage, in
# order, the four scores listed in metric_names.
aggl_cos_metrics = []

# well_separated_target[k] holds the true labels aligned with datasets[k].
for X, true_labels in zip(datasets, well_separated_target):
  row = []
  for link in linkages:
    labels = AgglomerativeClustering(n_clusters=4, linkage=link, metric='cosine').fit(X).labels_
    row.extend([
      metrics.v_measure_score(true_labels, labels),
      # NOTE(review): clusters are built with cosine distance but the
      # silhouette is still measured with Euclidean distance, as before;
      # confirm this is the intended comparison.
      metrics.silhouette_score(X, labels, metric='euclidean'),
      metrics.calinski_harabasz_score(X, labels),
      metrics.davies_bouldin_score(X, labels),
    ])
  aggl_cos_metrics.append(row)

# The outer column level is the linkage and the inner level is the metric;
# the level names are given in that same order.
aggl_cos_metrics_table = pd.DataFrame(
  aggl_cos_metrics,
  index=pd.Index(["Full", "Nouns", "Nouns and adjs"]),
  columns=pd.MultiIndex.from_product([linkages, metric_names], names=['Linkage', 'Metric']),
)

# Bordered 12pt cells with a large caption; the Styler is the cell's output.
aggl_cos_metrics_table.style.set_table_styles([{'selector': 'td', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'th', 'props': [('font-size', '12pt'),('border-style','solid'),('border-width','1px')]},
                                          {'selector': 'caption', 'props': [('font-size', '24pt')]}]).set_caption("Agglomerative clustering, cosine")

Metric,complete,complete,complete,complete,average,average,average,average,single,single,single,single
Linkage,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul,V,Sil,Cal-Har,Dav-Boul
Full,0.011049,0.726501,642.228565,0.382116,0.014271,0.675952,444.769712,0.409984,0.027682,-0.12139,1.660515,1.320716
Nouns,0.057221,0.836136,1564.437046,0.235468,0.056141,0.835805,1561.373481,0.246615,0.040505,0.064454,54.282405,1.614622
Nouns and adjs,0.096744,0.798796,1218.458744,0.311787,0.097382,0.791007,1121.300663,0.339441,0.085435,0.486103,202.574197,0.945847
