# Ноутбук для тестов методов кластеризации

## 1) Метод на основе CountVectorizer

### Суть: векторизуем тексты обращений с помощью стандартного метода CountVectorizer из библиотеки sklearn с учётом биграмм и триграмм, вручную находим необходимые сочетания слов и соответствующие им индексы в векторе, разбиваем на наибольшее число кластеров, а затем пишем функцию, которая автоматически векторизует входной текст и по наличию или отсутствию выбранных индексов относит текст к кластерам.


### Пример

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Toy corpus used to show how CountVectorizer assigns column indices to n-grams.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Vectorizer that counts unigrams, bigrams and trigrams.
vectorizer = CountVectorizer(ngram_range=(1, 3))

# Learn the vocabulary from the corpus and build its document-term matrix.
X = vectorizer.fit_transform(corpus)

# Print every vocabulary entry next to its column index in the vector.
features = vectorizer.get_feature_names_out()
for idx, ngram in enumerate(features):
    print(f"Index {idx}: word or n-gramm: {ngram}")
print(X.shape)
print()

# Vectorize an unseen sentence with the fitted vocabulary and inspect the result.
query = "Is this the second document?"
test = vectorizer.transform([query])
print(f"Text: {query}")
print(f"Raw vectorizer output: {test}")
print(f"Raw vectorizer output type: {type(test)}")
# Only one text was transformed, so row 0 of the dense matrix is its bag-of-words vector.
print(f"BOW: {test.toarray()[0]}")


Index 0: word or n-gramm: and
Index 1: word or n-gramm: and this
Index 2: word or n-gramm: and this is
Index 3: word or n-gramm: document
Index 4: word or n-gramm: document is
Index 5: word or n-gramm: document is the
Index 6: word or n-gramm: first
Index 7: word or n-gramm: first document
Index 8: word or n-gramm: is
Index 9: word or n-gramm: is the
Index 10: word or n-gramm: is the first
Index 11: word or n-gramm: is the second
Index 12: word or n-gramm: is the third
Index 13: word or n-gramm: is this
Index 14: word or n-gramm: is this the
Index 15: word or n-gramm: one
Index 16: word or n-gramm: second
Index 17: word or n-gramm: second document
Index 18: word or n-gramm: the
Index 19: word or n-gramm: the first
Index 20: word or n-gramm: the first document
Index 21: word or n-gramm: the second
Index 22: word or n-gramm: the second document
Index 23: word or n-gramm: the third
Index 24: word or n-gramm: the third one
Index 25: word or n-gramm: third
Index 26: word or n-gramm: third one

In [40]:
# save and restore trained vectorizer
import joblib

# Persist the fitted vectorizer so it can be reused without refitting.
vectorizerPath = "../data/testCountVect.joblib"
joblib.dump(vectorizer, vectorizerPath)

# Restore the vectorizer from the file that was just written.
loadedVect = joblib.load(vectorizerPath)

# Check that the restored vectorizer still transforms text.
test = loadedVect.transform(["Is this the second document?"])
bow = test.toarray()
print(f"BOW: {bow}")

BOW: [[0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0]]


In [47]:
 # пример итоговой функции кластеризации по методу 1
def clusterisation(texts: list, pathToSavedVectorizer: str, clusterIndices=(6, 16, 25)):
    """Assign multi-label cluster flags to texts using a saved vectorizer.

    Each text is vectorized with the vectorizer loaded from
    ``pathToSavedVectorizer``. A text belongs to cluster ``k`` when the
    feature at column ``clusterIndices[k]`` occurs in it at least once.
    A final extra column is the "Not detect" cluster: it is 1 only when
    none of the selected features occur in the text.

    Args:
        texts: Raw text documents to label.
        pathToSavedVectorizer: Path to a vectorizer saved with ``joblib.dump``.
        clusterIndices: Vocabulary column indices, one per cluster. The
            defaults (6, 16, 25) are the columns of 'first', 'second' and
            'third' in the example vectorizer fitted in this notebook; they
            are only meaningful for a vectorizer with that same vocabulary.

    Returns:
        Integer numpy array of shape ``(len(texts), len(clusterIndices) + 1)``
        holding 0/1 flags; the last column is the "Not detect" cluster.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load vectorizer files that come from a trusted source.
    loadedVect = joblib.load(pathToSavedVectorizer)

    bows = loadedVect.transform(texts)

    # Select only the columns of interest from the sparse matrix instead of
    # densifying every full row in a Python loop.
    columns = list(clusterIndices)
    # A CountVectorizer (unless fitted with binary=True) emits occurrence
    # counts, so test for presence with "> 0": comparing with "== 1" would
    # drop a text from its cluster whenever the marker word is repeated.
    hits = bows[:, columns].toarray() > 0

    # "Not detect" is set exactly when no selected feature is present.
    notDetected = ~hits.any(axis=1, keepdims=True)

    return np.hstack([hits, notDetected]).astype(int)
    

In [48]:
# Sample inputs: 'first' only, 'second' only, both markers, and no marker at all.
texts = ["the is first document", "second document", "first and second document", "document"]

# Load the vectorizer from the same path it was saved to with joblib.dump
# ("../data/testCountVect.joblib"); a bare file name would resolve against the
# current working directory and miss the saved file.
labels = clusterisation(texts, "../data/testCountVect.joblib")
print(labels)

[[1 0 0 0]
 [0 1 0 0]
 [1 1 0 0]
 [0 0 0 1]]
