In [1]:
corpus = [
    'Это метрика, которая используется для оценки важности слова в контексте документа или коллекции документов',
    'Это метрика, используемая для оценки важности слова в документе',  
    'Это мера, которая оценивает важность слова в контексте документа относительно корпуса документов',
    'Это метод оценки важности слова в документе относительно коллекции документов',
    'Это шкала, которая используется для оценки важности слова в документе',
    'Это область знаний, которая изучает различные формы искусства',
    'Это наука, которая изучает развитие различных видов искусства в течение истории человечества',
    'Это раздел искусствоведения и всеобщей истории, который изучает процессы развития искусства'
]

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

stemmer = SnowballStemmer('russian')
russian_stopwords = stopwords.words('russian') + ['это']

def preprocess(text):
    stemmed_words = []
    for word in text.split():
        word = word.lower()
        if word not in russian_stopwords:
            stemmed_words.append(stemmer.stem(word))
    return ' '.join(stemmed_words)

vectorizer = TfidfVectorizer(
    preprocessor=preprocess,
    norm=None
)

tfidf_matrix = vectorizer.fit_transform(corpus)

result = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)

result

Unnamed: 0,важност,вид,всеобщ,документ,знаний,изуча,искусств,искусствоведен,использ,используем,...,оценк,процесс,развит,раздел,различн,слов,течен,форм,человечеств,шкала
0,1.405465,0.0,0.0,2.81093,0.0,0.0,0.0,0.0,2.098612,0.0,...,1.587787,0.0,0.0,0.0,0.0,1.405465,0.0,0.0,0.0,0.0
1,1.405465,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.0,2.504077,...,1.587787,0.0,0.0,0.0,0.0,1.405465,0.0,0.0,0.0,0.0
2,1.405465,0.0,0.0,2.81093,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.405465,0.0,0.0,0.0,0.0
3,1.405465,0.0,0.0,2.81093,0.0,0.0,0.0,0.0,0.0,0.0,...,1.587787,0.0,0.0,0.0,0.0,1.405465,0.0,0.0,0.0,0.0
4,1.405465,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,2.098612,0.0,...,1.587787,0.0,0.0,0.0,0.0,1.405465,0.0,0.0,0.0,2.504077
5,0.0,0.0,0.0,0.0,2.504077,1.81093,1.81093,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.098612,0.0,0.0,2.504077,0.0,0.0
6,0.0,2.504077,0.0,0.0,0.0,1.81093,1.81093,0.0,0.0,0.0,...,0.0,0.0,2.098612,0.0,2.098612,0.0,2.504077,0.0,2.504077,0.0
7,0.0,0.0,2.504077,0.0,0.0,1.81093,1.81093,2.504077,0.0,0.0,...,0.0,2.504077,2.098612,2.504077,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.metrics.pairwise import cosine_distances

distances = cosine_distances(result)

pd.DataFrame(distances)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.414677,0.519732,0.402699,0.377834,0.951716,0.9611,0.959195
1,0.414677,0.0,0.717936,0.560814,0.575293,1.0,1.0,1.0
2,0.519732,0.717936,0.0,0.532402,0.675081,0.956339,0.964825,0.963102
3,0.402699,0.560814,0.532402,0.0,0.577761,1.0,1.0,1.0
4,0.377834,0.575293,0.675081,0.577761,0.0,0.938506,0.950458,0.948031
5,0.951716,1.0,0.956339,1.0,0.938506,0.0,0.677926,0.780911
6,0.9611,1.0,0.964825,1.0,0.950458,0.677926,0.0,0.727812
7,0.959195,1.0,0.963102,1.0,0.948031,0.780911,0.727812,0.0
