In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text used as the corpus: each verse line is one document.
# Several lines occur more than once in the sample.
sentences = [
    "Томилось небо так светло.",
    "Легко, легко, легко темнея.",
    "Звезда зажглась, дрожа и мрея.",
    "Томилось небо так светло.",
    "Звезда мерцала так тепло.",
    "Как над улыбкой вод лилея.",
    "Томилось небо так светло.",
    "Легко, легко, легко темнея.",
]

# Echo the corpus as the cell output
sentences

['Томилось небо так светло.',
 'Легко, легко, легко темнея.',
 'Звезда зажглась, дрожа и мрея.',
 'Томилось небо так светло.',
 'Звезда мерцала так тепло.',
 'Как над улыбкой вод лилея.',
 'Томилось небо так светло.',
 'Легко, легко, легко темнея.']

In [7]:
# Build the text vectorization model and fit it on the corpus
# (fit() returns the estimator itself, so the call can be chained).
vectorizer = TfidfVectorizer().fit(sentences)

# Pull out the learned term -> column-index mapping and the per-term IDF weights
vocabulary = vectorizer.vocabulary_
idf = vectorizer.idf_

# The extra newline in `end` keeps a blank line between the two printouts
print("Vocabulary:", vocabulary, end="\n\n")
print("IDF:", idf)

Vocabulary: {'томилось': 15, 'небо': 10, 'так': 12, 'светло': 11, 'легко': 5, 'темнея': 13, 'звезда': 3, 'зажглась': 2, 'дрожа': 1, 'мрея': 8, 'мерцала': 7, 'тепло': 14, 'как': 4, 'над': 9, 'улыбкой': 16, 'вод': 0, 'лилея': 6}

IDF: [2.5040774  2.5040774  2.5040774  2.09861229 2.5040774  2.09861229
 2.5040774  2.5040774  2.5040774  2.5040774  1.81093022 1.81093022
 1.58778666 2.09861229 2.5040774  1.81093022 2.5040774 ]


In [8]:
from pandas import DataFrame

# Apply the trained model. `vectorizer` was already fitted on `sentences` in the
# previous cell, so transform() is sufficient and avoids a redundant refit.
vectors = vectorizer.transform(sentences)

# Column labels for the TF-IDF matrix. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement. It returns a
# NumPy array, so convert it to a plain list to keep the printed output the same.
features = vectorizer.get_feature_names_out().tolist()
print("Feature names:", features)

# toarray() gives a regular ndarray (todense() returns the legacy np.matrix)
list_of_dense = vectors.toarray().tolist()

# Data frame of TF-IDF weights: one row per sentence, one column per term
df = DataFrame(list_of_dense, columns=features)
df

Feature names: ['вод', 'дрожа', 'зажглась', 'звезда', 'как', 'легко', 'лилея', 'мерцала', 'мрея', 'над', 'небо', 'светло', 'так', 'темнея', 'тепло', 'томилось', 'улыбкой']




Unnamed: 0,вод,дрожа,зажглась,звезда,как,легко,лилея,мерцала,мрея,над,небо,светло,так,темнея,тепло,томилось,улыбкой
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.515112,0.515112,0.45164,0.0,0.0,0.515112,0.0
1,0.0,0.0,0.0,0.0,0.0,0.948683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.0,0.0
2,0.0,0.519708,0.519708,0.435556,0.0,0.0,0.0,0.0,0.519708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.515112,0.515112,0.45164,0.0,0.0,0.515112,0.0
4,0.0,0.0,0.0,0.475656,0.0,0.0,0.0,0.567556,0.0,0.0,0.0,0.0,0.359876,0.0,0.567556,0.0,0.0
5,0.447214,0.0,0.0,0.0,0.447214,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.515112,0.515112,0.45164,0.0,0.0,0.515112,0.0
7,0.0,0.0,0.0,0.0,0.0,0.948683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.0,0.0


In [9]:
# Build an n-gram model restricted to bigrams (ngram_range=(2, 2))
n_gram_vectorizer = TfidfVectorizer(ngram_range=(2, 2))

# Tokenize the text, build the bigram vocabulary and compute the TF-IDF matrix
vectors = n_gram_vectorizer.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and convert the returned NumPy array to a plain list.
feature_names = n_gram_vectorizer.get_feature_names_out().tolist()

# toarray() gives a regular ndarray (todense() returns the legacy np.matrix)
dense_list = vectors.toarray().tolist()

# Data frame of TF-IDF weights: one row per sentence, one column per bigram
df_1 = DataFrame(dense_list, columns=feature_names)
df_1



Unnamed: 0,вод лилея,дрожа мрея,зажглась дрожа,звезда зажглась,звезда мерцала,как над,легко легко,легко темнея,мерцала так,над улыбкой,небо так,так светло,так тепло,томилось небо,улыбкой вод
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.57735,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0
4,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0
5,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0
