## Импорт библиотек и загрузка данных NLTK


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' NLTK resource at import time (no-op when it is already
# up to date). Presumably this is the data word_tokenize needs.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/fanfurick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Методы, необходимые для работы

In [2]:
def load_data(file_path, sample_divisor=128):
    """Load the leading ``1 / sample_divisor`` share of a CSV's ``text`` column.

    The file is scanned once to count its physical lines, then only the first
    ``(line_count - 1) // sample_divisor`` data rows are parsed by pandas.
    The count is of physical lines, so it over-estimates the number of records
    when quoted fields contain embedded newlines.

    Args:
        file_path: Path to a CSV file with a header row and a ``text`` column.
        sample_divisor: Positive integer; the number of data lines is
            floor-divided by this value to get the row budget. Defaults to 128.

    Returns:
        A new single-column ``DataFrame`` (``text``). It is an independent
        copy, so callers can add columns to it without pandas
        chained-assignment warnings.

    Raises:
        ValueError: If ``sample_divisor`` is smaller than 1.
        KeyError: If the file has no ``text`` column.
    """
    if sample_divisor < 1:
        raise ValueError("sample_divisor must be a positive integer")

    # Count lines in binary mode inside a context manager: the handle is
    # closed deterministically and no locale-dependent decoding can fail.
    with open(file_path, 'rb') as handle:
        line_count = sum(1 for _ in handle)

    # Drop the header line from the count; clamp at zero so an empty file
    # does not produce a negative ``nrows``.
    nrows = max(line_count - 1, 0) // sample_divisor
    df = pd.read_csv(file_path, nrows=nrows)
    return df[['text']].copy()

In [3]:
def prepare_data(df):
    """Tokenize the ``text`` column and return the tokens as a list of lists.

    Missing cells are treated as empty strings and any other non-string cell
    is converted with ``str`` before tokenization, so one bad row does not
    abort the whole run.

    Side effect: the token lists are also stored on ``df`` in a ``tokens``
    column. The ``text`` column itself is left untouched.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A list with one list of tokens per row of ``df``.
    """
    # The tokenizer expects strings; empty CSV fields arrive as NaN floats.
    texts = df['text'].fillna('').astype(str)
    df['tokens'] = texts.apply(word_tokenize)
    return df['tokens'].tolist()

In [4]:
def train_cbow_model(tokenized_texts, vector_size=300, window=5, min_count=1,
                     **word2vec_kwargs):
    """Train a Word2Vec model in CBOW mode (``sg=0``).

    Args:
        tokenized_texts: Iterable of token lists, one per document.
        vector_size: Value passed to ``Word2Vec`` as ``vector_size``.
        window: Value passed to ``Word2Vec`` as ``window``.
        min_count: Value passed to ``Word2Vec`` as ``min_count``.
        **word2vec_kwargs: Extra keyword arguments forwarded unchanged to
            ``Word2Vec`` (for example ``workers``, ``seed`` or ``epochs``).
            ``sg`` is fixed by this function and must not be supplied.

    Returns:
        The ``Word2Vec`` instance that was constructed.
    """
    return Word2Vec(
        sentences=tokenized_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=0,
        **word2vec_kwargs,
    )

In [5]:
def train_skipgram_model(tokenized_texts, vector_size=300, window=5, min_count=1,
                         **word2vec_kwargs):
    """Train a Word2Vec model in skip-gram mode (``sg=1``).

    Args:
        tokenized_texts: Iterable of token lists, one per document.
        vector_size: Value passed to ``Word2Vec`` as ``vector_size``.
        window: Value passed to ``Word2Vec`` as ``window``.
        min_count: Value passed to ``Word2Vec`` as ``min_count``.
        **word2vec_kwargs: Extra keyword arguments forwarded unchanged to
            ``Word2Vec`` (for example ``workers``, ``seed`` or ``epochs``).
            ``sg`` is fixed by this function and must not be supplied.

    Returns:
        The ``Word2Vec`` instance that was constructed.
    """
    return Word2Vec(
        sentences=tokenized_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        **word2vec_kwargs,
    )

In [6]:
def compute_tfidf(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts`` and vectorize them.

    Args:
        texts: The documents handed to ``fit_transform``.

    Returns:
        A 2-tuple: the result of ``fit_transform(texts)`` and the value of
        ``get_feature_names_out()`` from the same fitted vectorizer.
    """
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(texts)
    terms = tfidf.get_feature_names_out()
    return weights, terms

## Main функция

In [8]:
def main(file_path='texts.csv', vocab_limit=None):
    """Run the whole pipeline and print the vocabularies and TF-IDF matrix.

    Steps: load the text sample, tokenize it, train the CBOW and skip-gram
    models on the tokens, compute TF-IDF on the raw texts, then print results.

    Args:
        file_path: CSV file passed to ``load_data``. Defaults to
            ``'texts.csv'``.
        vocab_limit: Optional non-negative cap on how many vocabulary entries
            are printed per model. ``None`` (the default) prints the full
            vocabulary.

    Returns:
        None. All results are written to standard output.
    """
    df = load_data(file_path)
    tokenized_texts = prepare_data(df)

    cbow_model = train_cbow_model(tokenized_texts)
    skipgram_model = train_skipgram_model(tokenized_texts)

    # Only the matrix is reported; the feature names are not used here.
    tfidf_matrix, _ = compute_tfidf(df['text'].tolist())

    # Slicing with ``None`` keeps every entry, so the default output is the
    # complete vocabulary.
    print("Слова в модели CBOW:", list(cbow_model.wv.index_to_key[:vocab_limit]))

    print("Слова в модели Skip-gram:", list(skipgram_model.wv.index_to_key[:vocab_limit]))

    print("TF-IDF матрица:", tfidf_matrix)

if __name__ == "__main__":
    main()

TF-IDF матрица:   (0, 89894)	0.02387772300749919
  (0, 89318)	0.17908525881263673
  (0, 67441)	0.028276103304746412
  (0, 19193)	0.033230805797670564
  (0, 18158)	0.3958321251401305
  (0, 94229)	0.16911337165826507
  (0, 42406)	0.012877600058637877
  (0, 34633)	0.06697398725590102
  (0, 38026)	0.02354751113809854
  (0, 55375)	0.05424951679964359
  (0, 11925)	0.031146471993821135
  (0, 64948)	0.030585074302536937
  (0, 90346)	0.10534426988978632
  (0, 43589)	0.08346358856850068
  (0, 31093)	0.09314592234186751
  (0, 64489)	0.15851295450139494
  (0, 83592)	0.08260881275874996
  (0, 90551)	0.0398026446484327
  (0, 8588)	0.02975592299035364
  (0, 33619)	0.029794432581637505
  (0, 53979)	0.04709844321828602
  (0, 8702)	0.047589887931884324
  (0, 18446)	0.3641664840056712
  (0, 10448)	0.11208295867122381
  (0, 23409)	0.053531603115163445
  :	:
  (15297, 76818)	0.034426739195751024
  (15297, 91739)	0.03887507834211001
  (15297, 82322)	0.03771984096297076
  (15297, 69872)	0.038342970193610805
