## Установка пакетов

In [None]:
import os
import re
from collections import Counter

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.0/ru_core_news_sm-3.8.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ru-core-news-sm
  Attempting uninstall: ru-core-news-sm
    Found existing installation: ru-core-news-sm 3.7.0
    Uninstalling ru-core-news-sm-3.7.0:
      Successfully uninstalled ru-core-news-sm-3.7.0
Successfully installed ru-core-news-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('ru_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Инициализация глобальных переменных

In [None]:
# Fetch the NLTK 'punkt' and 'stopwords' packages; 'stopwords' is read just below.
nltk.download('punkt')
nltk.download('stopwords')

# Russian stop words as a set; get_word_counts filters words against it.
stop_words = set(stopwords.words('russian'))
# spaCy pipeline for Russian; lemmatize_words runs texts through it.
nlp = spacy.load("ru_core_news_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Системный метод

In [None]:
def clear_processed_chunks_folder(folder_path='second/processed_chunks'):
    """Delete every regular file located directly inside ``folder_path``.

    Subdirectories (and anything inside them) are left untouched. A path
    that does not exist, or that is not a directory, is ignored. A failure
    to delete an individual file is reported with ``print`` and does not
    stop the remaining files from being removed.

    Args:
        folder_path: Directory whose files should be removed.

    Returns:
        None.
    """
    # isdir (rather than exists) keeps os.listdir from raising when the
    # path points at a regular file instead of a folder.
    if not os.path.isdir(folder_path):
        return

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            os.remove(file_path)
        except OSError as e:
            # Best effort: report the failure and carry on with the rest.
            print(f"Не удалось удалить файл {file_path}: {e}")

## Методы для работы с текстом

In [None]:
def get_word_counts(text_series):
    """Count whitespace-separated words across all texts, skipping stop words.

    Args:
        text_series: Iterable of strings; the texts are joined with single
            spaces and then split on whitespace.

    Returns:
        A ``Counter`` mapping each word that is not in the module-level
        ``stop_words`` set to its number of occurrences.
    """
    combined = ' '.join(text_series)
    return Counter(
        token for token in combined.split() if token not in stop_words
    )

In [None]:
def search_top_words(word_counts, top_n=10):
    """Return the ``top_n`` most frequent words of ``word_counts`` as a set.

    Args:
        word_counts: ``Counter`` mapping words to their frequencies.
        top_n: How many of the most common words to keep.

    Returns:
        A set with the words taken from ``word_counts.most_common(top_n)``.
    """
    return {entry[0] for entry in word_counts.most_common(top_n)}

In [None]:
def lemmatize_words(texts):
    """Lemmatize each text with the module-level spaCy pipeline ``nlp``.

    The texts are streamed through ``nlp.pipe`` with the ``parser`` and
    ``ner`` components disabled.

    Args:
        texts: Iterable of strings to process.

    Returns:
        A list with one string per input text: the ``lemma_`` of every
        token in that text, joined by single spaces.
    """
    lemmatized = []
    for doc in nlp.pipe(texts, disable=["parser", "ner"]):
        lemmas = (token.lemma_ for token in doc)
        lemmatized.append(' '.join(lemmas))
    return lemmatized

## Методы процессов обработки текста и т. д.


In [None]:
def process_chunk(chunk, chunk_index, output_dir='second/processed_chunks'):
    """Clean, tokenize and lemmatize the ``text`` column of one chunk.

    The following columns are added to a copy of ``chunk``:

    * ``cleaned_text`` - ``text`` with every character that is neither a
      word character nor whitespace removed, lower-cased and stripped,
      then with the chunk's 10 most frequent non-stop-words removed;
    * ``tokens`` - ``cleaned_text`` split on whitespace;
    * ``lemmatized`` - ``cleaned_text`` passed through ``lemmatize_words``.

    The result is also written to
    ``<output_dir>/processed_chunk_<chunk_index>.csv``.

    Args:
        chunk: DataFrame with a ``text`` column. It is not modified; a new
            frame is returned.
        chunk_index: Number used in the name of the CSV file.
        output_dir: Directory for the CSV file; created if it is missing.

    Returns:
        The new DataFrame with the three columns added.
    """
    # Missing values would otherwise stay NaN through the .str accessors
    # and break the join/split steps below, so treat them as empty text.
    raw_text = chunk['text'].fillna('').astype(str)
    chunk = chunk.assign(
        cleaned_text=raw_text.str.replace(r'[^\w\s]', '', regex=True).str.lower().str.strip()
    )

    # The most frequent words are computed per chunk, not over the whole file.
    word_counts = get_word_counts(chunk['cleaned_text'])
    top_words = search_top_words(word_counts, top_n=10)

    chunk['cleaned_text'] = chunk['cleaned_text'].str.split().apply(
        lambda words: ' '.join([word for word in words if word not in top_words])
    )

    chunk['tokens'] = chunk['cleaned_text'].str.split()

    chunk['lemmatized'] = lemmatize_words(chunk['cleaned_text'].tolist())

    # to_csv does not create missing parent folders.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'processed_chunk_{chunk_index}.csv')
    chunk.to_csv(output_path, index=False)

    return chunk

In [None]:
def process_text(file_path, num_chunks=20):
    """Read a CSV file and run ``process_chunk`` over it in row chunks.

    The rows are split into at most ``num_chunks`` consecutive chunks of
    equal size (the last one may be shorter). Each chunk is processed in
    order and the results are concatenated with a fresh index. The default
    chunk folder is emptied with ``clear_processed_chunks_folder`` before
    processing starts and again after the results have been combined.

    Args:
        file_path: Path of the CSV file to read.
        num_chunks: Upper bound on the number of chunks; must be >= 1.

    Returns:
        A DataFrame with the processed chunks concatenated.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1 or the file
            contains no rows.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be a positive integer, got {num_chunks}")

    clear_processed_chunks_folder()
    df = pd.read_csv(file_path)

    total_rows = len(df)
    if total_rows == 0:
        # A zero chunk size would otherwise surface as an obscure
        # "range() arg 3 must not be zero" error.
        raise ValueError(f"No rows to process in {file_path}")

    # Ceiling division so that a remainder still fits within num_chunks chunks.
    chunk_size = -(-total_rows // num_chunks)
    chunks = [
        df.iloc[start:start + chunk_size]
        for start in range(0, total_rows, chunk_size)
    ]

    print(f"Processing {len(chunks)} chunks")

    processed_chunks = [
        process_chunk(chunk, index) for index, chunk in enumerate(chunks)
    ]

    final_df = pd.concat(processed_chunks, ignore_index=True)
    clear_processed_chunks_folder()
    return final_df

## Основной вызов методов

In [None]:
    # Run the whole pipeline on the combined corpus and save the result as CSV.
    try:
        processed_df = process_text('second/combined_texts.csv')
        processed_df.to_csv('second/processed_texts.csv', index=False)
    except Exception as e:
        # Any failure is only printed, not re-raised; processed_df is not
        # assigned here when process_text fails.
        print(f"Произошла ошибка: {e}")