# –ü–æ–ø—ã—Ç–∫–∞ —Å–¥–µ–ª–∞—Ç—å –∫–ª–∞—Å—Å

In [59]:
import os
import pandas as pd
from tqdm import tqdm
import torch
import pyarrow.parquet as pq
import dask.dataframe as dd
import spacy
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast, BertForSequenceClassification, BertConfig
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter
from torch.utils.data import DataLoader, Dataset as TorchDataset
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import hdbscan
from scipy.spatial.distance import pdist, squareform
import logging
import re
from joblib import Parallel, delayed


class ReviewsKeywords:
    def __init__(self, csv_path, model_path, spacy_model="ru_core_news_lg"):
        self.csv_path = csv_path
        self.model_path = model_path

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device == "cuda":
            import cudf.pandas  # –ò–º–ø–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ cuDF –∏ –∞–∫—Ç–∏–≤–∞—Ü–∏—è –µ–≥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
            cudf.pandas.install()
        os.environ["TOKENIZERS_PARALLELISM"] = "true"  # –í–∫–ª—é—á–∞–µ–º –ø–∞—Ä–∞–ª–ª–µ–ª–∏–∑–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
        self.tokenizer_my = BertTokenizerFast.from_pretrained(self.model_path)
         # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏
        self.classification_model = BertForSequenceClassification.from_pretrained(self.model_path).to(self.device)
        # –ó–∞–≥—Ä—É–∑–∫–∞ –±–∞–∑–æ–≤–æ–π –º–æ–¥–µ–ª–∏ –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤
        self.embedding_model = AutoModel.from_pretrained(self.model_path).to(self.device)
        
        # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞ –æ—Ç –°–±–µ—Ä–±–∞–Ω–∫–∞
        self.tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/sbert_large_nlu_ru')
        self.embedding_model = AutoModel.from_pretrained('sberbank-ai/sbert_large_nlu_ru').to(self.device)
        
        spacy.prefer_gpu()
        self.nlp = spacy.load(spacy_model, disable=["ner", "tagger", "attribute_ruler", "lemmatizer"])
        
        self.df = pd.read_csv(self.csv_path, nrows=1000)

    @staticmethod
    def clean_text(text):
        text = re.sub(r'[\n\r\t]+|\s{2,}', ' ', text)
        text = re.sub(r'(?<!\.)\s*\.\s*|\s*\.\s*(?!\.)', '. ', text)
        return text.strip().rstrip('.')

    def split_reviews_into_sentences(self, batch):
        cleaned_texts = [self.clean_text(text) for text in batch['corrected_text']]
        docs = list(self.nlp.pipe(cleaned_texts, batch_size=64))
        batch['sentences'] = [[sent.text for sent in doc.sents] for doc in docs]
        return batch

    def process_reviews(self):
        dataset = Dataset.from_pandas(self.df)
        dataset = dataset.map(self.split_reviews_into_sentences, batched=True, batch_size=32)
        self.df = dataset.to_pandas()
        df_exploded = self.df.explode('sentences').reset_index(drop=True)
        df_exploded = df_exploded.drop(columns=[col for col in df_exploded.columns if col.startswith('__index_level_')])
        return Dataset.from_pandas(df_exploded)

    def compute_sentence_embeddings(self, sentences):
        sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
        if not sentences:
            raise ValueError("Input contains no valid strings.")
        inputs = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    def compute_embeddings_after_explode(self, batch):
        sentences = batch['sentences']
        valid_sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
        if not valid_sentences:
            batch['sentence_embeddings'] = [[]] * len(sentences)
            return batch
        embeddings = self.compute_sentence_embeddings(valid_sentences)
        embeddings = embeddings.astype(np.float32)
        final_embeddings = []
        embed_idx = 0
        for sentence in sentences:
            if isinstance(sentence, str):
                final_embeddings.append(embeddings[embed_idx])
                embed_idx += 1
            else:
                final_embeddings.append(np.zeros(embeddings.shape[1], dtype=np.float32))
        batch['sentence_embeddings'] = final_embeddings
        return batch

    def apply_embeddings(self, dataset_exploded):
        return dataset_exploded.map(self.compute_embeddings_after_explode, batched=True, batch_size=128)

    def extract_key_thought(self, cluster_sentences):
        sentences = cluster_sentences.split(" | ")
        embeddings = self.compute_sentence_embeddings(sentences)
        centroid = np.mean(embeddings, axis=0)
        similarities = cosine_similarity(embeddings, [centroid])
        key_sentence_index = np.argmax(similarities)
        return sentences[key_sentence_index]

    def count_words(self, cluster_sentences):
        words = cluster_sentences.split()
        return len(words)

    def recluster_large_cluster(self, cluster_sentences, eps=0.1, min_samples=2):
        sentences = cluster_sentences.split(" | ")
        embeddings = self.compute_sentence_embeddings(sentences)
        re_clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(embeddings)
        re_cluster_dict = {}
        for idx, label in enumerate(re_clustering.labels_):
            if label == -1:
                continue
            label_str = str(label)
            if label_str not in re_cluster_dict:
                re_cluster_dict[label_str] = []
            re_cluster_dict[label_str].append(sentences[idx])
        return [" | ".join(cluster) for cluster in re_cluster_dict.values()]

    def recursive_clustering(self, cluster_sentences, threshold, eps=0.22, min_samples=3, min_eps=0.02):
        current_eps = eps
        current_min_samples = min_samples
        new_clusters = [cluster_sentences]
        while True:
            next_clusters = []
            reclustered_any = False
            for cluster in new_clusters:
                if self.count_words(cluster) > threshold:
                    while current_eps >= min_eps:
                        reclustered = self.recluster_large_cluster(cluster, eps=current_eps, min_samples=current_min_samples)
                        if len(reclustered) > 1:
                            next_clusters.extend(reclustered)
                            reclustered_any = True
                            break
                        else:
                            if current_eps > min_eps:
                                current_eps -= 0.05
                    if len(reclustered) == 1:
                        next_clusters.append(cluster)
                else:
                    next_clusters.append(cluster)
            new_clusters = next_clusters
            if not reclustered_any:
                break
        return new_clusters

    def generate_predictions(self, dataset_exploded):
        tokenizer = self.tokenizer_my
        model = self.classification_model
        if self.device == torch.device("cuda"):
            model = model.half()

        reviews = dataset_exploded["sentences"]
        reviews = [str(review) for review in reviews if isinstance(review, str) and review.strip()]

        class ReviewDataset(TorchDataset):
            def __init__(self, reviews, tokenizer, max_len=128):
                self.reviews = reviews
                self.tokenizer = tokenizer
                self.max_len = max_len

            def __len__(self):
                return len(self.reviews)

            def __getitem__(self, idx):
                review = self.reviews[idx]
                encoding = self.tokenizer.encode_plus(
                    review,
                    add_special_tokens=True,
                    max_length=self.max_len,
                    return_token_type_ids=False,
                    padding='max_length',
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt'
                )
                return {key: val.flatten() for key, val in encoding.items()}

        dataset = ReviewDataset(reviews, tokenizer)
        batch_size = 32
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

        predictions = []

        from torch.cuda.amp import autocast

        for batch in tqdm(dataloader, desc="–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤"):
            batch = {key: val.to(self.device) for key, val in batch.items()}
            
            with torch.no_grad():
                with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Å–º–µ—à–∞–Ω–Ω—É—é —Ç–æ—á–Ω–æ—Å—Ç—å
                    outputs = model(**batch)
                    logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
                    probabilities = torch.softmax(logits, dim=-1)
                    batch_predictions = (probabilities[:, 1] > 0.7).cpu().numpy()  # –ò—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ—Ä–æ–≥ 0.7
                    predictions.extend(batch_predictions)

        if len(predictions) != len(dataset_exploded):
            print(f"Warning: Length of predictions ({len(predictions)}) does not match length of index ({len(dataset_exploded)})")
            if len(predictions) < len(dataset_exploded):
                missing_count = len(dataset_exploded) - len(predictions)
                predictions.extend([0] * missing_count)
            elif len(predictions) > len(dataset_exploded):
                predictions = predictions[:len(dataset_exploded)]
        dataset_exploded = dataset_exploded.add_column("predictions", predictions)
        return dataset_exploded

    def process_group(self, category_name, product_name, group):
        all_sentences = group['sentences'].tolist()
        if not all_sentences:
            return pd.DataFrame()

        try:
            all_embeddings = self.compute_sentence_embeddings(all_sentences)
        except ValueError as e:
            print(f"Error in computing embeddings for product {product_name}: {e}")
            return pd.DataFrame()

        distance_matrix = squareform(pdist(all_embeddings, metric='cosine'))
        clustering = hdbscan.HDBSCAN(min_samples=3, metric='precomputed').fit(distance_matrix)

        cluster_dict = {}
        for idx, label in enumerate(clustering.labels_):
            if label == -1:
                continue
            label_str = str(label)
            if label_str not in cluster_dict:
                cluster_dict[label_str] = set()
            cluster_dict[label_str].add(all_sentences[idx])

        clusters = [" | ".join(sentences) for sentences in cluster_dict.values()]

        if not clusters:
            return pd.DataFrame()

        group['binary_rating'] = group['review_rating'].apply(lambda x: 1 if x in [4, 5] else 0)
        avg_rating = group['binary_rating'].mean()
        rating_category = 'positive' if avg_rating > 0.7 else 'neutral'
        rating_category = 'neutral' if avg_rating > 0.5 else 'negative'

        threshold = self.determine_threshold(clusters)

        final_clusters = []
        for cluster in clusters:
            if self.count_words(cluster) > threshold:
                final_clusters.extend(self.recursive_clustering(cluster, threshold))
            else:
                final_clusters.append(cluster)

        # –û–±–µ—Å–ø–µ—á–µ–Ω–∏–µ –º–∏–Ω–∏–º–∞–ª—å–Ω–æ–≥–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤
        final_clusters = self.ensure_minimum_clusters(final_clusters, threshold)

        df_exploded_sorted = pd.DataFrame({
            'category': category_name,
            'product': product_name,
            'avg_rating': avg_rating,
            'rating_category': rating_category,
            'cluster_sentences': final_clusters
        })
        df_exploded_sorted['word_count'] = df_exploded_sorted['cluster_sentences'].apply(self.count_words)
        df_exploded_sorted['key_thought'] = df_exploded_sorted['cluster_sentences'].apply(self.extract_key_thought)
        df_exploded_sorted = df_exploded_sorted.sort_values(by='word_count', ascending=False)

        return df_exploded_sorted

    def determine_threshold(self, clusters):
        if len(clusters) == 1:
            cluster_word_count = self.count_words(clusters[0])
            if cluster_word_count > 20:
                return cluster_word_count / 2
            return cluster_word_count
        return np.min([np.mean([self.count_words(cluster) for cluster in clusters]) * 1.5, 250])

    def ensure_minimum_clusters(self, final_clusters, threshold):
        while len(final_clusters) < 3 and any(self.count_words(cluster) > threshold for cluster in final_clusters):
            largest_cluster = max(final_clusters, key=self.count_words)
            final_clusters.remove(largest_cluster)
            new_clusters = self.recursive_clustering(largest_cluster, threshold)
            if len(new_clusters) <= 1:
                final_clusters.append(largest_cluster)
                break
            final_clusters.extend(new_clusters)
        return final_clusters
    
    def cluster_reviews(self, dataset_exploded):
        # –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π
        dataset_filtered = dataset_exploded.filter(lambda x: x['predictions'] == 1)
        
        # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ pandas DataFrame –¥–ª—è –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–∏
        df_filtered = dataset_filtered.to_pandas()
        grouped = df_filtered.groupby(['category', 'product'])

        results = []
        
        # –ü–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ –±–µ–∑ –ø–∞—Ä–∞–ª–ª–µ–ª–∏–∑–º–∞
        for (category_name, product_name), group in tqdm(grouped, desc="Processing categories and products"):
            result_df = self.process_group(category_name, product_name, group)
            if not result_df.empty:
                results.append(result_df)

        if results:  # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ —Å–ø–∏—Å–æ–∫ results –Ω–µ –ø—É—Å—Ç
            final_result = pd.concat(results, ignore_index=True)
            final_result = final_result[((final_result['word_count'] > 10) & (final_result['key_thought'].str.len() > 5))]
            final_result.to_csv("./reviews_keywords/feedbackfueltest.csv")
        else:
            print("No valid results to concatenate. Returning an empty DataFrame.")
            final_result = pd.DataFrame()  # –í–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—É—Å—Ç–æ–π DataFrame, –µ—Å–ª–∏ –Ω–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –æ–±—ä–µ–¥–∏–Ω–µ–Ω–∏—è
        
        return final_result

    def run(self):
        dataset_exploded = self.process_reviews()
        dataset_exploded = self.apply_embeddings(dataset_exploded)
        dataset_exploded = self.generate_predictions(dataset_exploded)
        result = self.cluster_reviews(dataset_exploded)
        return result


reviews_keywords = ReviewsKeywords(csv_path="./reviews_keywords/wildberries_reviews.csv",
                                    model_path='./reviews_keywords/fine_tuned_model')
final_result = reviews_keywords.run()
final_result.head()

ValueError: could not determine the shape of object type 'torch.storage.UntypedStorage'

In [None]:
final_result

Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,word_count,key_thought
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–ü–µ—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–µ—Å–æ –∑–∞–∫—Ä—ã–ª–æ—Å—å –≤ —Å–Ω–µ–≥—É, –ø–æ–¥–ª–æ–∂–∏–ª–∏ –ø...",40,"–ü–µ—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–µ—Å–æ –∑–∞–∫—Ä—ã–ª–æ—Å—å –≤ —Å–Ω–µ–≥—É, –ø–æ–¥–ª–æ–∂–∏–ª–∏ –ø..."
1,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ. | –í—Ä–æ–¥–µ –ø—Ä–æ—á–Ω—ã–µ. | –ù–∞ –≤–∏–¥ –ø—Ä–æ—á...,12,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ.
3,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª | –í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞....,37,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª
4,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,"–†–µ–∫–æ–º–µ–Ω–¥—É—é, –±—É–¥—É –±—Ä–∞—Ç—å –µ—â–µ | –ó–∞–∫–∞–∂—É | –º—ã–ú—ã–æ—á–Ω–æ...",20,–ë—É–¥—É –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë.
5,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–ï–ª–µ –ø–∞—Ö–Ω–µ—Ç. | –û–Ω –¥–∞–∂–µ –Ω–µ –ø–∞—Ö–Ω–µ—Ç. | –ü–∞—Ö–Ω–µ—Ç –∫–∞–∫–∏...,13,–ï–ª–µ –ø–∞—Ö–Ω–µ—Ç.
6,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,üî•üî•üî•üî•üî•üî• –∑–∞–ø–∞—Ö. | –ó–∞–ø–∞—Ö –æ–≥–æ–Ω—å) | –ó–∞–ø–∞—Ö –æ–≥–æ–Ω—å!!!!...,11,–ó–∞–ø–∞—Ö üî•!!!
7,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,–£–±–∏—Ä–∞–µ—Ç —Ä–∂–∞–≤—á–∏–Ω—É —Ö–æ—Ä–æ—à–æ —á–µ—Ä–µ–∑ 10-20 –º–∏–Ω—É—Ç | –°–æ...,76,–†–∂–∞–≤—á–∏–Ω—É —É–±–∏—Ä–∞–µ—Ç –æ—Ç–ª–∏—á–Ω–æ.
8,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,"–†–∂–∞–≤—á–∏–Ω–∞ —É–∂–µ —Ö–æ—Ä–æ—à–æ –≤—ä–µ–ª–∞—Å—å, –ø—Ä–∏—à–ª–æ—Å—å –Ω–µ—Å–∫–æ–ª—å–∫...",38,"–†–∂–∞–≤—á–∏–Ω–∞ —É–∂–µ —Ö–æ—Ä–æ—à–æ –≤—ä–µ–ª–∞—Å—å, –ø—Ä–∏—à–ª–æ—Å—å –Ω–µ—Å–∫–æ–ª—å–∫..."
9,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,"–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª...",24,"–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª–µ¬ª"
10,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,–í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞. | –í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª...,14,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª


In [None]:
final_result

Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,word_count,key_thought
0,/–°–ø–æ—Ä—Ç/–°—Ç—Ä–∞–π–∫–±–æ–ª –∏ –ø–µ–π–Ω—Ç–±–æ–ª/–ê–∫—Å–µ—Å—Å—É–∞—Ä—ã,karbi / –†—é–∫–∑–∞–∫ —Ç–∞–∫—Ç–∏—á–µ—Å–∫–∏–π —Ç—É—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∏–π - –∫–∞—Ä...,0.797101,neutral,"–ú–Ω–æ–≥–æ –¥–æ–ø –∫–∞—Ä–º–∞–Ω–æ–≤, —á–µ—Ö–æ–ª –æ—Ç –¥–æ–∂–¥—è, –ø—Ä–æ—Ä–µ–∑–∏–Ω–µ–Ω...",203,"–†—é–∫–∑–∞–∫ –≤–º–µ—Å—Ç–∏—Ç–µ–ª—å–Ω—ã–π, –ø—Ä–æ—á–Ω—ã–π, –µ—Å—Ç—å –∑–∞—â–∏—Ç–Ω—ã–π —á..."
1,/–°–ø–æ—Ä—Ç/–°—Ç—Ä–∞–π–∫–±–æ–ª –∏ –ø–µ–π–Ω—Ç–±–æ–ª/–ê–∫—Å–µ—Å—Å—É–∞—Ä—ã,karbi / –†—é–∫–∑–∞–∫ —Ç–∞–∫—Ç–∏—á–µ—Å–∫–∏–π —Ç—É—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∏–π - –∫–∞—Ä...,0.797101,neutral,"–í –ø–æ–¥–∞—Ä–æ–∫ —à—ë–ª –∫–æ–º–ø–∞—Å,, –Ω–∞–ª–æ–±–Ω—ã–π—Ñ–æ–Ω–∞—Ä—å,, –Ω–æ–∂–∞–Ω–µ...",69,"–í –ø–æ–¥–∞—Ä–æ–∫ –ø–æ–ª–æ–∂–∏–ª–∏ —Ñ–æ–Ω–∞—Ä–∏–∫ –Ω–∞–ª–æ–±–Ω—ã–π, –∫–æ–º–ø–∞—Å –∏ ..."


# –≠—Ç–∞–ø 1

In [50]:
import cudf.pandas  # –ò–º–ø–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ cuDF –∏ –∞–∫—Ç–∏–≤–∞—Ü–∏—è –µ–≥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
cudf.pandas.install()  # –£—Å—Ç–∞–Ω–æ–≤–∫–∞ cuDF –∫–∞–∫ –æ—Å–Ω–æ–≤–Ω–æ–≥–æ –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–∞ –¥–ª—è pandas
import os
import pandas as pd
from tqdm import tqdm
import torch
import pyarrow.parquet as pq
import dask.dataframe as dd

# # –ß—Ç–µ–Ω–∏–µ Parquet-—Ñ–∞–π–ª–∞ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º pyarrow
# table = pq.read_table('./reviews_keywords/wildberries_reviews_corrected.parquet')

# # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ pandas DataFrame
# df_pandas = table.to_pandas()

# # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ pandas DataFrame –≤ Dask DataFrame
# df_dask = dd.from_pandas(df_pandas, npartitions=100)  # –£–∫–∞–∂–∏—Ç–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –Ω—É–∂–Ω—ã—Ö –≤–∞–º —á–∞—Å—Ç–µ–π
# df_pandas = None
# table = None
# import gc
# gc.collect()
# df_dask

In [51]:
result = pd.read_csv("./reviews_keywords/wildberries_reviews.csv", nrows=1000)
result.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Unnamed: 0        1000 non-null   int64
 1   review_full_text  1000 non-null   object
 2   review_rating     1000 non-null   int64
 3   product           1000 non-null   object
 4   category          1000 non-null   object
 5   url               1000 non-null   object
 6   corrected_text    1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 540.9+ KB


In [52]:
# –û—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ –ø–æ 5 –∑–∞–ø–∏—Å–µ–π –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —É–Ω–∏–∫–∞–ª—å–Ω–æ–≥–æ –∑–Ω–∞—á–µ–Ω–∏—è –≤ —Å—Ç–æ–ª–±—Ü–µ 'product'
# result_limited = result.groupby('product').apply(lambda x: x.iloc[5:8]).reset_index(drop=True)
result_limited = result


In [79]:
import spacy
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast
import torch
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter

# –ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–æ—Å—Ç—É–ø–Ω–æ—Å—Ç–∏ GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
import hdbscan
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–æ–æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
# –ó–∞–≥—Ä—É–∂–∞–µ–º –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é –º–æ–¥–µ–ª–∏


# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞ –æ—Ç –°–±–µ—Ä–±–∞–Ω–∫–∞
tokenizer = BertTokenizerFast.from_pretrained("./reviews_keywords/fine_tuned_model")
model = AutoModel.from_pretrained("./reviews_keywords/fine_tuned_model").to(device)
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å —Å –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–µ–π
config = BertConfig.from_pretrained('./reviews_keywords/fine_tuned_model', output_hidden_states=True)
model_classification = BertForSequenceClassification.from_pretrained('./reviews_keywords/fine_tuned_model', config=config).to(device)

        # self.tokenizer_my = BertTokenizerFast.from_pretrained(self.model_path)
        #  # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏
        # self.classification_model = BertForSequenceClassification.from_pretrained(self.model_path).to(self.device)
        # # –ó–∞–≥—Ä—É–∑–∫–∞ –±–∞–∑–æ–≤–æ–π –º–æ–¥–µ–ª–∏ –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤
        # self.embedding_model = AutoModel.from_pretrained(self.model_path).to(self.device)

spacy.prefer_gpu()
# –ó–∞–≥—Ä—É–∑–∫–∞ –∏ –Ω–∞—Å—Ç—Ä–æ–π–∫–∞ –º–æ–¥–µ–ª–∏ SpaCy
nlp = spacy.load("ru_core_news_lg", disable=["ner", "tagger", "attribute_ruler", "lemmatizer"])

df = result_limited

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ pandas DataFrame –≤ Hugging Face Dataset
dataset = Dataset.from_pandas(df)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [80]:
import re

def clean_text(text):
    text = re.sub(r'[\n\r\t]+|\s{2,}', ' ', text)  # –û–±—ä–µ–¥–∏–Ω—è–µ–º —à–∞–≥–∏ –¥–ª—è –∑–∞–º–µ–Ω—ã –ø—Ä–æ–±–µ–ª–æ–≤
    text = re.sub(r'(?<!\.)\s*\.\s*|\s*\.\s*(?!\.)', '. ', text)  # –û–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è –∑–∞–º–µ–Ω—ã —Ç–æ—á–∫–∏
    return text.strip().rstrip('.')

def split_by_syntax(doc, max_length=100):
    """
    –†–∞–∑–¥–µ–ª—è–µ—Ç –¥–ª–∏–Ω–Ω—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –Ω–∞ –±–æ–ª–µ–µ –∫–æ—Ä–æ—Ç–∫–∏–µ, –æ—Å–Ω–æ–≤—ã–≤–∞—è—Å—å –Ω–∞ —Å–∏–Ω—Ç–∞–∫—Å–∏—á–µ—Å–∫–æ–º –∞–Ω–∞–ª–∏–∑–µ.
    """
    split_sentences = []
    current_chunk = []
    if len(doc.text) < 100:
        return [doc.text]
    for token in doc:
        current_chunk.append(token.text)
        
        # –ü—Ä–æ–≤–µ—Ä–∫–∞, –º–æ–∂–Ω–æ –ª–∏ –∑–∞–≤–µ—Ä—à–∏—Ç—å –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ
        if token.dep_ in ('punct', 'cc', 'conj') or len(' '.join(current_chunk)) > max_length:
            if len(' '.join(current_chunk)) > max_length:
                split_sentences.append(' '.join(current_chunk[:-1]))
                current_chunk = [current_chunk[-1]]
            else:
                split_sentences.append(' '.join(current_chunk))
                current_chunk = []

    # –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –æ—Å—Ç–∞–≤—à–µ–≥–æ—Å—è —Ñ—Ä–∞–≥–º–µ–Ω—Ç–∞
    if current_chunk:
        split_sentences.append(' '.join(current_chunk))

    return split_sentences

def split_reviews_into_sentences(batch):
    # –û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
    cleaned_texts = [clean_text(text) for text in batch['corrected_text']]
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤ —Å –ø–æ–º–æ—â—å—é nlp.pipe —Å —É–∫–∞–∑–∞–Ω–∏–µ–º batch_size
    docs = list(nlp.pipe(cleaned_texts, batch_size=64))

    # –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –∏ –∏—Ö –¥–∞–ª—å–Ω–µ–π—à–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    processed_sentences = []
    for doc in docs:
        sentences = []
        for sent in doc.sents:
            # –†–∞–∑–¥–µ–ª—è–µ–º –¥–ª–∏–Ω–Ω—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –ø–æ —Å–∏–Ω—Ç–∞–∫—Å–∏—Å—É
            split_sentences = split_by_syntax(sent, max_length=100)
            sentences.extend(split_sentences)
        processed_sentences.append(sentences)
    
    batch['sentences'] = processed_sentences
    
    return batch

dataset = dataset.map(split_reviews_into_sentences, batched=True, batch_size=32)

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º Dataset –æ–±—Ä–∞—Ç–Ω–æ –≤ pandas DataFrame
df = dataset.to_pandas()

# –í—ã–ø–æ–ª–Ω–∏–º explode –ø–æ —Å—Ç–æ–ª–±—Ü—É —Å –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è–º–∏
df_exploded = df.explode('sentences').reset_index(drop=True)

# –£–¥–∞–ª—è–µ–º –ª–∏—à–Ω–∏–µ —Å—Ç–æ–ª–±—Ü—ã, –∫–æ—Ç–æ—Ä—ã–µ –ø–æ—è–≤–∏–ª–∏—Å—å –ø–æ—Å–ª–µ explode
df_exploded = df_exploded.drop(columns=[col for col in df_exploded.columns if col.startswith('__index_level_')])

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º DataFrame –æ–±—Ä–∞—Ç–Ω–æ –≤ Hugging Face Dataset
dataset_exploded = Dataset.from_pandas(df_exploded)

from torch.cuda.amp import autocast

def compute_sentence_embeddings(sentences):
    # –§–∏–ª—å—Ç—Ä—É–µ–º —Å–ø–∏—Å–æ–∫, –æ—Å—Ç–∞–≤–ª—è—è —Ç–æ–ª—å–∫–æ —Å—Ç—Ä–æ–∫–∏
    sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
    
    if not sentences:
        raise ValueError("Input contains no valid strings.")

    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device)
    
    with torch.no_grad():
        with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º mixed precision –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
            outputs = model(**inputs)
    
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

def compute_embeddings_after_explode(batch):
    sentences = batch['sentences']

    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –≤—Å–µ —ç–ª–µ–º–µ–Ω—Ç—ã –≤ batch —è–≤–ª—è—é—Ç—Å—è —Å—Ç—Ä–æ–∫–∞–º–∏
    valid_sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
    
    if not valid_sentences:
        batch['sentence_embeddings'] = [[]] * len(sentences)  # –ï—Å–ª–∏ –Ω–µ—Ç –≤–∞–ª–∏–¥–Ω—ã—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π, –≤–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—É—Å—Ç—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
        return batch

    embeddings = compute_sentence_embeddings(valid_sentences)

    # –ü—Ä–∏–≤–µ–¥–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –∫ —Ç–∏–ø—É float32 –¥–ª—è –∫–æ–Ω—Å–∏—Å—Ç–µ–Ω—Ç–Ω–æ—Å—Ç–∏
    embeddings = embeddings.astype(np.float32)

    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π
    if len(embeddings) != len(valid_sentences):
        raise ValueError("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π.")
    
    # –ï—Å–ª–∏ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏ –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –∏—Å—Ö–æ–¥–Ω—ã–º, –∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä—É–µ–º –≤—ã—Ö–æ–¥–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ
    final_embeddings = []
    embed_idx = 0
    for sentence in sentences:
        if isinstance(sentence, str):
            final_embeddings.append(embeddings[embed_idx])
            embed_idx += 1
        else:
            final_embeddings.append(np.zeros(embeddings.shape[1], dtype=np.float32))  # –î–æ–±–∞–≤–ª—è–µ–º –Ω—É–ª–µ–≤—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –¥–ª—è –Ω–µ–≤–∞–ª–∏–¥–Ω—ã—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π

    batch['sentence_embeddings'] = final_embeddings
    return batch

# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —Ñ—É–Ω–∫—Ü–∏–∏
dataset = dataset_exploded.map(compute_embeddings_after_explode, batched=True, batch_size=128)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3243 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [81]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm import tqdm
import os
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
import nltk
from nltk.corpus import stopwords
import spacy
from tqdm import tqdm
import logging
import hdbscan  # HDBSCAN –¥–ª—è –±–æ–ª–µ–µ —Å—Ç–∞–±–∏–ª—å–Ω–æ–π –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π –∫–∞—Å—Ç–æ–º–Ω—ã—Ö –º–µ—Ç—Ä–∏–∫
from scipy.spatial.distance import pdist, squareform

In [82]:
# –û–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–∞ (GPU –∏–ª–∏ CPU)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# –ü–µ—Ä–µ–≤–æ–¥ –º–æ–¥–µ–ª–∏ –≤ —Ä–µ–∂–∏–º FP16, –µ—Å–ª–∏ —ç—Ç–æ –≤–æ–∑–º–æ–∂–Ω–æ
if use_cuda:
    model_classification = model_classification.half()

# –ü—Ä–∏–º–µ—Ä –¥–∞–Ω–Ω—ã—Ö (–∑–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ —Ä–µ–∞–ª—å–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ)
reviews = dataset_exploded["sentences"]

# –û—á–∏—Å—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –æ—Ç –Ω–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π
reviews = [str(review) for review in reviews if isinstance(review, str) and review.strip()]

# –°–æ–∑–¥–∞–Ω–∏–µ –∫–∞—Å—Ç–æ–º–Ω–æ–≥–æ Dataset –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ –æ—Ç–∑—ã–≤–æ–≤
class ReviewDataset(Dataset):
    def __init__(self, reviews, tokenizer, max_len=128):
        self.reviews = reviews
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {key: val.flatten() for key, val in encoding.items()}

# –°–æ–∑–¥–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç –∏ DataLoader
dataset = ReviewDataset(reviews, tokenizer)
batch_size = 32  # –†–∞–∑–º–µ—Ä –±–∞—Ç—á–∞ –º–æ–∂–Ω–æ –∏–∑–º–µ–Ω–∏—Ç—å –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç –æ–±—ä–µ–º–∞ –¥–æ—Å—Ç—É–ø–Ω–æ–π –ø–∞–º—è—Ç–∏ GPU
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# –ü–æ–ª—É—á–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π —Å –æ—Ç–æ–±—Ä–∞–∂–µ–Ω–∏–µ–º –ø—Ä–æ–≥—Ä–µ—Å—Å–∞
predictions = []

from torch.cuda.amp import autocast  # –ò–º–ø–æ—Ä—Ç–∏—Ä—É–µ–º autocast –¥–ª—è —Å–º–µ—à–∞–Ω–Ω–æ–π —Ç–æ—á–Ω–æ—Å—Ç–∏

for batch in tqdm(dataloader, desc="–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤"):
    batch = {key: val.to(device) for key, val in batch.items()}
    
    with torch.no_grad():
        with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Å–º–µ—à–∞–Ω–Ω—É—é —Ç–æ—á–Ω–æ—Å—Ç—å
            outputs = model_classification(**batch)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            batch_predictions = (probabilities[:, 1] > 0.7).cpu().numpy()  # –ò—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ—Ä–æ–≥ 0.7
            predictions.extend(batch_predictions)

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ DataFrame, –µ—Å–ª–∏ —ç—Ç–æ –µ—â–µ –Ω–µ —Å–¥–µ–ª–∞–Ω–æ
if not isinstance(dataset_exploded, pd.DataFrame):
    dataset_exploded = pd.DataFrame(dataset_exploded)

# –ü—Ä–æ–≤–µ—Ä–∫–∞ –∏ –æ–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–µ—Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏—è –¥–ª–∏–Ω—ã
if len(predictions) != len(dataset_exploded):
    print(f"Warning: Length of predictions ({len(predictions)}) does not match length of index ({len(dataset_exploded)})")
    
    # –ü—Ä–∏–º–µ—Ä: –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ –Ω–µ–¥–æ—Å—Ç–∞—é—â–∏—Ö –∑–Ω–∞—á–µ–Ω–∏–π
    if len(predictions) < len(dataset_exploded):
        missing_count = len(dataset_exploded) - len(predictions)
        predictions.extend([0] * missing_count)  # –î–æ–±–∞–≤–ª—è–µ–º –Ω—É–ª–∏ –≤ —Å–ª—É—á–∞–µ –Ω–µ–¥–æ—Å—Ç–∞—Ç–∫–∞ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π

    elif len(predictions) > len(dataset_exploded):
        predictions = predictions[:len(dataset_exploded)]  # –û–±—Ä–µ–∑–∞–µ–º —Å–ø–∏—Å–æ–∫ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π

# –ü—Ä–∏—Å–æ–µ–¥–∏–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –∫ –¥–∞—Ç–∞—Å–µ—Ç—É
dataset_exploded['predictions'] = predictions
dataset_exploded.head()



–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤:   0%|                                                                                                                                                                       | 0/102 [00:00<?, ?it/s]

–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 102/102 [00:16<00:00,  6.16it/s]


Unnamed: 0.1,Unnamed: 0,review_full_text,review_rating,product,category,url,corrected_text,sentences,__index_level_0__,predictions
0,0,–†–∞–±–æ—Ç–∞–µ—Ç —Ö–æ—Ä–æ—à–æ.,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–†–∞–±–æ—Ç–∞–µ—Ç —Ö–æ—Ä–æ—à–æ.,–†–∞–±–æ—Ç–∞–µ—Ç —Ö–æ—Ä–æ—à–æ,0,False
1,1,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...","–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥.",1,False
2,1,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",–ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏—Å–ø—ã—Ç—ã–≤–∞—Ç—å,2,True
3,2,"–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...","–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...",3,True
4,3,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è.,4,True


In [83]:
# –ì—Ä—É–ø–ø–∏—Ä–æ–≤–∫–∞ –ø–æ 'corrected_text' –∏ –æ–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –≤ 'sentences'
# df_filtered_reviews = dataset_exploded[dataset_exploded["predictions"] == 1].groupby(["review_full_text", "review_rating", "product", "category", "url", "corrected_text"])['sentences'].apply(lambda x: " ".join(x)).reset_index()
df_filtered_reviews = dataset_exploded[dataset_exploded["predictions"] == 1][["review_full_text", "review_rating", "product", "category", "url", "corrected_text", 'sentences']]
df_filtered_reviews

Unnamed: 0,review_full_text,review_rating,product,category,url,corrected_text,sentences
2,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",–ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏—Å–ø—ã—Ç—ã–≤–∞—Ç—å
3,"–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...","–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ..."
4,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è.
5,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,–ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ —Å–±–æ—Ä–∫—É –∏ –∫—Ä–µ–ø–ª–µ–Ω–∏...
7,–ú—É–∂ –µ—â—ë –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–ú—É–∂ –µ—â—ë –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª,–ú—É–∂ –µ—â—ë –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª
...,...,...,...,...,...,...,...
3228,–°—Ä–µ–¥—Å—Ç–≤–æ —Ä–∞–±–æ—á–µ–µ –Ω–∞–¥–æ –±—Ä–∞—Ç—å,5,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,https://www.wildberries.ru/catalog/155565431/f...,–°—Ä–µ–¥—Å—Ç–≤–æ —Ä–∞–±–æ—á–µ–µ –Ω–∞–¥–æ –±—Ä–∞—Ç—å,–°—Ä–µ–¥—Å—Ç–≤–æ —Ä–∞–±–æ—á–µ–µ –Ω–∞–¥–æ –±—Ä–∞—Ç—å
3235,–•–æ—Ä–æ—à–æ —É–±–∏—Ä–∞–µ—Ç —Ä–∂–∞–≤—á–∏–Ω—É,5,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,https://www.wildberries.ru/catalog/155565431/f...,–•–æ—Ä–æ—à–æ —É–±–∏—Ä–∞–µ—Ç —Ä–∂–∞–≤—á–∏–Ω—É,–•–æ—Ä–æ—à–æ —É–±–∏—Ä–∞–µ—Ç —Ä–∂–∞–≤—á–∏–Ω—É
3239,"—Ç–∞–∫ –Ω–µ –æ —á–µ–º, —ç—Ñ—Ñ–µ–∫—Ç–∞ –ø–æ—á—Ç–∏ –Ω–µ—Ç, –±–µ—Å—Ç–æ–ª–∫–æ–≤–∞—è —Ö...",3,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,https://www.wildberries.ru/catalog/155565431/f...,"–¢–∞–∫ –Ω–µ –æ —á–µ–º, —ç—Ñ—Ñ–µ–∫—Ç–∞ –ø–æ—á—Ç–∏ –Ω–µ—Ç, –±–µ—Å—Ç–æ–ª–∫–æ–≤–∞—è —Ö...","–¢–∞–∫ –Ω–µ –æ —á–µ–º, —ç—Ñ—Ñ–µ–∫—Ç–∞ –ø–æ—á—Ç–∏ –Ω–µ—Ç, –±–µ—Å—Ç–æ–ª–∫–æ–≤–∞—è —Ö..."
3240,–ü–æ—Å–º–æ—Ç—Ä–∏–º —á—Ç–æ –∏ –∫–∞–∫ –±—É–¥–µ—Ç,5,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,https://www.wildberries.ru/catalog/155565431/f...,–ü–æ—Å–º–æ—Ç—Ä–∏–º —á—Ç–æ –∏ –∫–∞–∫ –±—É–¥–µ—Ç,–ü–æ—Å–º–æ—Ç—Ä–∏–º —á—Ç–æ –∏ –∫–∞–∫ –±—É–¥–µ—Ç


In [84]:
df_filtered_reviews["corrected_text"] = df_filtered_reviews["corrected_text"].astype(str)
df_sorted_by_length = df_filtered_reviews.sort_values(by="corrected_text", key=lambda x: x.str.len(), ascending=False)
df_sorted_by_length.iloc[0]["corrected_text"]

'–ù–µ –ø–µ—Ä–≤–∞—è –ª–µ–±–µ–¥–∫–∞ –∏–∑ –ø–æ–¥–Ω–µ–±–µ—Å–Ω–æ–π. –ü–µ—Ä–µ–¥ —É—Å—Ç–∞–Ω–æ–≤–∫–æ–π —Ä–∞–∑–æ–±—Ä–∞–ª, –ø–æ—Å–º–æ—Ç—Ä–µ–ª —Ä–µ–∞–ª—å–Ω–æ –ª–∏ –≤–ª–∞–≥–æ–∑–∞—â–∏—â–µ–Ω–Ω–∞—è - —Ä–µ–∞–ª—å–Ω–æ, –µ—Å—Ç—å —Å–∞–ª—å–Ω–∏–∫–∏! –ï—Å–ª–∏ –Ω–µ —Ç–æ–ø–∏—Ç—å –µ–µ –≤ –≤–æ–¥–µ –ø–æ–ª–Ω–æ—Å—Ç—å—é, —Ç–æ –≤–ª–∞–≥–∏ –≤ –Ω–µ–π –Ω–µ –±—É–¥–µ—Ç, –æ—Ç –±—Ä—ã–∑–≥, –¥–æ–∂–¥—è, –æ–±–∏–ª—å–Ω–æ–≥–æ –ø–æ–ª–∏–≤–∞–Ω–∏—è –≤–æ–¥–æ–π —Å–≤–µ—Ä—Ö—É –∑–∞—â–∏—Ç—ã –±—É–¥–µ—Ç –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ. –°–º–∞–∑–∫–∏ –≤ —Ä–µ–¥—É–∫—Ç–æ—Ä–µ –±—ã–ª–æ –Ω–µ –º–∞–ª–æ, –Ω–æ –≤—Å–µ —Ä–∞–≤–Ω–æ –¥–æ–±–∞–≤–∏–ª –µ—â–µ. –°–æ–ª–µ–Ω–æ–∏–¥—ã –≤ –±–ª–æ–∫–µ —É–ø—Ä–∞–≤–ª–µ–Ω–∏—è —Å–∞–º—ã–µ –ø—Ä–æ—Å—Ç—ã–µ, –¥–ª—è —Ä—ã–±–∞–∫–æ–≤ –∏ –æ—Ö–æ—Ç–Ω–∏–∫–æ–≤ –ø–æ–π–¥–µ—Ç (–µ—Å–ª–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç—å—Å—è –µ–π —Ä–µ–¥–∫–æ). –ë–ª–æ–∫ —Å–æ–ª–µ–Ω–æ–∏–¥–æ–≤ –Ω–µ –≤–ª–∞–≥–æ–∑–∞—â–∏—â–µ–Ω - –ø—Ä—è—á—å—Ç–µ –≤ –ø–æ–¥–∫–æ–ø–∞—Ç—å—Å—è –µ–≥–æ –ø–æ–¥–∞–ª—å—à–µ –æ—Ç –≤–ª–∞–≥–∏.  –ü—Ä–æ–≤–æ–¥–∞ —Ä–æ–¥–Ω—ã–µ –∑–∞–º–µ–Ω–∏–ª —Ç.–∫. –Ω–∞ –º–æ–π –≤–∑–≥–ª—è–¥ —Ç–æ–Ω–∫–æ–≤

In [85]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ spaCy –¥–ª—è —Ä—É—Å—Å–∫–æ–≥–æ —è–∑—ã–∫–∞
nlp = spacy.load("ru_core_news_lg")

# –£—Å—Ç–∞–Ω–æ–≤–∫–∞ —Å—Ç–æ–ø-—Å–ª–æ–≤
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
from loguru import logger
logger.add('./reviews_keywords/clustering.log', encoding='utf-8')

3

In [87]:
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

# –ö–∞—Å—Ç–æ–º–Ω—ã–π Dataset –¥–ª—è DataLoader
class SentencesDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # –¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è —Å—Ç—Ä–æ–∫–∏
        encoded = self.tokenizer(
            self.sentences[idx],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length
        )
        # –í–æ–∑–≤—Ä–∞—â–∞–µ–º input_ids –∏ attention_mask
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ –±–∞—Ç—á–µ–π –∏ –ø–æ–ª—É—á–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤
def get_embeddings_batchwise(sentences, model, tokenizer, batch_size=32):
    dataset = SentencesDataset(sentences, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_embeddings = []
    model.eval()
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            all_embeddings.extend(embeddings)
    
    return np.array(all_embeddings)


# –ü–µ—Ä–µ–≤–æ–¥ –º–æ–¥–µ–ª–∏ –≤ —Ä–µ–∂–∏–º FP16, –µ—Å–ª–∏ —ç—Ç–æ –≤–æ–∑–º–æ–∂–Ω–æ
if torch.cuda.is_available():
    model_classification = model_classification.half()

def find_centroid(embeddings):
    return np.mean(embeddings, axis=0)

def extract_key_thought(sentences):
    embeddings = get_embeddings_batchwise(sentences, model, tokenizer, batch_size=32)
    centroid = find_centroid(embeddings)
    similarities = cosine_similarity(embeddings, [centroid])
    key_sentence_index = np.argmax(similarities)
    return sentences[key_sentence_index]

def count_words(cluster_sentences):
    words = " ".join(cluster_sentences).split()
    return len(words)


In [88]:
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è —Ä–µ–∑—é–º–∏—Ä–æ–≤–∞–Ω–∏—è –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ (–µ—Å–ª–∏ –æ–Ω–∞ –¥–æ—Å—Ç—É–ø–Ω–∞)
summarizer = pipeline("summarization", model="IlyaGusev/rugpt3medium_sum_gazeta")

# –§—É–Ω–∫—Ü–∏–∏ –¥–ª—è —Ä–∞–∑–ª–∏—á–Ω—ã—Ö –º–µ—Ç–æ–¥–æ–≤ –∏–∑–≤–ª–µ—á–µ–Ω–∏—è –∫–ª—é—á–µ–≤–æ–π –º—ã—Å–ª–∏
def extract_key_thought_method_1(sentences):
    embeddings = get_embeddings_batchwise(sentences, model, tokenizer, batch_size=32)
    centroid = find_centroid(embeddings)
    similarities = cosine_similarity(embeddings, [centroid])
    key_sentence_index = np.argmax(similarities)
    return sentences[key_sentence_index]

def extract_key_thought_method_2(sentences):
    embeddings = get_embeddings_batchwise(sentences, model, tokenizer, batch_size=32)
    centroid = find_centroid(embeddings)
    similarities = cosine_similarity(embeddings, [centroid])
    top_indices = np.argsort(similarities, axis=0)[-3:]  # –í—ã–±–∏—Ä–∞–µ–º —Ç—Ä–∏ –Ω–∞–∏–±–æ–ª–µ–µ –±–ª–∏–∑–∫–∏—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è
    key_thought = " ".join([sentences[idx] for idx in top_indices.flatten()])
    return key_thought

def extract_key_thought_method_4(sentences):
    embeddings = get_embeddings_batchwise(sentences, model, tokenizer, batch_size=32)
    centroid = find_centroid(embeddings)
    similarities = cosine_similarity(embeddings, [centroid])
    key_sentence_index = np.argmax(similarities)
    key_sentence = sentences[key_sentence_index]

    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, –Ω–µ –ø—É—Å—Ç–æ–µ –ª–∏ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –∏ –Ω–µ —Å–æ—Å—Ç–æ–∏—Ç –ª–∏ –æ–Ω–æ —Ç–æ–ª—å–∫–æ –∏–∑ —Å—Ç–æ–ø-—Å–ª–æ–≤
    if len(key_sentence.strip()) == 0:
        return "–ü—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –ø—É—Å—Ç–æ–µ –∏–ª–∏ —Å–æ—Å—Ç–æ–∏—Ç —Ç–æ–ª—å–∫–æ –∏–∑ —Å—Ç–æ–ø-—Å–ª–æ–≤."

    vectorizer = CountVectorizer(max_features=5, stop_words=stop_words)  # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Ä—É—Å—Å–∫–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
    X = vectorizer.fit_transform([key_sentence])
    
    if X.shape[1] == 0:
        return f"–û—Å–Ω–æ–≤–Ω–æ–π –º—ã—Å–ª–∏ –Ω–µ—Ç"  # –í–æ–∑–≤—Ä–∞—â–∞–µ–º –∏—Å—Ö–æ–¥–Ω–æ–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ

    key_phrases = " ".join(vectorizer.get_feature_names_out())
    
    return f"–û—Å–Ω–æ–≤–Ω–∞—è –º—ã—Å–ª—å: {key_phrases}."


def extract_key_thought_method_5(sentences):
    embeddings = get_embeddings_batchwise(sentences, model, tokenizer, batch_size=32)
    centroid = find_centroid(embeddings)
    similarities = cosine_similarity(embeddings, [centroid])
    top_indices = np.argsort(similarities, axis=0)[-3:]  # –í—ã–±–∏—Ä–∞–µ–º —Ç—Ä–∏ –Ω–∞–∏–±–æ–ª–µ–µ –±–ª–∏–∑–∫–∏—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è

    G = nx.Graph()
    for idx in top_indices.flatten():
        G.add_node(sentences[idx])
        G.add_edge('Main Idea', sentences[idx])

    # –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ–º –≥—Ä–∞—Ñ –≤ —Å—Ç—Ä–æ–∫–æ–≤–æ–µ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–∏–µ
    concept_map_str = ", ".join([str(node) for node in G.nodes if node != 'Main Idea'])
    return f"–ö–∞—Ä—Ç–∞ –∫–æ–Ω—Ü–µ–ø—Ü–∏–π: {concept_map_str}"



def dynamic_summarization(text, summarizer, max_length_ratio=0.5, min_length=25, max_increase_step=50):
    input_length = len(text.split())
    # –î–∏–Ω–∞–º–∏—á–µ—Å–∫–∏ —É—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º max_length, —á—Ç–æ–±—ã –æ–Ω–æ –Ω–µ –ø—Ä–µ–≤—ã—à–∞–ª–æ –¥–ª–∏–Ω—É –≤—Ö–æ–¥–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞
    max_length = min(int(input_length * max_length_ratio), input_length)
    
    # –£–≤–µ–ª–∏—á–∏–≤–∞–µ–º max_length, –µ—Å–ª–∏ —ç—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ
    while True:
        try:
            summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
            return summary[0]['summary_text']
        except ValueError as e:
            if "Input length of input_ids" in str(e):
                max_length += max_increase_step
                if max_length > 800:  # –û–≥—Ä–∞–Ω–∏—á–∏–≤–∞–µ–º –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–µ —É–≤–µ–ª–∏—á–µ–Ω–∏–µ
                    raise ValueError(f"Cannot process the text even after increasing max_length. Current max_length: {max_length}")
            else:
                raise e

def recursive_summarization(text, summarizer, max_length_ratio=0.5, min_length=25, threshold_length=200, max_increase_step=50):
    if len(text.split()) <= threshold_length:
        return dynamic_summarization(text, summarizer, max_length_ratio, min_length, max_increase_step)
    
    sentences = text.split('. ')
    mid_point = len(sentences) // 2
    
    first_half = recursive_summarization('. '.join(sentences[:mid_point]), summarizer, max_length_ratio, min_length, threshold_length, max_increase_step)
    second_half = recursive_summarization('. '.join(sentences[mid_point:]), summarizer, max_length_ratio, min_length, threshold_length, max_increase_step)
    
    combined_summary = first_half + '. ' + second_half
    return dynamic_summarization(combined_summary, summarizer, max_length_ratio, min_length, max_increase_step)

def extract_key_thought_method_3(sentences):
    text = " ".join(sentences)
    return recursive_summarization(text, summarizer, max_length_ratio=0.5, min_length=25, threshold_length=200, max_increase_step=50)


# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ –≤—Å–µ—Ö –º–µ—Ç–æ–¥–æ–≤ –∫ –∫–∞–∂–¥–æ–º—É –∫–ª–∞—Å—Ç–µ—Ä—É
def apply_key_thought_extraction(df):
    logger.info(f"extract_key_thought_method_1")
    df['Key_Thought_Method_1'] = df['sentences'].apply(extract_key_thought_method_1)
    logger.info(f"extract_key_thought_method_2")
    df['Key_Thought_Method_2'] = df['sentences'].apply(extract_key_thought_method_2)
    # logger.info(f"extract_key_thought_method_3")
    # df['Key_Thought_Method_3'] = df['sentences'].apply(extract_key_thought_method_3)
    logger.info(f"extract_key_thought_method_4")
    df['Key_Thought_Method_4'] = df['sentences'].apply(extract_key_thought_method_4)
    logger.info(f"extract_key_thought_method_5")
    df['Key_Thought_Method_5'] = df['sentences'].apply(extract_key_thought_method_5)
    return df



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'GPT2LMHeadModel' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGenera

In [89]:
!export PYTHONIOENCODING=utf-8

In [90]:
from contextlib import contextmanager
@contextmanager
def suppress_output():
    devnull = os.open(os.devnull, os.O_WRONLY)
    old_stdout = os.dup(1)
    old_stderr = os.dup(2)
    os.dup2(old_stdout, 1)
    os.dup2(old_stderr, 2)
    try:
        yield
    finally:
        os.dup2(old_stdout, 1)
        os.dup2(old_stderr, 2)
        os.close(devnull)

In [96]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import pdist
from contextlib import contextmanager
from tqdm import tqdm
import matplotlib.pyplot as plt

def compute_mean_distance(cluster_indices, embeddings):
    # –ï—Å–ª–∏ –≤ –∫–ª–∞—Å—Ç–µ—Ä–µ –º–µ–Ω—å—à–µ –¥–≤—É—Ö —ç–ª–µ–º–µ–Ω—Ç–æ–≤, —Å—Ä–µ–¥–Ω–µ–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ –Ω–µ –≤—ã—á–∏—Å–ª—è–µ—Ç—Å—è
    if len(cluster_indices) < 2:
        return 0
    
    # –í—ã—á–∏—Å–ª—è–µ–º –ø–æ–ø–∞—Ä–Ω—ã–µ –∫–æ—Å–∏–Ω—É—Å–Ω—ã–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è –º–µ–∂–¥—É —ç–ª–µ–º–µ–Ω—Ç–∞–º–∏ –∫–ª–∞—Å—Ç–µ—Ä–∞
    cluster_embeddings = embeddings[cluster_indices]
    pairwise_distances = pdist(cluster_embeddings, metric='cosine')
    
    # –í–æ–∑–≤—Ä–∞—â–∞–µ–º —Å—Ä–µ–¥–Ω–µ–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ
    return np.mean(pairwise_distances)

def cluster_product_reviews(df, model, tokenizer, batch_size=32, max_distance=0.5, min_cluster_size=10):
    grouped = df.groupby('product')
    final_clusters_list = []

    # logger.info("–ù–∞—á–∞–ª–æ –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ –ø—Ä–æ–¥—É–∫—Ç–æ–≤.")

    for product, group in tqdm(grouped, desc="Processing products"):
        # logger.info(f"–û–±—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–æ–¥—É–∫—Ç–∞: {product}")
        sentences = group["sentences"].tolist()
        embeddings = get_embeddings_batchwise(sentences, model, tokenizer, batch_size=batch_size)
        embeddings = np.array(embeddings, dtype=np.float32)
        
        num_points = len(embeddings)
        # logger.info(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–æ—á–µ–∫ –¥–ª—è –ø—Ä–æ–¥—É–∫—Ç–∞ {product}: {num_points}")

        if num_points < min_cluster_size:
            if num_points == 1:
                initial_clusters = np.array([1] * num_points)
            else:
                initial_clusters = np.array([1] * (num_points // 2) + [2] * (num_points - num_points // 2))
            group['final_cluster'] = initial_clusters
            group['mean_distance'] = 0  # –£—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º –∑–Ω–∞—á–µ–Ω–∏–µ –ø–æ —É–º–æ–ª—á–∞–Ω–∏—é
        else:
            # –ò–µ—Ä–∞—Ä—Ö–∏—á–µ—Å–∫–∞—è –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏—è —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –º–µ—Ç–æ–¥–∞ 'ward'
            distance_matrix = pdist(embeddings, metric='cosine')  # –ò—Å–ø–æ–ª—å–∑—É–µ–º –∫–æ—Å–∏–Ω—É—Å–Ω–æ–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ
            linkage_matrix = linkage(distance_matrix, method='ward')
            initial_clusters = fcluster(linkage_matrix, t=max_distance, criterion='distance')
        
            # logger.info(f"–ù–∞—á–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤ –¥–ª—è –ø—Ä–æ–¥—É–∫—Ç–∞ {product}: {len(np.unique(initial_clusters))}")

            # –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –¥–µ–Ω–¥—Ä–æ–≥—Ä–∞–º–º—ã –¥–ª—è —Ç–µ–∫—É—â–µ–≥–æ –ø—Ä–æ–¥—É–∫—Ç–∞
            # plt.figure(figsize=(10, 7))
            # plt.title(f"–î–µ–Ω–¥—Ä–æ–≥—Ä–∞–º–º–∞ –¥–ª—è –ø—Ä–æ–¥—É–∫—Ç–∞: {product}")
            # dendrogram(linkage_matrix)
            # plt.show()
        
            # –û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤ –¥–æ —Ç–µ—Ö –ø–æ—Ä, –ø–æ–∫–∞ –∏—Ö —Ä–∞–∑–º–µ—Ä –Ω–µ –±—É–¥–µ—Ç —Ö–æ—Ç—è –±—ã min_size —ç–ª–µ–º–µ–Ω—Ç–æ–≤
            final_clusters = initial_clusters
            # logger.info(f"–ò—Ç–æ–≥–æ–≤–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤ –¥–ª—è –ø—Ä–æ–¥—É–∫—Ç–∞ {product}: {len(np.unique(final_clusters))}")
        
            group = group.copy()
            group['final_cluster'] = final_clusters
            
            # –î–æ–±–∞–≤–ª—è–µ–º –Ω–æ–≤—ã–π —Å—Ç–æ–ª–±–µ—Ü —Å–æ —Å—Ä–µ–¥–Ω–∏–º —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ–º –≤–Ω—É—Ç—Ä–∏ –∫–ª–∞—Å—Ç–µ—Ä–∞
            group['mean_distance'] = group['final_cluster'].apply(lambda cluster_id: compute_mean_distance(np.where(final_clusters == cluster_id)[0], embeddings))
        
        grouped_clusters = group.groupby('final_cluster').agg({
            'category': 'first',
            'product': 'first',
            'review_rating': 'mean',
            'url': 'first',
            'sentences': lambda x: list(x),
            'mean_distance': 'first',  # –î–æ–±–∞–≤–ª—è–µ–º —Å—Ä–µ–¥–Ω–µ–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ –≤ –∏—Ç–æ–≥–æ–≤—ã–π DataFrame
        }).reset_index(drop=True)
                
        grouped_clusters['elem_count'] = grouped_clusters['sentences'].apply(lambda x: len(x))  
        grouped_clusters['word_count'] = grouped_clusters['sentences'].apply(count_words)        
        grouped_clusters['key_thought'] = grouped_clusters['sentences'].apply(extract_key_thought)
        grouped_clusters = grouped_clusters.sort_values(by='word_count', ascending=False)
        
        final_clusters_list.append(grouped_clusters)
    
    # logger.info("–ö–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞.")
    return pd.concat(final_clusters_list, ignore_index=True)

# –ü—Ä–∏–º–µ–Ω—è–µ–º –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏—é –∫–æ –≤—Å–µ–º—É DataFrame
final_df = cluster_product_reviews(df_filtered_reviews, model, tokenizer, batch_size=32, max_distance=0.5, min_cluster_size=5)

# –î–ª—è –≤—ã–≤–æ–¥–∞ —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞
# logger.info(f"–ò—Ç–æ–≥–æ–≤—ã–π DataFrame —Å–æ–¥–µ—Ä–∂–∏—Ç {len(final_df)} –∑–∞–ø–∏—Å–µ–π.")
final_df.sort_values("word_count", ascending=False)

Processing products:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                                  | 11/16 [01:19<00:36,  7.23s/it]


KeyboardInterrupt: 

In [None]:
final_df[final_df["elem_count"] == 1]

Unnamed: 0,category,product,review_rating,url,sentences,mean_distance,elem_count,word_count,key_thought
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Autobrand_AED / –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è led —Ñ–∞—Ä–∞ 30w —Å ...,2.0,https://www.wildberries.ru/catalog/90401367/fe...,"[–ù–æ —á–µ—Ä–µ–∑ –º–µ—Å—è—Ü —Å–≥–æ—Ä–µ–ª –æ–¥–∏–Ω –¥–∏–æ–¥, –≤—Ç–æ—Ä–æ–π –µ—â—ë —á...",0.0,1,13,"–ù–æ —á–µ—Ä–µ–∑ –º–µ—Å—è—Ü —Å–≥–æ—Ä–µ–ª –æ–¥–∏–Ω –¥–∏–æ–¥, –≤—Ç–æ—Ä–æ–π –µ—â—ë —á–µ..."
7,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.0,https://www.wildberries.ru/catalog/124191551/f...,[–í –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω –ö–æ—Å—Ç–∞–Ω–∞–π —Å–∫—É—é –æ–±–ª–∞—Å—Ç—å –≥–æ—Ä–æ–¥ –õ–∏—Å–∞–∫...,0.0,1,14,–í –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω –ö–æ—Å—Ç–∞–Ω–∞–π —Å–∫—É—é –æ–±–ª–∞—Å—Ç—å –≥–æ—Ä–æ–¥ –õ–∏—Å–∞–∫–æ...
8,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.0,https://www.wildberries.ru/catalog/124191551/f...,[–û—Ç–ª–∏—á–Ω–æ –≤—Å—ë –ø–æ–∫–∞ –Ω–µ —É—Å—Ç–∞–Ω–æ–≤–∏–º –≤—Å–∫—Ä—ã—Ç—å –ø–æ–∫–∞ –Ω–µ...,0.0,1,13,–û—Ç–ª–∏—á–Ω–æ –≤—Å—ë –ø–æ–∫–∞ –Ω–µ —É—Å—Ç–∞–Ω–æ–≤–∏–º –≤—Å–∫—Ä—ã—Ç—å –ø–æ–∫–∞ –Ω–µ ...
9,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.0,https://www.wildberries.ru/catalog/124191551/f...,[–ö–æ—Ä–æ–±–∫–∞ —Å –ª–µ–±–µ–¥–∫–æ–π –ø—Ä–∏—à–ª–∞ –∫–∞–∫ –±—É–¥—Ç–æ –µ—ë —Å–æ–±–∞–∫–∏...,0.0,1,13,–ö–æ—Ä–æ–±–∫–∞ —Å –ª–µ–±–µ–¥–∫–æ–π –ø—Ä–∏—à–ª–∞ –∫–∞–∫ –±—É–¥—Ç–æ –µ—ë —Å–æ–±–∞–∫–∏ ...
10,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.0,https://www.wildberries.ru/catalog/124191551/f...,"[–í—Å–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –≤—Å–µ, —É—Å—Ç—Ä–∞–∏–≤–∞–µ—Ç –µ–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω–æ–µ —á—Ç–æ...",0.0,1,12,"–í—Å–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –≤—Å–µ, —É—Å—Ç—Ä–∞–∏–≤–∞–µ—Ç –µ–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω–æ–µ —á—Ç–æ ..."
...,...,...,...,...,...,...,...,...,...
1366,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,5.0,https://www.wildberries.ru/catalog/155565431/f...,[—Ñ–æ—Ç–æ],0.0,1,1,—Ñ–æ—Ç–æ
1367,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,5.0,https://www.wildberries.ru/catalog/155565431/f...,[–≤—ã—Å—Ç—É–ø–∞–ª–∞],0.0,1,1,–≤—ã—Å—Ç—É–ø–∞–ª–∞
1368,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,5.0,https://www.wildberries.ru/catalog/155565431/f...,[–±—ã–ª–∏],0.0,1,1,–±—ã–ª–∏
1369,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,3.0,https://www.wildberries.ru/catalog/155565431/f...,[–µ–µ],0.0,1,1,–µ–µ


In [93]:
final_df[final_df["elem_count"] == 1].info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 1205 entries, 0 to 1370
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   category       1205 non-null   object
 1   product        1205 non-null   object
 2   review_rating  1205 non-null   float64
 3   url            1205 non-null   object
 4   sentences      1205 non-null   list
 5   mean_distance  1205 non-null   float64
 6   elem_count     1205 non-null   int64
 7   word_count     1205 non-null   int64
 8   key_thought    1205 non-null   object
dtypes: float64(2), int64(2), list(1), object(4)
memory usage: 377.9+ KB


In [94]:
final_df_filtered = final_df[final_df["elem_count"] > 2]
final_df_filtered.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 59 entries, 6 to 1301
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   category       59 non-null     object
 1   product        59 non-null     object
 2   review_rating  59 non-null     float64
 3   url            59 non-null     object
 4   sentences      59 non-null     list
 5   mean_distance  59 non-null     float64
 6   elem_count     59 non-null     int64
 7   word_count     59 non-null     int64
 8   key_thought    59 non-null     object
dtypes: float64(2), int64(2), list(1), object(4)
memory usage: 20.9+ KB


In [95]:
# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ñ—É–Ω–∫—Ü–∏—é –∫ –∏—Ç–æ–≥–æ–≤–æ–º—É DataFrame
final_df_filtered = apply_key_thought_extraction(final_df_filtered)

[32m2024-08-23 11:20:25.582[0m | [1mINFO    [0m | [36m__main__[0m:[36mapply_key_thought_extraction[0m:[36m102[0m - [1mextract_key_thought_method_1[0m
[32m2024-08-23 11:20:33.618[0m | [1mINFO    [0m | [36m__main__[0m:[36mapply_key_thought_extraction[0m:[36m104[0m - [1mextract_key_thought_method_2[0m
[32m2024-08-23 11:20:47.266[0m | [1mINFO    [0m | [36m__main__[0m:[36mapply_key_thought_extraction[0m:[36m108[0m - [1mextract_key_thought_method_4[0m


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
final_df_filtered

Unnamed: 0,category,product,review_rating,url,sentences,mean_distance,elem_count,word_count,key_thought,Key_Thought_Method_1,Key_Thought_Method_2
2,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.0,https://www.wildberries.ru/catalog/124191551/f...,"[–ï—Å–ª–∏ –Ω–µ —Ç–æ–ø–∏—Ç—å –µ–µ –≤ –≤–æ–¥–µ –ø–æ–ª–Ω–æ—Å—Ç—å—é, —Ç–æ –≤–ª–∞–≥–∏ ...",0.10614,3,91,–ö–æ—Ä–æ–±–∫–∞ —Å –ª–µ–±–µ–¥–∫–æ–π –ø—Ä–∏—à–ª–∞ –∫–∞–∫ –±—É–¥—Ç–æ –µ—ë —Å–æ–±–∞–∫–∏ ...,–ö–æ—Ä–æ–±–∫–∞ —Å –ª–µ–±–µ–¥–∫–æ–π –ø—Ä–∏—à–ª–∞ –∫–∞–∫ –±—É–¥—Ç–æ –µ—ë —Å–æ–±–∞–∫–∏ ...,–ë—Ä–∞–¥ –¥–æ —ç—Ç–æ–≥–æ –º–µ–Ω—å—à–µ–π –º–æ—â–Ω–æ—Å—Ç—å –Ω–µ–º–Ω–æ–≥–æ –Ω–µ —Ö–≤–∞—Ç...
3,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,3.333333,https://www.wildberries.ru/catalog/124191551/f...,"[–†—ã—á–∞–≥ —Å–≤–æ–±–æ–¥–Ω–æ–≥–æ —Ö–æ–¥–∞ –ª–µ–±–µ–¥–∫–∏ –Ω–µ –≥–µ—Ä–º–µ—Ç–∏—á–Ω—ã–π,...",0.16198,3,40,"–†—ã—á–∞–≥ —Å–≤–æ–±–æ–¥–Ω–æ–≥–æ —Ö–æ–¥–∞ –ª–µ–±–µ–¥–∫–∏ –Ω–µ –≥–µ—Ä–º–µ—Ç–∏—á–Ω—ã–π, ...","–†—ã—á–∞–≥ —Å–≤–æ–±–æ–¥–Ω–æ–≥–æ —Ö–æ–¥–∞ –ª–µ–±–µ–¥–∫–∏ –Ω–µ –≥–µ—Ä–º–µ—Ç–∏—á–Ω—ã–π, ...","–ö—É–¥–∞ —Å–º–æ—Ç—Ä–∏—Ç–µ???–∑–∞–∫–∞–∑—ã–≤–∞–ª–∏ —Å—Ç–∞–ª—å–Ω–æ–π —Ç—Ä–æ—Å, –∞ –ø—Ä..."
58,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,MOTORin / –†–∞—Å—à–∏—Ä–∏—Ç–µ–ª—å –∫–æ–ª—ë—Å–Ω—ã—Ö –∞—Ä–æ–∫ 40 –º–º,3.666667,https://www.wildberries.ru/catalog/149157758/f...,"[–ù–∞ –Ω–∏–≤—É –Ω–æ—Ä–º–∞–ª—å–Ω–æ –Ω–µ –≤—Å—Ç–∞—ë—Ç, –æ–∫–∞–Ω—Ç–æ–≤–∫–∞ –º–∞–ª–µ–Ω—å...",0.139602,3,46,"–°–∞–º–∏ —Ä–∞—Å—à–∏—Ä–∏—Ç–µ–ª–∏ –∞—Ä–æ–∫ –Ω–æ—Ä–º–∞–ª—å–Ω—ã–µ, –Ω–æ —Å—Ç–∞–≤–∏—Ç—å –Ω...","–°–∞–º–∏ —Ä–∞—Å—à–∏—Ä–∏—Ç–µ–ª–∏ –∞—Ä–æ–∫ –Ω–æ—Ä–º–∞–ª—å–Ω—ã–µ, –Ω–æ —Å—Ç–∞–≤–∏—Ç—å –Ω...","–î–ª—è —É—Å—Ç–∞–Ω–æ–≤–∫–∏ —Ç—Ä–µ–±—É–µ—Ç—Å—è —á–∞—Å—Ç–æ–µ —Å–≤–µ—Ä–ª–µ–Ω–∏–µ –∞—Ä–æ–∫,..."
107,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3500lb 15...,3.2,https://www.wildberries.ru/catalog/162316088/f...,[–ü—Ä–æ—Å—Ç–æ –¥–ª—è –Ω–µ–∫–æ—Ç–æ—Ä—ã—Ö —Ç–æ–≤–∞—Ä–∏—â–µ–π —ç—Ç–æ—Ç –ø—Ä–∏–±–æ—Ä –¥–æ...,0.141455,5,100,"–ù–æ—Ä–º–∞–ª—å–Ω—ã–π –ª–µ–±—ë–¥–∫–∞, –Ω–æ —à—É–º–µ–ª, —Ä–∞–∑–æ–±—Ä–∞–ª—Å—è –ø–æ–ª–æ–∂...","–ù–æ—Ä–º–∞–ª—å–Ω—ã–π –ª–µ–±—ë–¥–∫–∞, –Ω–æ —à—É–º–µ–ª, —Ä–∞–∑–æ–±—Ä–∞–ª—Å—è –ø–æ–ª–æ–∂...","–ù–µ–ø–æ–Ω—è—Ç–Ω–∞ —Ä–∞–±–æ—Ç–∞ —Ç–æ—Ä–º–æ–∑–∞, –Ω–∞ –≤–∏–¥–µ–æ —Å—Ä–∞–±–æ—Ç–∞–ª 2 ..."
108,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3500lb 15...,5.0,https://www.wildberries.ru/catalog/162316088/f...,"[–•–æ—Ä–æ—à–∞—è –ª–µ–±–µ–¥–∫–∞, —Å–æ —Å–≤–æ–µ–π –∑–∞–¥–∞—á–µ–π —Å–ø—Ä–∞–≤–ª—è–µ—Ç—Å—è...",0.114982,3,43,–õ–µ–±—ë–¥–∫—É –∏—Å–ø–æ–ª—å–∑—É—é –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –∏ –æ–ø—É—Å–∫–∞–Ω–∏—è —Å–Ω–µ...,–õ–µ–±—ë–¥–∫—É –∏—Å–ø–æ–ª—å–∑—É—é –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –∏ –æ–ø—É—Å–∫–∞–Ω–∏—è —Å–Ω–µ...,"–•–æ—Ä–æ—à–∞—è –∫–æ–º–ø–∞–∫—Ç–Ω–∞—è –ª–µ–±—ë–¥–∫–∞, –±—Ä–∞–ª –¥–ª—è –∑–∞—Ç–∞—Å–∫–∏–≤–∞..."
131,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 4500lb 20...,3.2,https://www.wildberries.ru/catalog/240093970/f...,[–ü—Ä–æ—Å—Ç–æ –¥–ª—è –Ω–µ–∫–æ—Ç–æ—Ä—ã—Ö —Ç–æ–≤–∞—Ä–∏—â–µ–π —ç—Ç–æ—Ç –ø—Ä–∏–±–æ—Ä –¥–æ...,0.141455,5,100,"–ù–æ—Ä–º–∞–ª—å–Ω—ã–π –ª–µ–±—ë–¥–∫–∞, –Ω–æ —à—É–º–µ–ª, —Ä–∞–∑–æ–±—Ä–∞–ª—Å—è –ø–æ–ª–æ–∂...","–ù–æ—Ä–º–∞–ª—å–Ω—ã–π –ª–µ–±—ë–¥–∫–∞, –Ω–æ —à—É–º–µ–ª, —Ä–∞–∑–æ–±—Ä–∞–ª—Å—è –ø–æ–ª–æ–∂...","–ù–µ–ø–æ–Ω—è—Ç–Ω–∞ —Ä–∞–±–æ—Ç–∞ —Ç–æ—Ä–º–æ–∑–∞, –Ω–∞ –≤–∏–¥–µ–æ —Å—Ä–∞–±–æ—Ç–∞–ª 2 ..."
132,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 4500lb 20...,5.0,https://www.wildberries.ru/catalog/240093970/f...,"[–•–æ—Ä–æ—à–∞—è –ª–µ–±–µ–¥–∫–∞, —Å–æ —Å–≤–æ–µ–π –∑–∞–¥–∞—á–µ–π —Å–ø—Ä–∞–≤–ª—è–µ—Ç—Å—è...",0.114982,3,43,–õ–µ–±—ë–¥–∫—É –∏—Å–ø–æ–ª—å–∑—É—é –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –∏ –æ–ø—É—Å–∫–∞–Ω–∏—è —Å–Ω–µ...,–õ–µ–±—ë–¥–∫—É –∏—Å–ø–æ–ª—å–∑—É—é –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –∏ –æ–ø—É—Å–∫–∞–Ω–∏—è —Å–Ω–µ...,"–•–æ—Ä–æ—à–∞—è –∫–æ–º–ø–∞–∫—Ç–Ω–∞—è –ª–µ–±—ë–¥–∫–∞, –±—Ä–∞–ª –¥–ª—è –∑–∞—Ç–∞—Å–∫–∏–≤–∞..."
160,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,4.428571,https://www.wildberries.ru/catalog/48839109/fe...,"[–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ...",0.093537,7,144,"–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ—Å...","–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ—Å...","–í —Å–Ω–µ–≥–æ–ø–∞–¥—ã —Å–ø–∞—Å–∞—é—Ç, —á—Ç–æ–±—ã –≤—ã–µ—Ö–∞—Ç—å —Å –∑–∞—Å–Ω–µ–∂–µ–Ω–Ω..."
161,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,4.0,https://www.wildberries.ru/catalog/48839109/fe...,"[–ú–æ–∂–Ω–æ –±—ã–ª–æ –æ—Ç–ª–∏–≤ —Å–Ω–∏–∑—É —Å–¥–µ–ª–∞—Ç—å, —á—Ç–æ–±—ã –Ω–µ –≤—ã–ø–∞...",0.127342,11,135,–•–æ—Ç–µ–ª –≤—ã—Ç–∞—â–∏—Ç—å –∫–∞—Ç–µ—Ä –∏–∑ –≤–æ–¥—ã –Ω–∞ –ø–µ—Ä–µ–¥–Ω–µ–º –ø—Ä–∏–≤–æ...,–•–æ—Ç–µ–ª –≤—ã—Ç–∞—â–∏—Ç—å –∫–∞—Ç–µ—Ä –∏–∑ –≤–æ–¥—ã –Ω–∞ –ø–µ—Ä–µ–¥–Ω–µ–º –ø—Ä–∏–≤–æ...,"–ï—â—ë –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª, –Ω–æ —è –¥—É–º–∞—é –Ω–∞ –º–æ—Ä–æ–∑–µ –ø–ª–∞—Å—Ç–∏–∫ ..."
162,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,4.0,https://www.wildberries.ru/catalog/48839109/fe...,"[–ù–µ –≤–æ –≤—Å–µ—Ö —Å–ª—É—á–∞—è—Ö, –∫–æ–Ω–µ—á–Ω–æ, —ç—Ç–∏ –ê–Ω—Ç–∏ –±—É–∫—Å—ã –º...",0.117228,7,131,–ó–∏–º–Ω—è—è —Ä–µ–∑–∏–Ω–∞ –ø–µ—Ä–µ–∂–µ–≤—ã–≤–∞–µ—Ç —ç—Ç–∏ –ø–ª–∞—Å—Ç–∏–∫–æ–≤—ã–µ –∞–Ω—Ç...,–ó–∏–º–Ω—è—è —Ä–µ–∑–∏–Ω–∞ –ø–µ—Ä–µ–∂–µ–≤—ã–≤–∞–µ—Ç —ç—Ç–∏ –ø–ª–∞—Å—Ç–∏–∫–æ–≤—ã–µ –∞–Ω—Ç...,"–ë—ã–ª–æ –±—ã —á—É—Ç—å –ª—É—á—à–µ, –µ—Å–ª–∏ –±—ã —à–∏–ø—ã –Ω–∞ –ø–ª–∞—Å—Ç–∏–Ω–∞—Ö ..."


In [None]:
final_df_filtered

In [None]:
final_df.sort_values("word_count", ascending=False)

Unnamed: 0,category,product,review_rating,url,sentences,mean_distance,elem_count,word_count,key_thought
394,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–ü–ö –õ–ò–ú / –ë—Ä–∞—Å–ª–µ—Ç—ã —Ü–µ–ø–∏ –ø—Ä–æ—Ç–∏–≤–æ—Å–∫–æ–ª—å–∂–µ–Ω–∏—è,3.928571,https://www.wildberries.ru/catalog/143132420/f...,[–£–∂–µ –ø–æ–ª—å–∑–æ–≤–∞–ª—Å—è –Ω–µ—Å–∫–æ–ª—å–∫–æ —Ä–∞–∑))) –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é)...,0.099308,14,315,"–ü—Ä–æ—Ç—è–≥–∏–≤–∞—Ç—å –ª–µ–Ω—Ç—É —Å —Ç–æ–π —Å—Ç–æ—Ä–æ–Ω—ã –∫–æ–ª–µ—Å–∞, –æ–¥—É—Ä–µ–≤..."
464,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–ü–ö –õ–ò–ú / –¶–µ–ø–∏ –ø—Ä–æ—Ç–∏–≤–æ—Å–∫–æ–ª—å–∂–µ–Ω–∏—è –¥–ª—è –ª–µ–≥–∫–æ–≤—ã—Ö –∞...,3.928571,https://www.wildberries.ru/catalog/143132161/f...,[–£–∂–µ –ø–æ–ª—å–∑–æ–≤–∞–ª—Å—è –Ω–µ—Å–∫–æ–ª—å–∫–æ —Ä–∞–∑))) –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é)...,0.099308,14,315,"–ü—Ä–æ—Ç—è–≥–∏–≤–∞—Ç—å –ª–µ–Ω—Ç—É —Å —Ç–æ–π —Å—Ç–æ—Ä–æ–Ω—ã –∫–æ–ª–µ—Å–∞, –æ–¥—É—Ä–µ–≤..."
304,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,3.571429,https://www.wildberries.ru/catalog/62827233/fe...,[–ó–ê–°–¢–†–Ø–õ–ê!–î–£–ú–ê–õ–ê –£–ñ–ï –í–ï–°–ù–´ –ñ–î–ê–¢–¨üôà–ú–ê–®–ò–ù–ê –°–¢–Ø–õ–ê ...,0.110913,7,198,"–•–æ—Ç—å –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª–∏ –ø–∏—à–µ—Ç, —á—Ç–æ –±—É–∫—Å–æ–≤–∞—Ç—å –Ω–µ–ª—å–∑—è..."
647,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,4.777778,https://www.wildberries.ru/catalog/155565431/f...,[–†–µ–∞–ª—å–Ω–æ —à—Ç—É–∫–∞ —Ä–∞–±–æ—Ç–∞–µ—Ç –Ω–∞–º–∞–∑–∞—Ç—å –Ω–∞ —Ä–∂–∞–≤—á–∏–Ω—É —Å...,0.123763,9,179,"–ü–æ–ø—Ä–æ–±–æ–≤–∞–ª –Ω–∞ –º–∞—à–∏–Ω–µ —Ä–∂–∞ –¥–æ –¥—ã—Ä—ã –Ω–∞ –¥–≤–µ—Ä–∫–µ, –ø–æ..."
160,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,4.428571,https://www.wildberries.ru/catalog/48839109/fe...,"[–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ...",0.093537,7,144,"–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ—Å..."
...,...,...,...,...,...,...,...,...,...
646,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,5.000000,https://www.wildberries.ru/catalog/197410221/f...,[.],0.000000,1,1,.
771,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,5.000000,https://www.wildberries.ru/catalog/155565431/f...,[–ù–∞–Ω—ë—Å],0.000000,1,1,–ù–∞–Ω—ë—Å
772,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,1.000000,https://www.wildberries.ru/catalog/155565431/f...,[–ì–æ.],0.000000,1,1,–ì–æ.
773,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,1.000000,https://www.wildberries.ru/catalog/155565431/f...,[–ù–æ.],0.000000,1,1,–ù–æ.


In [None]:
final_df.sort_values("product", ascending=False)

Unnamed: 0,category,product,review_rating,url,sentences,mean_distance,elem_count,word_count,key_thought
775,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–§—Ä–µ–≥–∞—Ç –õ–∏—Ñ—Ç –ü–æ–¥–≤–µ—Å–∫–∞ / –õ–∏—Ñ—Ç –∫–æ–º–ø–ª–µ–∫—Ç —Ä–µ—Å—Å–æ—Ä—ã –ö...,4.000000,https://www.wildberries.ru/catalog/111627554/f...,[–í—Å—Ç–∞–ª–∏ –∫–∞–∫ —Ä–æ–¥–Ω—ã–µ —É—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–ª –Ω–∞ –ø–µ—Ç—Ä–æ –ø–∏–∫–∞–ø...,0.000000,2,19,"–ù–∞ –≤–∏–¥ –Ω–µ –ø–ª–æ—Ö–∏–µ, –Ω–æ –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–∞ —É–∞–∑ –ø—Ä–æ—Ñ–∏ ..."
776,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–§—Ä–µ–≥–∞—Ç –õ–∏—Ñ—Ç –ü–æ–¥–≤–µ—Å–∫–∞ / –õ–∏—Ñ—Ç –∫–æ–º–ø–ª–µ–∫—Ç —Ä–µ—Å—Å–æ—Ä—ã –ö...,5.000000,https://www.wildberries.ru/catalog/111627554/f...,"[–ü—Ä–∏—à–ª–∏ –∫–∞—á–µ—Å—Ç–≤–æ üî•–ø–æ–∫–∞ –µ—â—ë –Ω–µ —Å—Ç–∞–≤–∏–ª, –í—Å–µ –≤—Å—Ç–∞...",0.000000,2,19,–ü—Ä–∏—à–ª–∏ –∫–∞—á–µ—Å—Ç–≤–æ üî•–ø–æ–∫–∞ –µ—â—ë –Ω–µ —Å—Ç–∞–≤–∏–ª
647,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,4.777778,https://www.wildberries.ru/catalog/155565431/f...,[–†–µ–∞–ª—å–Ω–æ —à—Ç—É–∫–∞ —Ä–∞–±–æ—Ç–∞–µ—Ç –Ω–∞–º–∞–∑–∞—Ç—å –Ω–∞ —Ä–∂–∞–≤—á–∏–Ω—É —Å...,0.123763,9,179,"–ü–æ–ø—Ä–æ–±–æ–≤–∞–ª –Ω–∞ –º–∞—à–∏–Ω–µ —Ä–∂–∞ –¥–æ –¥—ã—Ä—ã –Ω–∞ –¥–≤–µ—Ä–∫–µ, –ø–æ..."
648,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,4.333333,https://www.wildberries.ru/catalog/155565431/f...,"[–°–∞–º —Ä–∞—Å—Ç–≤–æ—Ä –º–Ω–µ –ø–æ–Ω—Ä–∞–≤–∏–ª—Å—è, –≤—ã—Å—Ç—É–ø–∞–ª–∞ —Ä–∂–∞–≤—á–∏–Ω...",0.119070,6,110,"–î–∞ –∏ –Ω–∞ –∂–∞—Ä–µ —Å—Ç–æ—è–ª–∞ –º–∞—à–∏–Ω–∞, –≥–¥–µ –±—ã–ª–∏ –ø—è—Ç–Ω–∞ —Ä–∂–∞..."
649,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,5.000000,https://www.wildberries.ru/catalog/155565431/f...,"[–ì–µ–ª—å —Å—É–ø–µ—Ä, –æ—á–µ–Ω—å –ø–æ—Ä–∞–¥–æ–≤–∞–ª, –Ω–µ —Å—Ç–µ–∫–∞–µ—Ç, –º–æ–∂–Ω...",0.137930,4,74,–ü—Ä–µ—à—ë–ª –ø–æ –≥—Ä–∞—Ñ–∏–∫—É —Ä–∞–±–æ—Ç–∞–µ—Ç —Å—É–ø–µ—Ä –¥–æ —ç—Ç–æ–≥–æ –ø—Ä–æ–±...
...,...,...,...,...,...,...,...,...,...
53,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.000000,https://www.wildberries.ru/catalog/124191551/f...,[–ù–µ –ø—Ä–æ–≤–µ—Ä—è–ª],0.000000,1,2,–ù–µ –ø—Ä–æ–≤–µ—Ä—è–ª
54,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.000000,https://www.wildberries.ru/catalog/124191551/f...,[–£—Å—Ç–∞–Ω–æ–≤–ª—é],0.000000,1,1,–£—Å—Ç–∞–Ω–æ–≤–ª—é
55,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Hangkai / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –≤–ª–∞–≥–æ–∑–∞—â–∏—Ç–Ω–∞—è ...,5.000000,https://www.wildberries.ru/catalog/124191551/f...,[be/X6alkqqmbiQ?si=_dV1lFSghhipjst6],0.000000,1,1,be/X6alkqqmbiQ?si=_dV1lFSghhipjst6
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,Autobrand_AED / –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è led —Ñ–∞—Ä–∞ 30w —Å ...,2.000000,https://www.wildberries.ru/catalog/90401367/fe...,"[–ù–æ —á–µ—Ä–µ–∑ –º–µ—Å—è—Ü —Å–≥–æ—Ä–µ–ª –æ–¥–∏–Ω –¥–∏–æ–¥, –≤—Ç–æ—Ä–æ–π –µ—â—ë —á...",0.000000,1,13,"–ù–æ —á–µ—Ä–µ–∑ –º–µ—Å—è—Ü —Å–≥–æ—Ä–µ–ª –æ–¥–∏–Ω –¥–∏–æ–¥, –≤—Ç–æ—Ä–æ–π –µ—â—ë —á–µ..."


In [None]:
# –£–¥–∞–ª–µ–Ω–∏–µ –∑–∞–ø–∏—Å–µ–π —Å word_count <= 10 –∏ –∫–ª—é—á–µ–≤–æ–π –º—ã—Å–ª—å—é –º–µ–Ω–µ–µ 3 —Å–∏–º–≤–æ–ª–æ–≤
final_result = final_result[((final_result['word_count'] > 10) & (final_result['key_thought'].str.len() > 5))]
final_result

NameError: name 'final_result' is not defined

In [None]:
final_result.to_csv("./reviews_keywords/feedbackfueltest.csv")