In [4]:
import os
import pandas as pd
from tqdm import tqdm
import torch
import pyarrow.parquet as pq
import dask.dataframe as dd
import spacy
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast, BertForSequenceClassification, BertConfig
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter
from torch.utils.data import DataLoader, Dataset as TorchDataset
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import hdbscan
from scipy.spatial.distance import pdist, squareform
import logging
import re
from joblib import Parallel, delayed


class ReviewsKeywords:
    def __init__(self, csv_path, model_path, spacy_model="ru_core_news_lg"):
        self.csv_path = csv_path
        self.model_path = model_path

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device == "cuda":
            import cudf.pandas  # –ò–º–ø–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ cuDF –∏ –∞–∫—Ç–∏–≤–∞—Ü–∏—è –µ–≥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
            cudf.pandas.install()
        os.environ["TOKENIZERS_PARALLELISM"] = "true"  # –í–∫–ª—é—á–∞–µ–º –ø–∞—Ä–∞–ª–ª–µ–ª–∏–∑–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
        self.tokenizer_my = BertTokenizerFast.from_pretrained(self.model_path)
         # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏
        self.classification_model = BertForSequenceClassification.from_pretrained(self.model_path).to(self.device)
        # –ó–∞–≥—Ä—É–∑–∫–∞ –±–∞–∑–æ–≤–æ–π –º–æ–¥–µ–ª–∏ –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤
        self.embedding_model = AutoModel.from_pretrained(self.model_path).to(self.device)
        
        # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞ –æ—Ç –°–±–µ—Ä–±–∞–Ω–∫–∞
        self.tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/sbert_large_nlu_ru')
        self.embedding_model = AutoModel.from_pretrained('sberbank-ai/sbert_large_nlu_ru').to(self.device)
        
        spacy.prefer_gpu()
        self.nlp = spacy.load(spacy_model, disable=["ner", "tagger", "attribute_ruler", "lemmatizer"])
        
        self.df = pd.read_csv(self.csv_path, nrows=1000)

    @staticmethod
    def clean_text(text):
        text = re.sub(r'[\n\r\t]+|\s{2,}', ' ', text)
        text = re.sub(r'(?<!\.)\s*\.\s*|\s*\.\s*(?!\.)', '. ', text)
        return text.strip().rstrip('.')

    def split_reviews_into_sentences(self, batch):
        cleaned_texts = [self.clean_text(text) for text in batch['corrected_text']]
        docs = list(self.nlp.pipe(cleaned_texts, batch_size=64))
        batch['sentences'] = [[sent.text for sent in doc.sents] for doc in docs]
        return batch

    def process_reviews(self):
        dataset = Dataset.from_pandas(self.df)
        dataset = dataset.map(self.split_reviews_into_sentences, batched=True, batch_size=32)
        self.df = dataset.to_pandas()
        df_exploded = self.df.explode('sentences').reset_index(drop=True)
        df_exploded = df_exploded.drop(columns=[col for col in df_exploded.columns if col.startswith('__index_level_')])
        return Dataset.from_pandas(df_exploded)

    def compute_sentence_embeddings(self, sentences):
        sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
        if not sentences:
            raise ValueError("Input contains no valid strings.")
        inputs = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    def compute_embeddings_after_explode(self, batch):
        sentences = batch['sentences']
        valid_sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
        if not valid_sentences:
            batch['sentence_embeddings'] = [[]] * len(sentences)
            return batch
        embeddings = self.compute_sentence_embeddings(valid_sentences)
        embeddings = embeddings.astype(np.float32)
        final_embeddings = []
        embed_idx = 0
        for sentence in sentences:
            if isinstance(sentence, str):
                final_embeddings.append(embeddings[embed_idx])
                embed_idx += 1
            else:
                final_embeddings.append(np.zeros(embeddings.shape[1], dtype=np.float32))
        batch['sentence_embeddings'] = final_embeddings
        return batch

    def apply_embeddings(self, dataset_exploded):
        return dataset_exploded.map(self.compute_embeddings_after_explode, batched=True, batch_size=128)

    def extract_key_thought(self, cluster_sentences):
        sentences = cluster_sentences.split(" | ")
        embeddings = self.compute_sentence_embeddings(sentences)
        centroid = np.mean(embeddings, axis=0)
        similarities = cosine_similarity(embeddings, [centroid])
        key_sentence_index = np.argmax(similarities)
        return sentences[key_sentence_index]

    def count_words(self, cluster_sentences):
        words = cluster_sentences.split()
        return len(words)

    def recluster_large_cluster(self, cluster_sentences, eps=0.1, min_samples=2):
        sentences = cluster_sentences.split(" | ")
        embeddings = self.compute_sentence_embeddings(sentences)
        re_clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(embeddings)
        re_cluster_dict = {}
        for idx, label in enumerate(re_clustering.labels_):
            if label == -1:
                continue
            label_str = str(label)
            if label_str not in re_cluster_dict:
                re_cluster_dict[label_str] = []
            re_cluster_dict[label_str].append(sentences[idx])
        return [" | ".join(cluster) for cluster in re_cluster_dict.values()]

    def recursive_clustering(self, cluster_sentences, threshold, eps=0.22, min_samples=3, min_eps=0.02):
        current_eps = eps
        current_min_samples = min_samples
        new_clusters = [cluster_sentences]
        while True:
            next_clusters = []
            reclustered_any = False
            for cluster in new_clusters:
                if self.count_words(cluster) > threshold:
                    while current_eps >= min_eps:
                        reclustered = self.recluster_large_cluster(cluster, eps=current_eps, min_samples=current_min_samples)
                        if len(reclustered) > 1:
                            next_clusters.extend(reclustered)
                            reclustered_any = True
                            break
                        else:
                            if current_eps > min_eps:
                                current_eps -= 0.05
                    if len(reclustered) == 1:
                        next_clusters.append(cluster)
                else:
                    next_clusters.append(cluster)
            new_clusters = next_clusters
            if not reclustered_any:
                break
        return new_clusters

    def generate_predictions(self, dataset_exploded):
        tokenizer = self.tokenizer_my
        model = self.classification_model
        if self.device == torch.device("cuda"):
            model = model.half()

        reviews = dataset_exploded["sentences"]
        reviews = [str(review) for review in reviews if isinstance(review, str) and review.strip()]

        class ReviewDataset(TorchDataset):
            def __init__(self, reviews, tokenizer, max_len=128):
                self.reviews = reviews
                self.tokenizer = tokenizer
                self.max_len = max_len

            def __len__(self):
                return len(self.reviews)

            def __getitem__(self, idx):
                review = self.reviews[idx]
                encoding = self.tokenizer.encode_plus(
                    review,
                    add_special_tokens=True,
                    max_length=self.max_len,
                    return_token_type_ids=False,
                    padding='max_length',
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt'
                )
                return {key: val.flatten() for key, val in encoding.items()}

        dataset = ReviewDataset(reviews, tokenizer)
        batch_size = 32
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

        predictions = []

        from torch.cuda.amp import autocast

        for batch in tqdm(dataloader, desc="–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤"):
            batch = {key: val.to(self.device) for key, val in batch.items()}
            
            with torch.no_grad():
                with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Å–º–µ—à–∞–Ω–Ω—É—é —Ç–æ—á–Ω–æ—Å—Ç—å
                    outputs = model(**batch)
                    logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
                    probabilities = torch.softmax(logits, dim=-1)
                    batch_predictions = (probabilities[:, 1] > 0.7).cpu().numpy()  # –ò—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ—Ä–æ–≥ 0.7
                    predictions.extend(batch_predictions)

        if len(predictions) != len(dataset_exploded):
            print(f"Warning: Length of predictions ({len(predictions)}) does not match length of index ({len(dataset_exploded)})")
            if len(predictions) < len(dataset_exploded):
                missing_count = len(dataset_exploded) - len(predictions)
                predictions.extend([0] * missing_count)
            elif len(predictions) > len(dataset_exploded):
                predictions = predictions[:len(dataset_exploded)]
        dataset_exploded = dataset_exploded.add_column("predictions", predictions)
        return dataset_exploded

    def process_group(self, category_name, product_name, group):
        all_sentences = group['sentences'].tolist()
        if not all_sentences:
            return pd.DataFrame()

        try:
            all_embeddings = self.compute_sentence_embeddings(all_sentences)
        except ValueError as e:
            print(f"Error in computing embeddings for product {product_name}: {e}")
            return pd.DataFrame()

        distance_matrix = squareform(pdist(all_embeddings, metric='cosine'))
        clustering = hdbscan.HDBSCAN(min_samples=3, metric='precomputed').fit(distance_matrix)

        cluster_dict = {}
        for idx, label in enumerate(clustering.labels_):
            if label == -1:
                continue
            label_str = str(label)
            if label_str not in cluster_dict:
                cluster_dict[label_str] = set()
            cluster_dict[label_str].add(all_sentences[idx])

        clusters = [" | ".join(sentences) for sentences in cluster_dict.values()]

        if not clusters:
            return pd.DataFrame()

        group['binary_rating'] = group['review_rating'].apply(lambda x: 1 if x in [4, 5] else 0)
        avg_rating = group['binary_rating'].mean()
        rating_category = 'positive' if avg_rating > 0.7 else 'neutral'
        rating_category = 'neutral' if avg_rating > 0.5 else 'negative'

        threshold = self.determine_threshold(clusters)

        final_clusters = []
        for cluster in clusters:
            if self.count_words(cluster) > threshold:
                final_clusters.extend(self.recursive_clustering(cluster, threshold))
            else:
                final_clusters.append(cluster)

        # –û–±–µ—Å–ø–µ—á–µ–Ω–∏–µ –º–∏–Ω–∏–º–∞–ª—å–Ω–æ–≥–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤
        final_clusters = self.ensure_minimum_clusters(final_clusters, threshold)

        df_exploded_sorted = pd.DataFrame({
            'category': category_name,
            'product': product_name,
            'avg_rating': avg_rating,
            'rating_category': rating_category,
            'cluster_sentences': final_clusters
        })
        df_exploded_sorted['word_count'] = df_exploded_sorted['cluster_sentences'].apply(self.count_words)
        df_exploded_sorted['key_thought'] = df_exploded_sorted['cluster_sentences'].apply(self.extract_key_thought)
        df_exploded_sorted = df_exploded_sorted.sort_values(by='word_count', ascending=False)

        return df_exploded_sorted

    def determine_threshold(self, clusters):
        if len(clusters) == 1:
            cluster_word_count = self.count_words(clusters[0])
            if cluster_word_count > 20:
                return cluster_word_count / 2
            return cluster_word_count
        return np.min([np.mean([self.count_words(cluster) for cluster in clusters]) * 1.5, 250])

    def ensure_minimum_clusters(self, final_clusters, threshold):
        while len(final_clusters) < 3 and any(self.count_words(cluster) > threshold for cluster in final_clusters):
            largest_cluster = max(final_clusters, key=self.count_words)
            final_clusters.remove(largest_cluster)
            new_clusters = self.recursive_clustering(largest_cluster, threshold)
            if len(new_clusters) <= 1:
                final_clusters.append(largest_cluster)
                break
            final_clusters.extend(new_clusters)
        return final_clusters
    
    def cluster_reviews(self, dataset_exploded):
        # –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π
        dataset_filtered = dataset_exploded.filter(lambda x: x['predictions'] == 1)
        
        # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ pandas DataFrame –¥–ª—è –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–∏
        df_filtered = dataset_filtered.to_pandas()
        grouped = df_filtered.groupby(['category', 'product'])

        results = []
        
        # –ü–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ –±–µ–∑ –ø–∞—Ä–∞–ª–ª–µ–ª–∏–∑–º–∞
        for (category_name, product_name), group in tqdm(grouped, desc="Processing categories and products"):
            result_df = self.process_group(category_name, product_name, group)
            if not result_df.empty:
                results.append(result_df)

        if results:  # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ —Å–ø–∏—Å–æ–∫ results –Ω–µ –ø—É—Å—Ç
            final_result = pd.concat(results, ignore_index=True)
            final_result = final_result[((final_result['word_count'] > 10) & (final_result['key_thought'].str.len() > 5))]
            final_result.to_csv("./reviews_keywords/feedbackfueltest.csv")
        else:
            print("No valid results to concatenate. Returning an empty DataFrame.")
            final_result = pd.DataFrame()  # –í–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—É—Å—Ç–æ–π DataFrame, –µ—Å–ª–∏ –Ω–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –æ–±—ä–µ–¥–∏–Ω–µ–Ω–∏—è
        
        return final_result

    def run(self):
        dataset_exploded = self.process_reviews()
        dataset_exploded = self.apply_embeddings(dataset_exploded)
        dataset_exploded = self.generate_predictions(dataset_exploded)
        result = self.cluster_reviews(dataset_exploded)
        return result


reviews_keywords = ReviewsKeywords(csv_path="./reviews_keywords/wildberries_reviews.csv",
                                    model_path='./reviews_keywords/fine_tuned_model')
final_result = reviews_keywords.run()
final_result.head()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2061 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Å–º–µ—à–∞–Ω–Ω—É—é —Ç–æ—á–Ω–æ—Å—Ç—å
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 65/65 [00:12<00:00,  5.09it/s]


Filter:   0%|          | 0/2061 [00:00<?, ? examples/s]

Processing categories and products: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:18<00:00,  1.15s/it]


Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,word_count,key_thought
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–ü–µ—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–µ—Å–æ –∑–∞–∫—Ä—ã–ª–æ—Å—å –≤ —Å–Ω–µ–≥—É, –ø–æ–¥–ª–æ–∂–∏–ª–∏ –ø...",40,"–ü–µ—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–µ—Å–æ –∑–∞–∫—Ä—ã–ª–æ—Å—å –≤ —Å–Ω–µ–≥—É, –ø–æ–¥–ª–æ–∂–∏–ª–∏ –ø..."
1,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ. | –í—Ä–æ–¥–µ –ø—Ä–æ—á–Ω—ã–µ. | –ù–∞ –≤–∏–¥ –ø—Ä–æ—á...,12,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ.
3,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª | –í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞....,37,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª
4,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,"–†–µ–∫–æ–º–µ–Ω–¥—É—é, –±—É–¥—É –±—Ä–∞—Ç—å –µ—â–µ | –ó–∞–∫–∞–∂—É | –º—ã–ú—ã–æ—á–Ω–æ...",20,–ë—É–¥—É –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë.
5,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–ï–ª–µ –ø–∞—Ö–Ω–µ—Ç. | –û–Ω –¥–∞–∂–µ –Ω–µ –ø–∞—Ö–Ω–µ—Ç. | –ü–∞—Ö–Ω–µ—Ç –∫–∞–∫–∏...,13,–ï–ª–µ –ø–∞—Ö–Ω–µ—Ç.


In [5]:
final_result

Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,word_count,key_thought
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–ü–µ—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–µ—Å–æ –∑–∞–∫—Ä—ã–ª–æ—Å—å –≤ —Å–Ω–µ–≥—É, –ø–æ–¥–ª–æ–∂–∏–ª–∏ –ø...",40,"–ü–µ—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–µ—Å–æ –∑–∞–∫—Ä—ã–ª–æ—Å—å –≤ —Å–Ω–µ–≥—É, –ø–æ–¥–ª–æ–∂–∏–ª–∏ –ø..."
1,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ. | –í—Ä–æ–¥–µ –ø—Ä–æ—á–Ω—ã–µ. | –ù–∞ –≤–∏–¥ –ø—Ä–æ—á...,12,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ.
3,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª | –í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞....,37,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª
4,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,"–†–µ–∫–æ–º–µ–Ω–¥—É—é, –±—É–¥—É –±—Ä–∞—Ç—å –µ—â–µ | –ó–∞–∫–∞–∂—É | –º—ã–ú—ã–æ—á–Ω–æ...",20,–ë—É–¥—É –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë.
5,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–ï–ª–µ –ø–∞—Ö–Ω–µ—Ç. | –û–Ω –¥–∞–∂–µ –Ω–µ –ø–∞—Ö–Ω–µ—Ç. | –ü–∞—Ö–Ω–µ—Ç –∫–∞–∫–∏...,13,–ï–ª–µ –ø–∞—Ö–Ω–µ—Ç.
6,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,üî•üî•üî•üî•üî•üî• –∑–∞–ø–∞—Ö. | –ó–∞–ø–∞—Ö –æ–≥–æ–Ω—å) | –ó–∞–ø–∞—Ö –æ–≥–æ–Ω—å!!!!...,11,–ó–∞–ø–∞—Ö üî•!!!
7,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,–£–±–∏—Ä–∞–µ—Ç —Ä–∂–∞–≤—á–∏–Ω—É —Ö–æ—Ä–æ—à–æ —á–µ—Ä–µ–∑ 10-20 –º–∏–Ω—É—Ç | –°–æ...,76,–†–∂–∞–≤—á–∏–Ω—É —É–±–∏—Ä–∞–µ—Ç –æ—Ç–ª–∏—á–Ω–æ.
8,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,"–†–∂–∞–≤—á–∏–Ω–∞ —É–∂–µ —Ö–æ—Ä–æ—à–æ –≤—ä–µ–ª–∞—Å—å, –ø—Ä–∏—à–ª–æ—Å—å –Ω–µ—Å–∫–æ–ª—å–∫...",38,"–†–∂–∞–≤—á–∏–Ω–∞ —É–∂–µ —Ö–æ—Ä–æ—à–æ –≤—ä–µ–ª–∞—Å—å, –ø—Ä–∏—à–ª–æ—Å—å –Ω–µ—Å–∫–æ–ª—å–∫..."
9,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,"–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª...",24,"–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª–µ¬ª"
10,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,–í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞. | –í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª...,14,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª


In [4]:
final_result

Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,word_count,key_thought
0,/–°–ø–æ—Ä—Ç/–°—Ç—Ä–∞–π–∫–±–æ–ª –∏ –ø–µ–π–Ω—Ç–±–æ–ª/–ê–∫—Å–µ—Å—Å—É–∞—Ä—ã,karbi / –†—é–∫–∑–∞–∫ —Ç–∞–∫—Ç–∏—á–µ—Å–∫–∏–π —Ç—É—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∏–π - –∫–∞—Ä...,0.797101,neutral,"–ú–Ω–æ–≥–æ –¥–æ–ø –∫–∞—Ä–º–∞–Ω–æ–≤, —á–µ—Ö–æ–ª –æ—Ç –¥–æ–∂–¥—è, –ø—Ä–æ—Ä–µ–∑–∏–Ω–µ–Ω...",203,"–†—é–∫–∑–∞–∫ –≤–º–µ—Å—Ç–∏—Ç–µ–ª—å–Ω—ã–π, –ø—Ä–æ—á–Ω—ã–π, –µ—Å—Ç—å –∑–∞—â–∏—Ç–Ω—ã–π —á..."
1,/–°–ø–æ—Ä—Ç/–°—Ç—Ä–∞–π–∫–±–æ–ª –∏ –ø–µ–π–Ω—Ç–±–æ–ª/–ê–∫—Å–µ—Å—Å—É–∞—Ä—ã,karbi / –†—é–∫–∑–∞–∫ —Ç–∞–∫—Ç–∏—á–µ—Å–∫–∏–π —Ç—É—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∏–π - –∫–∞—Ä...,0.797101,neutral,"–í –ø–æ–¥–∞—Ä–æ–∫ —à—ë–ª –∫–æ–º–ø–∞—Å,, –Ω–∞–ª–æ–±–Ω—ã–π—Ñ–æ–Ω–∞—Ä—å,, –Ω–æ–∂–∞–Ω–µ...",69,"–í –ø–æ–¥–∞—Ä–æ–∫ –ø–æ–ª–æ–∂–∏–ª–∏ —Ñ–æ–Ω–∞—Ä–∏–∫ –Ω–∞–ª–æ–±–Ω—ã–π, –∫–æ–º–ø–∞—Å –∏ ..."


## –≠—Ç–∞–ø 1

In [1]:
import cudf.pandas  # –ò–º–ø–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ cuDF –∏ –∞–∫—Ç–∏–≤–∞—Ü–∏—è –µ–≥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
cudf.pandas.install()  # –£—Å—Ç–∞–Ω–æ–≤–∫–∞ cuDF –∫–∞–∫ –æ—Å–Ω–æ–≤–Ω–æ–≥–æ –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–∞ –¥–ª—è pandas
import os
import pandas as pd
from tqdm import tqdm
import torch
import pyarrow.parquet as pq
import dask.dataframe as dd

# # –ß—Ç–µ–Ω–∏–µ Parquet-—Ñ–∞–π–ª–∞ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º pyarrow
# table = pq.read_table('./reviews_keywords/wildberries_reviews_corrected.parquet')

# # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ pandas DataFrame
# df_pandas = table.to_pandas()

# # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ pandas DataFrame –≤ Dask DataFrame
# df_dask = dd.from_pandas(df_pandas, npartitions=100)  # –£–∫–∞–∂–∏—Ç–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –Ω—É–∂–Ω—ã—Ö –≤–∞–º —á–∞—Å—Ç–µ–π
# df_pandas = None
# table = None
# import gc
# gc.collect()
# df_dask

In [2]:
result = pd.read_csv("./reviews_keywords/wildberries_reviews.csv", nrows=1000)
result.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Unnamed: 0        1000 non-null   int64
 1   review_full_text  1000 non-null   object
 2   review_rating     1000 non-null   int64
 3   product           1000 non-null   object
 4   category          1000 non-null   object
 5   url               1000 non-null   object
 6   corrected_text    1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 540.9+ KB


In [3]:
# –û—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ –ø–æ 5 –∑–∞–ø–∏—Å–µ–π –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —É–Ω–∏–∫–∞–ª—å–Ω–æ–≥–æ –∑–Ω–∞—á–µ–Ω–∏—è –≤ —Å—Ç–æ–ª–±—Ü–µ 'product'
# result_limited = result.groupby('product').apply(lambda x: x.iloc[5:8]).reset_index(drop=True)
result_limited = result


In [4]:
import spacy
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast
import torch
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter

# –ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–æ—Å—Ç—É–ø–Ω–æ—Å—Ç–∏ GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
import hdbscan
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–æ–æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
# –ó–∞–≥—Ä—É–∂–∞–µ–º –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é –º–æ–¥–µ–ª–∏


# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞ –æ—Ç –°–±–µ—Ä–±–∞–Ω–∫–∞
tokenizer = BertTokenizerFast.from_pretrained("./reviews_keywords/fine_tuned_model")
model = AutoModel.from_pretrained("./reviews_keywords/fine_tuned_model").to(device)
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å —Å –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–µ–π
config = BertConfig.from_pretrained('./reviews_keywords/fine_tuned_model', output_hidden_states=True)
model_classification = BertForSequenceClassification.from_pretrained('./reviews_keywords/fine_tuned_model', config=config).to(device)

        # self.tokenizer_my = BertTokenizerFast.from_pretrained(self.model_path)
        #  # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏
        # self.classification_model = BertForSequenceClassification.from_pretrained(self.model_path).to(self.device)
        # # –ó–∞–≥—Ä—É–∑–∫–∞ –±–∞–∑–æ–≤–æ–π –º–æ–¥–µ–ª–∏ –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤
        # self.embedding_model = AutoModel.from_pretrained(self.model_path).to(self.device)

spacy.prefer_gpu()
# –ó–∞–≥—Ä—É–∑–∫–∞ –∏ –Ω–∞—Å—Ç—Ä–æ–π–∫–∞ –º–æ–¥–µ–ª–∏ SpaCy
nlp = spacy.load("ru_core_news_lg", disable=["ner", "tagger", "attribute_ruler", "lemmatizer"])

df = result_limited

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ pandas DataFrame –≤ Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [5]:
import re

def clean_text(text):
    text = re.sub(r'[\n\r\t]+|\s{2,}', ' ', text)  # –û–±—ä–µ–¥–∏–Ω—è–µ–º —à–∞–≥–∏ –¥–ª—è –∑–∞–º–µ–Ω—ã –ø—Ä–æ–±–µ–ª–æ–≤
    text = re.sub(r'(?<!\.)\s*\.\s*|\s*\.\s*(?!\.)', '. ', text)  # –û–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è –∑–∞–º–µ–Ω—ã —Ç–æ—á–∫–∏
    return text.strip().rstrip('.')

def split_reviews_into_sentences(batch):
    # –û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
    cleaned_texts = [clean_text(text) for text in batch['corrected_text']]
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤ —Å –ø–æ–º–æ—â—å—é nlp.pipe —Å —É–∫–∞–∑–∞–Ω–∏–µ–º batch_size
    docs = list(nlp.pipe(cleaned_texts, batch_size=64))  # –ó–¥–µ—Å—å 64 - –ø—Ä–∏–º–µ—Ä –∑–Ω–∞—á–µ–Ω–∏—è

    # –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π
    batch['sentences'] = [[sent.text for sent in doc.sents] for doc in docs]
    
    return batch

dataset = dataset.map(split_reviews_into_sentences, batched=True, batch_size=32)

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º Dataset –æ–±—Ä–∞—Ç–Ω–æ –≤ pandas DataFrame
df = dataset.to_pandas()

# –í—ã–ø–æ–ª–Ω–∏–º explode –ø–æ —Å—Ç–æ–ª–±—Ü—É —Å –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è–º–∏
df_exploded = df.explode('sentences').reset_index(drop=True)

# –£–¥–∞–ª—è–µ–º –ª–∏—à–Ω–∏–µ —Å—Ç–æ–ª–±—Ü—ã, –∫–æ—Ç–æ—Ä—ã–µ –ø–æ—è–≤–∏–ª–∏—Å—å –ø–æ—Å–ª–µ explode
df_exploded = df_exploded.drop(columns=[col for col in df_exploded.columns if col.startswith('__index_level_')])

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º DataFrame –æ–±—Ä–∞—Ç–Ω–æ –≤ Hugging Face Dataset
dataset_exploded = Dataset.from_pandas(df_exploded)

from torch.cuda.amp import autocast

def compute_sentence_embeddings(sentences):
    # –§–∏–ª—å—Ç—Ä—É–µ–º —Å–ø–∏—Å–æ–∫, –æ—Å—Ç–∞–≤–ª—è—è —Ç–æ–ª—å–∫–æ —Å—Ç—Ä–æ–∫–∏
    sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
    
    if not sentences:
        raise ValueError("Input contains no valid strings.")

    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device)
    
    with torch.no_grad():
        with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º mixed precision –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
            outputs = model_classification(**inputs)
    
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

def compute_embeddings_after_explode(batch):
    sentences = batch['sentences']

    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –≤—Å–µ —ç–ª–µ–º–µ–Ω—Ç—ã –≤ batch —è–≤–ª—è—é—Ç—Å—è —Å—Ç—Ä–æ–∫–∞–º–∏
    valid_sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
    
    if not valid_sentences:
        batch['sentence_embeddings'] = [[]] * len(sentences)  # –ï—Å–ª–∏ –Ω–µ—Ç –≤–∞–ª–∏–¥–Ω—ã—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π, –≤–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—É—Å—Ç—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
        return batch

    embeddings = compute_sentence_embeddings(valid_sentences)

    # –ü—Ä–∏–≤–µ–¥–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –∫ —Ç–∏–ø—É float32 –¥–ª—è –∫–æ–Ω—Å–∏—Å—Ç–µ–Ω—Ç–Ω–æ—Å—Ç–∏
    embeddings = embeddings.astype(np.float32)

    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π
    if len(embeddings) != len(valid_sentences):
        raise ValueError("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π.")
    
    # –ï—Å–ª–∏ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏ –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –∏—Å—Ö–æ–¥–Ω—ã–º, –∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä—É–µ–º –≤—ã—Ö–æ–¥–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ
    final_embeddings = []
    embed_idx = 0
    for sentence in sentences:
        if isinstance(sentence, str):
            final_embeddings.append(embeddings[embed_idx])
            embed_idx += 1
        else:
            final_embeddings.append(np.zeros(embeddings.shape[1], dtype=np.float32))  # –î–æ–±–∞–≤–ª—è–µ–º –Ω—É–ª–µ–≤—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –¥–ª—è –Ω–µ–≤–∞–ª–∏–¥–Ω—ã—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π

    batch['sentence_embeddings'] = final_embeddings
    return batch

# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —Ñ—É–Ω–∫—Ü–∏–∏
dataset = dataset_exploded.map(compute_embeddings_after_explode, batched=True, batch_size=128)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2061 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm import tqdm
import os
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
import nltk
from nltk.corpus import stopwords
import spacy
from tqdm import tqdm
import logging
import hdbscan  # HDBSCAN –¥–ª—è –±–æ–ª–µ–µ —Å—Ç–∞–±–∏–ª—å–Ω–æ–π –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π –∫–∞—Å—Ç–æ–º–Ω—ã—Ö –º–µ—Ç—Ä–∏–∫
from scipy.spatial.distance import pdist, squareform

In [7]:
# –û–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–∞ (GPU –∏–ª–∏ CPU)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# –ü–µ—Ä–µ–≤–æ–¥ –º–æ–¥–µ–ª–∏ –≤ —Ä–µ–∂–∏–º FP16, –µ—Å–ª–∏ —ç—Ç–æ –≤–æ–∑–º–æ–∂–Ω–æ
if use_cuda:
    model_classification = model_classification.half()

# –ü—Ä–∏–º–µ—Ä –¥–∞–Ω–Ω—ã—Ö (–∑–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ —Ä–µ–∞–ª—å–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ)
reviews = dataset_exploded["sentences"]

# –û—á–∏—Å—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –æ—Ç –Ω–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π
reviews = [str(review) for review in reviews if isinstance(review, str) and review.strip()]

# –°–æ–∑–¥–∞–Ω–∏–µ –∫–∞—Å—Ç–æ–º–Ω–æ–≥–æ Dataset –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ –æ—Ç–∑—ã–≤–æ–≤
class ReviewDataset(Dataset):
    def __init__(self, reviews, tokenizer, max_len=128):
        self.reviews = reviews
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {key: val.flatten() for key, val in encoding.items()}

# –°–æ–∑–¥–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç –∏ DataLoader
dataset = ReviewDataset(reviews, tokenizer)
batch_size = 32  # –†–∞–∑–º–µ—Ä –±–∞—Ç—á–∞ –º–æ–∂–Ω–æ –∏–∑–º–µ–Ω–∏—Ç—å –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç –æ–±—ä–µ–º–∞ –¥–æ—Å—Ç—É–ø–Ω–æ–π –ø–∞–º—è—Ç–∏ GPU
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# –ü–æ–ª—É—á–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π —Å –æ—Ç–æ–±—Ä–∞–∂–µ–Ω–∏–µ–º –ø—Ä–æ–≥—Ä–µ—Å—Å–∞
predictions = []

from torch.cuda.amp import autocast  # –ò–º–ø–æ—Ä—Ç–∏—Ä—É–µ–º autocast –¥–ª—è —Å–º–µ—à–∞–Ω–Ω–æ–π —Ç–æ—á–Ω–æ—Å—Ç–∏

for batch in tqdm(dataloader, desc="–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤"):
    batch = {key: val.to(device) for key, val in batch.items()}
    
    with torch.no_grad():
        with autocast():  # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Å–º–µ—à–∞–Ω–Ω—É—é —Ç–æ—á–Ω–æ—Å—Ç—å
            outputs = model_classification(**batch)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            batch_predictions = (probabilities[:, 1] > 0.7).cpu().numpy()  # –ò—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ—Ä–æ–≥ 0.7
            predictions.extend(batch_predictions)

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ DataFrame, –µ—Å–ª–∏ —ç—Ç–æ –µ—â–µ –Ω–µ —Å–¥–µ–ª–∞–Ω–æ
if not isinstance(dataset_exploded, pd.DataFrame):
    dataset_exploded = pd.DataFrame(dataset_exploded)

# –ü—Ä–æ–≤–µ—Ä–∫–∞ –∏ –æ–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–µ—Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏—è –¥–ª–∏–Ω—ã
if len(predictions) != len(dataset_exploded):
    print(f"Warning: Length of predictions ({len(predictions)}) does not match length of index ({len(dataset_exploded)})")
    
    # –ü—Ä–∏–º–µ—Ä: –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ –Ω–µ–¥–æ—Å—Ç–∞—é—â–∏—Ö –∑–Ω–∞—á–µ–Ω–∏–π
    if len(predictions) < len(dataset_exploded):
        missing_count = len(dataset_exploded) - len(predictions)
        predictions.extend([0] * missing_count)  # –î–æ–±–∞–≤–ª—è–µ–º –Ω—É–ª–∏ –≤ —Å–ª—É—á–∞–µ –Ω–µ–¥–æ—Å—Ç–∞—Ç–∫–∞ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π

    elif len(predictions) > len(dataset_exploded):
        predictions = predictions[:len(dataset_exploded)]  # –û–±—Ä–µ–∑–∞–µ–º —Å–ø–∏—Å–æ–∫ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π

# –ü—Ä–∏—Å–æ–µ–¥–∏–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –∫ –¥–∞—Ç–∞—Å–µ—Ç—É
dataset_exploded['predictions'] = predictions
dataset_exploded.head()



–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤:   0%|                                                                                                                                                                        | 0/65 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if poss

Unnamed: 0.1,Unnamed: 0,review_full_text,review_rating,product,category,url,corrected_text,sentences,__index_level_0__,predictions
0,0,–†–∞–±–æ—Ç–∞–µ—Ç —Ö–æ—Ä–æ—à–æ.,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–†–∞–±–æ—Ç–∞–µ—Ç —Ö–æ—Ä–æ—à–æ.,–†–∞–±–æ—Ç–∞–µ—Ç —Ö–æ—Ä–æ—à–æ,0,False
1,1,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...","–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥.",1,False
2,1,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ü—Ä–∏—à–ª–æ –±—ã—Å—Ç—Ä–æ, –≤—Å–µ —Ü–µ–ª–æ–µ –Ω–∞ –≤–∏–¥. –ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏...",–ó–∞–≤—Ç—Ä–∞ –±—É–¥—É –∏—Å–ø—ã—Ç—ã–≤–∞—Ç—å,2,True
3,2,"–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...",5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,"–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...","–ö—É–ø–∏–ª –Ω–∞ –∫–≤–∞–¥—Ä –¥–ª—è –ø–æ–¥–Ω—è—Ç–∏—è –æ—Ç–≤–∞–ª–∞, —É—Å—Ç–∞–Ω–æ–≤–∫–∞ ...",3,True
4,3,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,5,Shtapler / –õ–µ–±–µ–¥–∫–∞ —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è 12v 3000lb 13...,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,https://www.wildberries.ru/catalog/162315454/f...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è. –ù–æ –≤ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –Ω–∏ —Å–ª–æ–≤–∞ –ø—Ä–æ ...,–õ–µ–±—ë–¥–∫–∞ —Ö–æ—Ä–æ—à–∞—è.,4,True


In [8]:
# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è
logging.basicConfig(filename='./reviews_keywords/clustering.log', 
                    level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ spaCy –¥–ª—è —Ä—É—Å—Å–∫–æ–≥–æ —è–∑—ã–∫–∞
nlp = spacy.load("ru_core_news_lg")

# –£—Å—Ç–∞–Ω–æ–≤–∫–∞ —Å—Ç–æ–ø-—Å–ª–æ–≤
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:


# –ü–µ—Ä–µ–≤–æ–¥ –º–æ–¥–µ–ª–∏ –≤ —Ä–µ–∂–∏–º FP16, –µ—Å–ª–∏ —ç—Ç–æ –≤–æ–∑–º–æ–∂–Ω–æ
if torch.cuda.is_available():
    model_classification = model_classification.half()

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è —Ü–µ–Ω—Ç—Ä–∞ –∫–ª–∞—Å—Ç–µ—Ä–∞ (—Ü–µ–Ω—Ç—Ä–æ–∏–¥–∞)
def find_centroid(embeddings):
    return np.mean(embeddings, axis=0)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤
def compute_sentence_embeddings(sentences):
    # –ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ—Å—Ç—å –¥–∞–Ω–Ω—ã—Ö
    if not all(isinstance(sentence, str) and sentence.strip() for sentence in sentences):
        raise ValueError("All items in the input must be non-empty strings.")
    
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model_classification(**inputs)
        # –ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –Ω–∞–ª–∏—á–∏–µ —Å–∫—Ä—ã—Ç—ã—Ö —Å–æ—Å—Ç–æ—è–Ω–∏–π
        if outputs.hidden_states is None:
            raise ValueError("–ú–æ–¥–µ–ª—å –Ω–µ –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç —Å–∫—Ä—ã—Ç—ã–µ —Å–æ—Å—Ç–æ—è–Ω–∏—è. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é –º–æ–¥–µ–ª–∏.")
        # –ü–æ–ª—É—á–∞–µ–º –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Å–∫—Ä—ã—Ç—ã–µ —Å–æ—Å—Ç–æ—è–Ω–∏—è
        hidden_states = outputs.hidden_states[-1]
    embeddings = hidden_states.mean(dim=1).cpu().numpy()
    return embeddings

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –Ω–∞—Ö–æ–∂–¥–µ–Ω–∏—è –∫–ª—é—á–µ–≤–æ–π –º—ã—Å–ª–∏ –≤ –∫–ª–∞—Å—Ç–µ—Ä–µ
def extract_key_thought(cluster_sentences):
    sentences = cluster_sentences.split(" | ")
    
    embeddings = compute_sentence_embeddings(sentences)
    
    centroid = find_centroid(embeddings)
    similarities = cosine_similarity(embeddings, [centroid])
    key_sentence_index = np.argmax(similarities)
    
    return sentences[key_sentence_index]

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–æ–¥—Å—á–µ—Ç–∞ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ —Å–ª–æ–≤ –≤ –∫–∞–∂–¥–æ–º –∫–ª–∞—Å—Ç–µ—Ä–µ
def count_words(cluster_sentences):
    words = cluster_sentences.split()
    return len(words)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–æ–≤—Ç–æ—Ä–Ω–æ–π –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ –∫—Ä—É–ø–Ω—ã—Ö –∫–ª–∞—Å—Ç–µ—Ä–æ–≤
def recluster_large_cluster(cluster_sentences, eps=0.1, min_samples=2):
    sentences = cluster_sentences.split(" | ")
    
    embeddings = compute_sentence_embeddings(sentences)
    
    re_clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(embeddings)
    
    re_cluster_dict = {}
    for idx, label in enumerate(re_clustering.labels_):
        if label == -1:
            continue
        label_str = str(label)
        if label_str not in re_cluster_dict:
            re_cluster_dict[label_str] = []
        re_cluster_dict[label_str].append(sentences[idx])
    
    return [" | ".join(cluster) for cluster in re_cluster_dict.values()]

# –†–µ–∫—É—Ä—Å–∏–≤–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ –∫—Ä—É–ø–Ω—ã—Ö –∫–ª–∞—Å—Ç–µ—Ä–æ–≤
def recursive_clustering(cluster_sentences, threshold, eps=0.22, min_samples=3, min_eps=0.02):
    current_eps = eps
    current_min_samples = min_samples
    new_clusters = [cluster_sentences]

    while True:
        next_clusters = []
        reclustered_any = False
        
        for cluster in new_clusters:
            if count_words(cluster) > threshold:
                while current_eps >= min_eps:
                    reclustered = recluster_large_cluster(cluster, eps=current_eps, min_samples=current_min_samples)
                    
                    if len(reclustered) > 1:
                        next_clusters.extend(reclustered)
                        reclustered_any = True
                        break  # –ö–ª–∞—Å—Ç–µ—Ä —É—Å–ø–µ—à–Ω–æ —Ä–∞–∑–¥–µ–ª–µ–Ω, –≤—ã—Ö–æ–¥–∏–º –∏–∑ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–µ–≥–æ —Ü–∏–∫–ª–∞
                    else:
                        if current_eps > min_eps:
                            current_eps -= 0.05  # –£–º–µ–Ω—å—à–∞–µ–º eps –∏ –ø—Ä–æ–±—É–µ–º —Å–Ω–æ–≤–∞
                
                if len(reclustered) == 1:
                    # –ï—Å–ª–∏ –∫–ª–∞—Å—Ç–µ—Ä —Ç–∞–∫ –∏ –Ω–µ –±—ã–ª —Ä–∞–∑–¥–µ–ª–µ–Ω, –¥–æ–±–∞–≤–ª—è–µ–º –µ–≥–æ –æ–±—Ä–∞—Ç–Ω–æ
                    next_clusters.append(cluster)
            else:
                next_clusters.append(cluster)
        
        new_clusters = next_clusters
        
        if not reclustered_any:
            break
    
    return new_clusters

# –û—Å–Ω–æ–≤–Ω–æ–π –ø—Ä–æ—Ü–µ—Å—Å –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ –ø–æ –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º –∏ –ø—Ä–æ–¥—É–∫—Ç–∞–º
final_result = pd.DataFrame()

# –ì—Ä—É–ø–ø–∏—Ä—É–µ–º –ø–æ category –∏ product
for (category_name, product_name), group in tqdm(dataset_exploded[dataset_exploded["predictions"] == 1].groupby(['category', 'product']), desc="Processing categories and products"):
    all_sentences = group['sentences'].tolist()

    if not all_sentences:
        continue  # –ø—Ä–æ–ø—É—Å—Ç–∏—Ç—å, –µ—Å–ª–∏ –Ω–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π

    try:
        # –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –±–µ–∑ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è –Ω–∞ –±–∞—Ç—á–∏
        all_embeddings = compute_sentence_embeddings(all_sentences)
    except ValueError as e:
        print(f"Error in computing embeddings for product {product_name}: {e}")
        continue

    # –ü—Ä–æ–≥—Ä–µ—Å—Å-–±–∞—Ä –¥–ª—è –Ω–∞—á–∞–ª—å–Ω–æ–π –∫–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏–∏ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º HDBSCAN –∏ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–π –º–µ—Ç—Ä–∏–∫–∏
    distance_matrix = squareform(pdist(all_embeddings, metric='cosine'))
    clustering = hdbscan.HDBSCAN(min_samples=3, metric='precomputed').fit(distance_matrix)

    cluster_dict = {}
    for idx, label in enumerate(clustering.labels_):
        if label == -1:
            continue
        label_str = str(label)
        if label_str not in cluster_dict:
            cluster_dict[label_str] = set()
        cluster_dict[label_str].add(all_sentences[idx])

    clusters = [" | ".join(sentences) for sentences in cluster_dict.values()]

    if not clusters:
        continue  # –ø—Ä–æ–ø—É—Å—Ç–∏—Ç—å, –µ—Å–ª–∏ –Ω–µ—Ç –∫–ª–∞—Å—Ç–µ—Ä–æ–≤

    # –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º review_rating –≤ 1 –∏ 0
    group['binary_rating'] = group['review_rating'].apply(lambda x: 1 if x in [4, 5] else 0)

    # –†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º —Å—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥ –¥–ª—è —ç—Ç–æ–π –≥—Ä—É–ø–ø—ã
    avg_rating = group['binary_rating'].mean()
    
    # –û–ø—Ä–µ–¥–µ–ª—è–µ–º, positive –∏–ª–∏ negative
    rating_category = 'positive' if avg_rating > 0.7 else 'neutral'
    rating_category = 'neutral' if avg_rating > 0.5 else 'negative'

    # –£—Å–ª–æ–≤–∏–µ –¥–ª—è –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è –ø–æ—Ä–æ–≥–æ–≤–æ–≥–æ –∑–Ω–∞—á–µ–Ω–∏—è threshold
    if len(clusters) == 1:
        cluster_word_count = count_words(clusters[0])
        if cluster_word_count > 20:
            threshold = cluster_word_count / 2
        else:
            threshold = cluster_word_count  # –û—Å—Ç–∞–≤–ª—è–µ–º threshold –∫–∞–∫ –µ—Å—Ç—å
    else:
        # –í –ø—Ä–æ—Ç–∏–≤–Ω–æ–º —Å–ª—É—á–∞–µ –∏—Å–ø–æ–ª—å–∑—É–µ–º –∏—Å—Ö–æ–¥–Ω—É—é –ª–æ–≥–∏–∫—É –¥–ª—è —Ä–∞—Å—á–µ—Ç–∞ –ø–æ—Ä–æ–≥–∞
        threshold = np.min([np.mean([count_words(cluster) for cluster in clusters]) * 1.5, 250])

    final_clusters = []
    for cluster in clusters:
        if count_words(cluster) > threshold:
            final_clusters.extend(recursive_clustering(cluster, threshold))
        else:
            final_clusters.append(cluster)

    # –£–±–µ–¥–∏—Ç—å—Å—è, —á—Ç–æ –º–∏–Ω–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤ ‚Äî 3
    while len(final_clusters) < 3 and any(count_words(cluster) > threshold for cluster in final_clusters):
        largest_cluster = max(final_clusters, key=count_words)
        final_clusters.remove(largest_cluster)
        new_clusters = recursive_clustering(largest_cluster, threshold)
        
        if len(new_clusters) <= 1:
            final_clusters.append(largest_cluster)
            break

        final_clusters.extend(new_clusters)

    df_exploded_sorted = pd.DataFrame({
        'category': category_name,
        'product': product_name,
        'avg_rating': avg_rating,
        'rating_category': rating_category,
        'cluster_sentences': final_clusters
    })
    df_exploded_sorted['word_count'] = df_exploded_sorted['cluster_sentences'].apply(count_words)
    df_exploded_sorted['key_thought'] = df_exploded_sorted['cluster_sentences'].apply(extract_key_thought)

    df_exploded_sorted = df_exploded_sorted.sort_values(by='word_count', ascending=False)

    final_result = pd.concat([final_result, df_exploded_sorted], ignore_index=True)

# –ü–æ–∫–∞–∑–∞—Ç—å —Ä–µ–∑—É–ª—å—Ç–∞—Ç
display(final_result[['category', 'product', 'avg_rating', 'rating_category', 'cluster_sentences', 'key_thought', 'word_count']])

Processing categories and products: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:07<00:00,  2.27it/s]


Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,key_thought,word_count
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–ù–µ –≤—Å—Ç–∞–≤–∞—Ç—å —Å–∑–∞–¥–∏, –∫–æ–≥–¥–∞ –º–∞—à–∏–Ω–∞ –Ω–∞—á–∏–Ω–∞–µ—Ç –¥–≤–∏–∂–µ...","–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ—Å...",1081
1,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–•–æ—Ä–æ—à–∏–µ —Ç—Ä–∞–∫–∏, –Ω–∞ –æ—â—É–ø—å –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –ø—Ä–æ—á–Ω—ã–µ | –û...","–•–æ—Ä–æ—à–∏–µ —Ç—Ä–∞–∫–∏, –Ω–∞ –æ—â—É–ø—å –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –ø—Ä–æ—á–Ω—ã–µ",15
2,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,–í—Ä–æ–¥–µ –ø—Ä–æ—á–Ω—ã–µ. | –ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ. | –ù–∞ –≤–∏–¥ –ø—Ä–æ—á...,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ.,12
3,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,.. | . | (—Ç–∞–∫ —Å–∫–∞–∑–∞–ª). | (,..,8
4,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,–ò–Ω–æ–≥–¥–∞ –∏—Ö –≤—ã–±—Ä–∞—Å—ã–≤–∞–ª–æ –∏–∑-–ø–æ–¥ –∫–æ–ª–µ—Å. | –•–≤–∞—Ç–∞–ª–æ ...,"–ù–µ –≤—ã—Ä—É—á–∞—Ç—å –¥–∞–∂–µ –ª–µ—Ç–æ–º, —á—É—Ç–æ–∫ —Å–µ–ª –≤ –Ω–µ–±–æ–ª—å—à—É—é ...",432
5,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,"–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª | –¢—Ä–∞–∫–∏ –º–æ—â–Ω—ã–µ, –≤ –¥–µ–ª–µ –µ—â—ë ...",–í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞.,37
6,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Å–∫–æ–ª—å–∫–æ —Ö–≤–∞—Ç–∏—Ç! | –ü–æ—Å–º–æ—Ç—Ä–∏ –∏ –Ω–∞ –∫...,–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Å–∫–æ–ª—å–∫–æ —Ö–≤–∞—Ç–∏—Ç,16
7,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–º—ã–ú—ã–æ—á–Ω–æ –±—É–¥–µ–º –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë! | –ó–∞–∫–∞–∂—É –µ—â–µ –Ω–µ...,–ë—É–¥—É –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë.,13
8,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,.. | –¥‚Ä¶. | .,..,5
9,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,"–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª...","–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª–µ¬ª",24


In [10]:
final_result

In [None]:
# –£–¥–∞–ª–µ–Ω–∏–µ –∑–∞–ø–∏—Å–µ–π —Å word_count <= 10 –∏ –∫–ª—é—á–µ–≤–æ–π –º—ã—Å–ª—å—é –º–µ–Ω–µ–µ 3 —Å–∏–º–≤–æ–ª–æ–≤
final_result = final_result[((final_result['word_count'] > 10) & (final_result['key_thought'].str.len() > 5))]
final_result

Unnamed: 0,category,product,avg_rating,rating_category,cluster_sentences,word_count,key_thought
0,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–ù–µ –≤–æ –≤—Å–µ—Ö —Å–ª—É—á–∞—è—Ö, –∫–æ–Ω–µ—á–Ω–æ, —ç—Ç–∏ –ê–Ω—Ç–∏ –±—É–∫—Å—ã –º–æ...",1081,"–•–æ—Ä–æ—à–∏ –∫–æ–≥–¥–∞ –Ω—É–∂–Ω–æ ""–≤—ã—Å–∫–æ—á–∏—Ç—å"" –∏–∑ —Å–Ω–µ–∂–Ω–æ–≥–æ –º–µ—Å..."
1,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,"–û—Ç–ª–∏—á–Ω—ã–µ —Ç—Ä–∞–∫–∏, –∏—Å–ø—ã—Ç–∞–ª–∏ –Ω–∞ –≥–∞–∑–µ–ª–∏ | –•–æ—Ä–æ—à–∏–µ —Ç...",15,"–•–æ—Ä–æ—à–∏–µ —Ç—Ä–∞–∫–∏, –Ω–∞ –æ—â—É–ø—å –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –ø—Ä–æ—á–Ω—ã–µ"
2,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–ü–ú / –ê–Ω—Ç–∏–±—É–∫—Å - –∞–Ω—Ç–∏–ø—Ä–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ —É—Ç–æ–ª...,0.819588,neutral,–ù–∞ –≤–∏–¥ –ø—Ä–æ—á–Ω—ã–µ –∏ –∫–æ–ª—é—á–∏–µ. | –ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ. | ...,12,–ù–∞ –≤–∏–¥ –∫—Ä–µ–ø–∫–∏–µ.
4,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,"–í—ã—Ä—É —á–∞–π–∫–∞ –Ω–∞ —Å–∞–º–æ–º –¥–µ–ª–µ —Ç–∞—Ö—Ç–∞, –∑–∞—Å—Ç—Ä—è–ª –≤ —Å—É–≥—Ä...",432,"–ù–µ –≤—ã—Ä—É—á–∞—Ç—å –¥–∞–∂–µ –ª–µ—Ç–æ–º, —á—É—Ç–æ–∫ —Å–µ–ª –≤ –Ω–µ–±–æ–ª—å—à—É—é ..."
5,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/OFFroad,–í–´–†–£–ß–ê–ô–ö–ê / –ê–Ω—Ç–∏–±—É–∫—Å –ü—Ä–æ—Ç–∏–≤–æ–±—É–∫—Å–æ–≤–æ—á–Ω—ã–µ —Ç—Ä–∞–∫–∏ ...,0.842975,neutral,", –Ω–æ –≤ –¥–µ–ª–µ –µ—â—ë –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∏ | –í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ...",37,–í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞.
6,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Å–∫–æ–ª—å–∫–æ —Ö–≤–∞—Ç–∏—Ç! | –ü–æ—Å–º–æ—Ç—Ä–∏ –∏ –Ω–∞ –∫...,16,–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Å–∫–æ–ª—å–∫–æ —Ö–≤–∞—Ç–∏—Ç
7,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–∞—Ö–Ω–µ—Ç –∏ –¢–æ—á–∫–∞ / –ê—Ä–æ–º–∞—Ç–∏–∑–∞—Ç–æ—Ä –≤ –º–∞—à–∏–Ω—É –∞–≤—Ç–æ–ø–∞—Ä...,0.704545,neutral,–ó–∞–∫–∞–∂—É –µ—â–µ –Ω–µ —Ä–∞–∑! | –ë—É–¥—É –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë. | –º—ã...,13,–ë—É–¥—É –∑–∞–∫–∞–∑—ã–≤–∞—Ç—å –µ—â—ë.
9,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,"–ö —Å–æ–∂–∞–ª–µ–Ω–∏—é –∑–∞–±—ã–ª —Å–¥–µ–ª–∞—Ç—å —Ñ–æ—Ç–æ ""–¥–æ"" | –§–æ—Ç–æ –Ω–µ ...",24,"–§–æ—Ç–æ ¬´–¥–æ¬ª –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ —Å–¥–µ–ª–∞–ª–∞, —Ç–æ–ª—å–∫–æ ¬´–ø–æ—Å–ª–µ¬ª"
10,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,–û—Ç—Ç–∏—Ä–∞–ª –∞–≤—Ç–æ–º–æ–±–∏–ª—å–Ω—ã–π –Ω–æ–º–µ—Ä –æ—Ç —Å–ª–µ–¥–æ–≤ —Ä–∂–∞–≤—ã—Ö –±...,24,"–£–¥–∞–ª—è–ª ""–∂—É—á–∫–∏"" –Ω–∞ –¥–≤–µ—Ä—è—Ö –∞–≤—Ç–æ."
11,/–ê–≤—Ç–æ—Ç–æ–≤–∞—Ä—ã/–ê–≤—Ç–æ–∫–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –∞–≤—Ç–æ—Ö–∏–º–∏—è,–ü–æ–ª–∏–ö–æ–º–ü–ª–∞—Å—Ç / –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å –æ—á–∏—Å—Ç–∏—Ç–µ–ª—å —Ä–∂–∞–≤...,0.853933,neutral,–í –¥–µ–ª–µ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª | –í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞....,14,–í –¥–µ–ª–µ –ø–æ–∫–∞ –Ω–µ –ø—Ä–æ–±–æ–≤–∞–ª–∞.


In [None]:
final_result.to_csv("./reviews_keywords/feedbackfueltest.csv")