In [1]:
import re
import html
import os
import time
import json
from io import BytesIO
from tqdm.notebook import tqdm
from collections import Counter

from mistralai import Mistral
import pandas as pd



## Data Import

In [2]:
class_mapping = {
    "G": 1,
    "N": 0,
    "B": 2
}

In [3]:
comments_500 = pd.read_excel("dataset_comments.xlsx")
comments_35 = pd.read_excel("dataset_comments_35.xlsx")

In [4]:
comments_35["Class"] = comments_35["Class"].map(class_mapping)

## Data Cleaning

In [5]:
def remove_html_tags(text):
    """Remove HTML tags from a string and handle artifacts like entities, extra spaces, and punctuation spacing."""
    clean_text = re.sub(r'<[^>]+>', '', text)
    clean_text = html.unescape(clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    clean_text = clean_text.strip()
    clean_text = re.sub(r'([.,!?])(?=\S)', r'\1 ', clean_text)
    return clean_text

In [6]:
comments_35["MessageText"] = comments_35["MessageText"].apply(remove_html_tags)
comments_500["MessageText"] = comments_500["MessageText"].apply(remove_html_tags)

In [7]:
comments_35.to_csv("comments_35.csv", index=False)
comments_500.to_csv("comments_500.csv", index=False)

## Synthetic data generation

In [8]:


client = Mistral(api_key="<INSERT_YOUR_KEY_HERE>")

system_prompt = """Role: Professional Text Classifier

Task: Classify Russian messages into 3 categories:
0: NEUTRAL - Emotionless information, discussions without sentiment
1: POSITIVE - Clear positive sentiment (joy, gratitude, admiration, congratulations, even if formal or polite)
2: NEGATIVE - Negative emotions (anger, insults, harsh criticism, dissatisfaction)

Rules:
1. Positive texts include formal positivity (e.g., birthday congratulations, polite expressions).
2. Negative texts dominate if any negative language are present, even alongside positive elements.
3. Neutral texts are purely informational, without emotional tone.
4. Return ONLY the numeric class (0/1/2) in JSON format.
5. Make 5 INDEPENDENT classifications per text.
6. Never explain your choice.

Output Format: {"class": <0|1|2>}"""

def classify_text(text: str) -> dict:
    classifications = []
    
    response = client.chat.complete(
        model="mistral-large-latest",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text}
        ],
        response_format={"type": "json_object"},
        temperature=0.7,
        max_tokens=50,
        n=5
    )

    for choice in response.choices:
        try:
            result = json.loads(choice.message.content)
            classifications.append(int(result["class"]))
        except:
            classifications.append(-1)

    
    counts = Counter(classifications)
    final_label = counts.most_common(1)[0][0]
    
    return {
        "text": text,
        "labels": classifications,
        "final_label": final_label,
        "confidence": "high" if counts[final_label] >= 4 else "low"
    }

In [114]:
texts = comments_500["MessageText"].to_list()
results = []

for t in tqdm(texts):
    results.append(classify_text(t))
    time.sleep(2)

  0%|          | 0/508 [00:00<?, ?it/s]

In [123]:
mistral_labels = pd.DataFrame(results)

In [124]:
result_df = pd.concat([comments_500, mistral_labels], axis=1)
result_df.to_csv("comments_500_mistral_labeled.csv", index=False)

In [14]:
result_df = pd.read_csv("comments_500_mistral_labeled.csv")

## Работаем с разметкой

In [15]:
result_df = result_df.drop(columns=["text"])

In [16]:
result_df.loc[result_df.final_label==-1]

Unnamed: 0,UserSenderId,SubmitDate,MessageText,labels,final_label,confidence
52,17798,2023-01-24 12:48:46.057,Под MacBook Air 13 отлично подошла,"[-1, 1, -1, 0, -1]",-1,low
430,8906,2014-02-10 14:33:39.073,Loki,"[-1, -1, -1, -1, -1]",-1,high
433,8874,2014-02-07 12:28:33.240,Phoenix,"[-1, -1, -1, -1, -1]",-1,high


Отбросим примеры в которых модель ничего не поняла (и я тоже)

In [17]:
result_df_clear = result_df.loc[result_df.final_label!=-1]

In [18]:
result_df_clear

Unnamed: 0,UserSenderId,SubmitDate,MessageText,labels,final_label,confidence
0,14771,2023-05-16 19:27:21.773,В общем podman - это не наш метод. На нем рабо...,"[2, 2, 2, 2, 2]",2,high
1,8194,2023-05-16 16:56:35.380,"А зачем ввод ИНН? Если его нет у компании, то ...","[0, 2, 0, 2, 0]",0,low
2,18355,2023-05-16 15:08:17.097,сориентируйте по размерам пожалуйста,"[0, 0, 0, 0, 0]",0,high
3,18551,2023-05-16 09:33:51.050,"толстовка хорошая, но начес лезет сильно. наде...","[2, 2, 2, 2, 2]",2,high
4,14634,2023-05-15 14:21:37.207,Здорово! Нащупали идеальный формат 😎👍👍Из предл...,"[1, 1, 1, 1, 1]",1,high
...,...,...,...,...,...,...
503,8340,2018-12-12 09:10:29.027,Классная идея! ! ! ),"[1, 1, 1, 1, 1]",1,high
504,14771,2018-12-11 11:16:14.397,Бессмысленный товар. Я думал светится хотя бы ...,"[2, 2, 2, 2, 2]",2,high
505,9190,2018-12-11 09:16:55.903,"Коллеги, большое спасибо за поздравления :)","[1, 1, 1, 1, 1]",1,high
506,8400,2018-12-11 08:45:17.930,Вчера прикупил. Очень вкусно.,"[1, 1, 1, 1, 1]",1,high


## New dataset

In [19]:
client = Mistral(api_key="T2pzSdDaEsXIctzSlWmbKSaPrK6SFnDD")

system_prompt = """Role: Synthetic Message Generator

Task: Generate 10 diverse natural Russian messages matching:
- Overall sentiment of provided example
- General thematic/style of provided example
- Absolutely unique content with varied contexts

Rules:
1. Focus on general sentiment of provided example
2. Generate non-obvious messages without direct emotional words
3. Ensure diversity:
   - Different contexts/scenarios
   - Varied sentence structures
   - Unique topics for each message matching original example thematic 
4. Never repeat phrases or paraphrase
5. Follow example message length
6. Output format: {"messages": ["msg1", "msg2", ..., "msg10"]}"""

def generate_synthetic_messages(example_message: str = None) -> list[str]:    
    dynamic_prompt = f"\n\nStyle and thematic example (DO NOT REPLICATE): {example_message}"
    
    sentiment_definitions = {
        0: "NEUTRAL: Pure information, technical discussions, emotionless exchanges",
        1: "POSITIVE: Contains subtle positivity without obvious emotional words",
        2: "NEGATIVE: Shows dissatisfaction"
    }
        
    try:
        response = client.chat.complete(
            model="mistral-large-latest",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": dynamic_prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.7,
            max_tokens=5000
        )
        
        data = json.loads(response.choices[0].message.content)
        
        if "messages" not in data:
            raise ValueError("Invalid response format")
            
        messages = [msg.strip() for msg in data["messages"] if isinstance(msg, str)]
        
        seen = set()
        unique_messages = []
        for msg in messages:
            lower_msg = msg.lower()
            if (len(msg) > 20 and 
                lower_msg not in seen and 
                not any(lower_msg in s for s in seen)):
                seen.add(lower_msg)
                unique_messages.append(msg)
        
        return unique_messages[:10]

    except Exception as e:
        print(f"Generation error: {e}")
        return []

In [20]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    result_df_clear[['MessageText', 'final_label']],
    test_size=0.3,
    stratify=result_df_clear['final_label'],
    random_state=42
)

In [22]:
def generate_synthetic_dataset(original_df, samples_per_row=10):
    synthetic_data = []
    
    for _, row in tqdm(original_df.iterrows()):
        try:
            # Generate synthetic messages
            generated = generate_synthetic_messages(
                example_message=row['MessageText']
            )
            
            # Add generated messages with original label
            for msg in generated[:samples_per_row]:  # Ensure max samples_per_row
                synthetic_data.append({
                    'MessageText': msg,
                    'final_label': row['final_label']
                })
                
        except Exception as e:
            print(f"Error processing row {_}: {str(e)}")
            continue
    
    return pd.DataFrame(synthetic_data)

train_synthetic = generate_synthetic_dataset(train_df)
test_synthetic = generate_synthetic_dataset(test_df)

train_synthetic.to_csv('train_synthetic.csv', index=False)
test_synthetic.to_csv('test_synthetic.csv', index=False)

print("Original Train shape:", train_df.shape)
print("Synthetic Train shape:", train_synthetic.shape)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Original Train shape: (353, 2)
Synthetic Train shape: (3353, 2)


In [23]:
for i in train_synthetic.MessageText.to_list():
    print(i)
    print("+++++++++++ХУЙ+++++++")

Идея интересная, но некоторые детали выглядят недоработанными (например, крылья бабочки кажутся слишком простыми). Зато текстура перьев и чешуя рыбы выглядят реалистично, что придаёт композиции завершённость.
+++++++++++ХУЙ+++++++
Концепция выставки уникальная, но освещение в некоторых залах слишком тусклое (картины в зале импрессионистов теряются на фоне стен). Зато скульптуры и инсталляции в центральном холле выглядят впечатляюще, создавая атмосферу погружения в искусство.
+++++++++++ХУЙ+++++++
Проект амбициозный, но часть функционала оставляет желать лучшего (навигация по сайту кажется запутанной). Зато дизайн главной страницы и анимации выполнены на высоком уровне, что привлекает внимание пользователей.
+++++++++++ХУЙ+++++++
Рецепт оригинальный, но некоторые ингредиенты не раскрывают свой вкус (лимонное масло теряется на фоне других специй). Зато текстура готового блюда и аромат вызывают аппетит, что делает его достойным внимания.
+++++++++++ХУЙ+++++++
Фильм захватывающий, но некот

In [24]:
result_df_clear.to_csv("comments_500_mistral_labeles_clear.csv", index=False)

In [25]:
test_df.to_csv("clear_test.csv")