In [None]:
!pip install emoji

In [None]:
!pip install --upgrade --force-reinstall requests

In [None]:
!pip install setfit

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import emoji
from datasets import Dataset, DatasetDict
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset
from sentence_transformers.losses import CosineSimilarityLoss

In [None]:
# Load the dataset from Google Drive (the Drive must already be mounted at
# /content/drive for this path to resolve) and drop the 'Unnamed: 0' column —
# presumably the index written by an earlier DataFrame.to_csv; TODO confirm.
# Loading and dropping happen in one statement so the cell can be re-run
# without a KeyError for an already-removed column.
df = pd.read_csv(
    '/content/drive/MyDrive/Znatno_personal/Отчетность ВКР/Артефакты/Parsing_avito/my_data22.csv'
).drop(columns='Unnamed: 0')

In [None]:
def convert_emojis_to_words(text):
    """Strip emoji from ``text`` and turn underscores into spaces.

    Despite the name, emoji are not translated into words: every emoji is
    replaced with an empty string via ``emoji.replace_emoji``.

    Args:
        text: Input string.

    Returns:
        The string without emoji and with each "_" replaced by a space.
    """
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji.replace("_", " ")

In [None]:
# Special characters to strip from the text; a run of several is matched at once.
# Raw strings keep `\|` and `\s` as regex escapes rather than invalid Python
# string escapes (which trigger a SyntaxWarning on recent Python versions).
symbols_pattern = re.compile(
    r"[@_!#$%^&*()<>?/\|}{~√•—]+",
    flags=re.UNICODE,
)
# Any run of whitespace characters (later collapsed to a single space).
space_pattern = re.compile(r'\s+')

In [None]:
def clear_text(text):
    """Remove special characters and emoji, then collapse whitespace.

    Args:
        text: Input string.

    Returns:
        The string with every match of ``symbols_pattern`` removed, passed
        through ``convert_emojis_to_words``, and with each whitespace run
        matched by ``space_pattern`` replaced by a single space.
    """
    # Special characters go first, emoji second — same order as before.
    without_noise = convert_emojis_to_words(symbols_pattern.sub('', text))
    return space_pattern.sub(' ', without_noise)

In [None]:
def preprocess_text(text):
    """Final preprocessing step for one raw value.

    The value is converted with ``str()`` (so non-string inputs are
    stringified), stripped of surrounding whitespace and lower-cased, then
    cleaned by ``clear_text``. No punctuation filtering happens here beyond
    what ``clear_text`` removes.

    Args:
        text: Raw value; any object accepted by ``str()``.

    Returns:
        The cleaned, lower-cased string.
    """
    # The former character-by-character ''.join(...) was a plain copy of the
    # string, so the normalised value is passed on directly.
    return clear_text(str(text).strip().lower())

In [None]:
# Clean the table in one chain: drop rows with missing values, drop exact
# duplicate rows, then renumber the index from zero.
df = (
    df
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Show the cleaned frame (after dropna / drop_duplicates / reset_index).
df

In [None]:
# Distribution of the raw rating column, before it is capped and shifted.
df['Оценка'].value_counts()

In [None]:
# Cap ratings at 9, then shift by one so labels start one lower (0-based ids
# if the smallest raw rating is 1 — TODO confirm).
# NOTE: this overwrites the raw column, so run it once per fresh load;
# re-running the cell shifts the labels again.
df['Оценка'] = df['Оценка'].clip(upper=9) - 1

In [None]:
# Build one text field per row: the free-form 'О себе' text followed by each
# remaining field rendered as "<column name>: <value>", joined by single spaces.
df['info'] = df['О себе'].astype(str).str.cat(
    [
        column + ': ' + df[column].astype(str)
        for column in ['Обязанности', 'Компания', 'Учебные заведения', 'Стаж работы', 'Образование']
    ],
    sep=' ',
)

In [None]:
# Normalise every combined text (strip, lower-case, remove symbols/emoji).
df['info'] = df['info'].map(preprocess_text)

In [None]:
# Hold out 10% of the rows as the test split, then 10% of the remainder as the
# evaluation split; both splits are stratified by the label and use a fixed seed.
# The intermediate frame has its own name so `df` keeps the full dataset and
# re-running this cell does not split an already-reduced frame.
df_trainval, df_test = train_test_split(
    df, test_size=0.1, random_state=42, stratify=df['Оценка']
)
df_train, df_eval = train_test_split(
    df_trainval, test_size=0.1, random_state=42, stratify=df_trainval['Оценка']
)

In [None]:
# Texts and labels of the held-out test split, as pandas Series.
x_test, y_test = df_test['info'], df_test['Оценка']

In [None]:
# Label distribution in the training split.
df_train['Оценка'].value_counts()

In [None]:
# Convert the pandas splits to Hugging Face datasets.
eval_dataset = Dataset.from_pandas(df_eval)
test_dataset = Dataset.from_pandas(df_test)

# Simulate the few-shot regime by sampling 64 examples per class.
# The sample is always drawn from the full training split (not from a
# previously sampled `train_dataset`), so this cell can be re-run safely.
train_dataset = sample_dataset(
    Dataset.from_pandas(df_train),
    label_column="Оценка",
    num_samples=64,
)

In [None]:
# Load a SetFit model from the Hub.
# The label ids are derived from `num_classes` so the two cannot drift apart;
# this yields the same list as before: [0, 1, ..., 8].
num_classes = 9
model = SetFitModel.from_pretrained(
    "intfloat/multilingual-e5-large-instruct",
    labels=list(range(num_classes)),
)


In [None]:
# Training configuration for the SetFit trainer.
# Only `eval_strategy` is passed: `evaluation_strategy` is its deprecated alias
# in recent SetFit releases, and supplying both was redundant.
args = TrainingArguments(
    #output_dir="//content/drive/MyDrive/SetFit2/",
    batch_size=16,
    num_epochs=2,
    # NOTE(review): 1e-7 is a very small learning rate for fine-tuning the
    # sentence-transformer body — confirm it is intentional.
    body_learning_rate=1e-7,
    use_amp=True,
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    logging_steps=500,
    #logging_dir = "//content/drive/MyDrive/SetFit/logs",
    report_to="tensorboard",
)

In [None]:
# Map the notebook's column names onto the "text"/"label" names the trainer expects.
setfit_column_names = {"info": "text", "Оценка": "label"}

trainer = Trainer(
    model=model,
    args=args,
    metric="accuracy",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    column_mapping=setfit_column_names,
)

In [None]:
# Run training with the model, arguments and datasets configured on `trainer`.
trainer.train()

In [None]:
# Compute the configured metric ("accuracy"); with no dataset argument this
# should use the eval_dataset given to Trainer — verify against the SetFit docs.
metrics = trainer.evaluate()
metrics

In [None]:
# Predict a label for every text in the held-out test split.
pred = model.predict(test_dataset['info'])

In [None]:
# Overall quality on the test split: plain accuracy and support-weighted F1.
test_labels = test_dataset['Оценка']
print('Accuracy:', accuracy_score(test_labels, pred))
print('F1 score:', f1_score(test_labels, pred, average='weighted'))

In [None]:
# Per-class quality on the held-out test split.
y_test = np.array(test_dataset['Оценка'])
y_pred = np.array(pred)
# Only classes that actually occur in the test labels are reported.
classes = np.unique(y_test)

# 1) F1 score for each class separately.
f1_per_class = f1_score(y_test, y_pred, labels=classes, average=None, zero_division=0)

# 2) "Accuracy per class": the share of correctly predicted examples among all
#    examples of that class (in the multiclass setting this equals the recall
#    of the class).
accuracy_per_class = np.array([
    np.mean(y_pred[y_test == cls] == cls)
    for cls in classes
])

# 3) Collect everything into one table. It gets its own name instead of `df`,
#    so the dataset variable is not overwritten by the metrics.
per_class_metrics = pd.DataFrame({
    'class': classes.astype(str),
    'accuracy': accuracy_per_class,
    'f1_score': f1_per_class
}).set_index('class')

per_class_metrics