In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string
import emoji
from transformers import AutoTokenizer, AutoModel
from torch import cuda
from tqdm import tqdm
import torch.nn.functional as F
import torch
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from transformers import AutoTokenizer, AutoModel
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score,f1_score,accuracy_score,confusion_matrix
import optuna
from tqdm import tqdm
import os
from matplotlib import pyplot as plt
# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
np.random.seed(0)

In [None]:
# Load the labelled training table from CSV.
# NOTE(review): hard-coded absolute path (looks like a Colab Drive mount) — the cell only runs where this exact layout exists.
data_train = pd.read_csv('/content/drive/MyDrive/Znatno_personal/Отчетность ВКР/Артефакты/Parsing_avito/Model for benchmark/data/train - train (3).csv')

In [None]:
# Load the held-out test table from CSV.
# NOTE(review): hard-coded absolute path (looks like a Colab Drive mount) — the cell only runs where this exact layout exists.
data_test = pd.read_csv('/content/drive/MyDrive/Znatno_personal/Отчетность ВКР/Артефакты/Parsing_avito/Model for benchmark/data/test_data.csv')

In [None]:
# Text column is the model input, the 'Оценка' column is the class label;
# the same two columns are taken from the train and the test tables.
X, y = data_train['message.text'], data_train['Оценка']
X_test, y_test = data_test['message.text'], data_test['Оценка']

# Предобработка

In [None]:
def convert_emojis_to_words(text):
    """Replace each emoji in `text` with its textual name.

    `emoji.demojize` is called with a single space as both delimiters, so the
    emoji names are surrounded by spaces. Afterwards every underscore in the
    whole string (not only those inside emoji names) is turned into a space.

    Args:
        text: the string to convert.

    Returns:
        The converted string.
    """
    return emoji.demojize(text, delimiters=(" ", " ")).replace("_", " ")

In [None]:
# Special characters that are stripped from the text. Written as a raw string:
# "\|" is an invalid escape sequence in an ordinary string literal and triggers
# a warning on recent Python versions. Inside the character class it matches "|".
symbols_pattern = re.compile(r"[@_!#$%^&*()<>?/\|}{~√•—]+", flags=re.UNICODE)
# Any run of whitespace (collapsed to a single space by clear_text).
space_pattern = re.compile(r'\s+')


def clear_text(text):
    """Strip special characters and links from `text` and spell out emojis.

    Steps, in order: remove the characters listed in `symbols_pattern`, remove
    every token starting with "http", replace emojis with their names via
    `convert_emojis_to_words`, then collapse whitespace runs to one space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    pre = symbols_pattern.sub('', text)
    pre = re.sub(r'http\S+', '', pre)
    pre = convert_emojis_to_words(pre)
    return space_pattern.sub(' ', pre)

In [None]:
def preprocess_text(text):
    """Normalise one raw message before vectorisation.

    The value is converted to `str`, stripped of surrounding whitespace and
    lower-cased, then passed through `clear_text`.

    Args:
        text: the raw value; anything accepted by `str()`.

    Returns:
        The cleaned, lower-cased string.
    """
    # The original rebuilt the string character by character with ''.join,
    # which is a no-op; the normalised string is used directly.
    sentence = str(text).strip().lower()
    return clear_text(sentence)

In [None]:
# Apply preprocess_text to every training message.
# NOTE(review): X is rebound to the result, so re-running this cell feeds already-processed text through the function again.
X = X.apply(preprocess_text)

In [None]:
# Apply preprocess_text to every test message.
# NOTE(review): X_test is rebound to the result, so re-running this cell feeds already-processed text through the function again.
X_test = X_test.apply(preprocess_text)

In [None]:
# Hold out 10% of the training data as a validation split, with a fixed random_state.
# NOTE(review): a later cell rebinds `y` to the array of training-split labels, so this cell must run before it.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state = 0)

# **TF-IDF + LogisticRegression**

In [None]:
# Column-wise preprocessing: median imputation + scaling for numeric columns,
# one-hot encoding + chi2 selection (top 50%) for categorical columns, TF-IDF
# for the text column.
# NOTE(review): selecting columns by name requires a DataFrame that contains
# them; the X_train built in this notebook is a single text Series — confirm
# which input frame this preprocessor is meant for.
numeric_features = ["Стаж работы","Возраст"]
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_features = ["Тип занятости", "Сфера деятельности", "Образование", "Пол"]
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(chi2, percentile=50)),
])

text_transformer = Pipeline([("vectorizer", TfidfVectorizer())])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
    ("text", text_transformer, 'user_text_info'),
])

In [None]:
# TF-IDF + logistic regression baseline trained directly on the message text.
# X_train / X_test are 1-D Series of preprocessed strings, so the vectorizer is
# applied to them as-is: a ColumnTransformer that selects columns by name needs
# a DataFrame containing those columns and cannot be fitted on a Series.
clf = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer()),
        ("classifier", LogisticRegression(max_iter=2000)),
    ]
)

clf.fit(X_train, y_train)
print("Accuracy score: %.3f" % clf.score(X_test, y_test))

In [None]:
# Predict test labels with the fitted baseline pipeline.
pred = clf.predict(X_test)

In [None]:
# Balanced accuracy of the baseline predictions on the test set.
balanced_accuracy = balanced_accuracy_score(y_test, pred)
print("Balanced Accuracy:", balanced_accuracy)

In [None]:
# F1 of the baseline predictions on the test set, averaged with average='weighted'.
f1_ = f1_score(y_test, pred, average='weighted')
print("F1 score:", f1_)

# **Multilingual-e5-large-instruct**

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# Bare expression so the notebook displays the chosen device.
device

In [None]:
# Load the tokenizer and the encoder for intfloat/multilingual-e5-large-instruct.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large-instruct")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-large-instruct")
# Move the encoder to the selected device; as the last expression its return value is displayed by the notebook.
model.to(device)

In [None]:
def embed_text(text):
    """Embed one text with the E5 encoder and return an L2-normalised vector.

    The text is tokenised (truncated to 512 tokens), run through the
    module-level `model` without gradient tracking, and the token states are
    averaged over the non-padding positions. The E5 model card specifies this
    attention-masked average pooling; the previous implementation took only
    the first token's state.

    Args:
        text: the string to embed.

    Returns:
        A 1-D NumPy array holding the normalised embedding of `text`.
    """
    t = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    t = {k: v.to(model.device) for k, v in t.items()}
    with torch.no_grad():
        model_output = model(**t)
    # Zero out padding positions, then divide the sum by the number of real tokens.
    mask = t['attention_mask'].unsqueeze(-1).bool()
    summed = model_output.last_hidden_state.masked_fill(~mask, 0.0).sum(dim=1)
    embeddings = summed / t['attention_mask'].sum(dim=1, keepdim=True)
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings[0].cpu().numpy()

In [None]:
# Pre-allocate one row per training text; 1024 columns — presumably the width of the vectors returned by embed_text, TODO confirm.
x = np.zeros((len(X_train), 1024))
# NOTE(review): this rebinds `y` (previously the full label Series) to the training-split labels as an array.
y =np.array(y_train)


In [None]:
# Embed every training text with embed_text, writing one row of `x` per text.
for row, text in enumerate(tqdm(X_train)):
    x[row] = embed_text(text)

In [None]:
# Pre-allocate one row per test text; 1024 columns — presumably the width of the vectors returned by embed_text, TODO confirm.
x_test = np.zeros((len(X_test), 1024))
# Test labels as a NumPy array (rebinds y_test).
y_test =np.array(y_test)

In [None]:
# Embed every test text with embed_text, writing one row of `x_test` per text.
for row, text in enumerate(tqdm(X_test)):
    x_test[row] = embed_text(text)

In [None]:
# Pre-allocate one row per validation text; 1024 columns — presumably the width of the vectors returned by embed_text, TODO confirm.
x_val = np.zeros((len(X_val), 1024))
# Validation labels as a NumPy array (rebinds y_val).
y_val =np.array(y_val)

In [None]:
# Embed every validation text with embed_text, writing one row of `x_val` per text.
for row, text in enumerate(tqdm(X_val)):
    x_val[row] = embed_text(text)

## LogisticRegression

In [None]:
# Fit a logistic regression on the training embeddings `x` and labels `y`.
logistic_reg = LogisticRegression(max_iter=2000)
logistic_reg.fit(x, y)

In [None]:
# Predict test labels from the test embeddings (rebinds `pred`).
pred = logistic_reg.predict(x_test)

In [None]:
# Test-set metrics for the logistic regression; the accuracy is computed once
# and the stored value is printed.
accuracy = accuracy_score(y_test, pred)
print("Accuracy for Logistic Regression:", accuracy)
print("F1 score for Logistic Regression:", f1_score(y_test, pred, average='weighted'))
print("Balanced accuracy for Logistic Regression:", balanced_accuracy_score(y_test, pred))

In [None]:
def conf_matrix(y_test, pred):
  """Plot the confusion matrix of `pred` against `y_test` as a heat map.

  Rows are true classes and columns are predicted classes; the class set is
  the sorted unique values of `y_test`.

  Args:
    y_test: true labels.
    pred: predicted labels, aligned with `y_test`.

  Returns:
    None. The figure is shown with `plt.show()`.
  """
  labels = np.unique(y_test)
  cm = confusion_matrix(y_test, pred, labels=labels)

  fig, ax = plt.subplots(figsize=(6,6))
  im = ax.imshow(cm, interpolation='nearest', aspect='auto')
  # One tick per class actually present. The tick count was hard-coded to 9,
  # which fails whenever y_test holds a different number of classes.
  ticks = range(len(labels))
  tick_labels = labels.astype(int)
  ax.set_xticks(ticks)
  ax.set_yticks(ticks)
  ax.set_xticklabels(tick_labels, rotation=45)
  ax.set_yticklabels(tick_labels)
  ax.set_ylabel('Истинный класс')
  ax.set_xlabel('Предсказанный класс')
  fig.colorbar(im, ax=ax)
  plt.tight_layout()
  plt.show()

In [None]:
def get_metrics_per_class(y_test, pred):
  """Return per-class accuracy and F1 as a DataFrame indexed by class label.

  Args:
    y_test: true labels (array, list or Series).
    pred: predicted labels, positionally aligned with `y_test`.

  Returns:
    DataFrame indexed by 'class' (label as string) with columns 'accuracy'
    and 'f1_score'. 'accuracy' is the share of a class's examples that were
    predicted correctly, i.e. the recall of that class.
  """
  # Coerce to arrays so the boolean mask below is purely positional; plain
  # lists or Series with a non-default index would otherwise fail or misalign.
  y_true = np.asarray(y_test)
  y_hat = np.asarray(pred)
  classes = np.unique(y_true)

  # 1) F1 score per class
  f1_per_class = f1_score(y_true, y_hat, labels=classes, average=None, zero_division=0)

  # 2) Per-class accuracy: fraction of correct predictions among the examples of that class
  accuracy_per_class = np.array([
      np.mean(y_hat[y_true == cls] == cls)
      for cls in classes
  ])

  # 3) Collect everything in a DataFrame
  df = pd.DataFrame({
      'class': classes.astype(str),
      'accuracy': accuracy_per_class,
      'f1_score': f1_per_class
  }).set_index('class')
  return df

In [None]:
# Confusion matrix for the current `pred` (the logistic-regression predictions at this point).
conf_matrix(y_test, pred)

In [None]:
# Per-class accuracy and F1 for the current `pred`; displayed as the cell output.
get_metrics_per_class(y_test, pred)

## XGB

In [None]:
def objective(trial):
    """Optuna objective: validation accuracy of an XGBoost classifier.

    Samples hyper-parameters from `trial`, fits an XGBClassifier on the
    notebook-level arrays `x` / `y` and scores it on `x_val` / `y_val`.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    params = {
        "objective": "multi:softmax",
        "num_class": 9,
        "eval_metric": "mlogloss",
        # suggest_loguniform is deprecated; suggest_float(..., log=True)
        # samples the same log-uniform range.
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 100),
        "n_estimators": trial.suggest_int("n_estimators", 50, 2000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0)
    }

    model = XGBClassifier(**params)
    model.fit(x, y)

    y_pred = model.predict(x_val)
    return accuracy_score(y_val, y_pred)

# Run the hyper-parameter search, maximising validation accuracy.
# The sampler is seeded so the sequence of sampled trials is repeatable;
# the default sampler is unseeded and gives a different search on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=50)

# Best parameters found by the search
print("Лучшие параметры:", study.best_params)

In [None]:
# XGBoost with fixed hyper-parameters, trained on the training embeddings.
# NOTE(review): the values look like the output of an earlier search run — confirm.
xgb = XGBClassifier(
    objective="multi:softmax",
    num_class=9,
    eval_metric="mlogloss",
    learning_rate=0.07078025811321921,
    max_depth=46,
    n_estimators=1284,
    subsample=0.5458605221291573,
    colsample_bytree=0.15268221810604315,
)
xgb.fit(x, y)

# Predictions and evaluation on the test set
y_pred = xgb.predict(x_test)
print("Accuracy for XGB:", accuracy_score(y_test, y_pred))
print("F1 score for XGB:", f1_score(y_test, y_pred, average='weighted'))
print("Balanced accuracy for XGB:", balanced_accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the XGBoost predictions.
conf_matrix(y_test, y_pred)

In [None]:
# Per-class accuracy and F1 for the XGBoost predictions; displayed as the cell output.
get_metrics_per_class(y_test, y_pred)

# GigaChat

In [None]:
# Same device selection as in the earlier E5 section, repeated here.
device = 'cuda' if cuda.is_available() else 'cpu'
# Bare expression so the notebook displays the chosen device.
device

In [None]:
!pip install transformers==4.46.0

In [None]:
# Load the ai-sage/Giga-Embeddings-instruct encoder.
# NOTE(review): trust_remote_code=True runs Python code shipped in the model repository — consider pinning a reviewed `revision`.
model1 = AutoModel.from_pretrained('ai-sage/Giga-Embeddings-instruct', trust_remote_code=True)

In [None]:
# Move the encoder to the selected device; the return value is displayed as the cell output.
model1.to(device)

In [None]:
def get_embeddings(text):
  """Embed one text with the module-level `model1` and return a unit-length vector.

  The text is encoded as a single-element batch with a fixed instruction
  prefix, the result is L2-normalised along dim 1, and the first (only) row
  is returned.

  Args:
    text: the string to embed.

  Returns:
    A 1-D NumPy array with the normalised embedding.
  """
  instructions = {"example": "получая резюме, присвой ему метку класса от 0 до 8"}
  prefix = instructions["example"] + "\nрезюме: "
  encoded = model1.encode([text], instruction=prefix)
  unit = F.normalize(encoded, p=2, dim=1)
  return unit[0].cpu().numpy()

In [None]:
# Pre-allocate one row per training text; 2048 columns — presumably the width of the vectors returned by get_embeddings, TODO confirm.
x1 = np.zeros((len(X_train), 2048))

In [None]:
# Training-split labels as a NumPy array.
y1 = np.array(y_train)

In [None]:
# Embed every training text with get_embeddings, writing one row of `x1` per text.
for row, text in enumerate(tqdm(X_train)):
    x1[row] = get_embeddings(text)

In [None]:
# Pre-allocate one row per test text; 2048 columns — presumably the width of the vectors returned by get_embeddings, TODO confirm.
x1_test = np.zeros((len(X_test), 2048))

In [None]:
# Test labels as a NumPy array.
y1_test = np.array(y_test)

In [None]:
# Embed every test text with get_embeddings, writing one row of `x1_test` per text.
for row, text in enumerate(tqdm(X_test)):
    x1_test[row] = get_embeddings(text)

In [None]:
# Pre-allocate one row per validation text; 2048 columns — presumably the width of the vectors returned by get_embeddings, TODO confirm.
x1_val = np.zeros((len(X_val), 2048))
# Validation labels as a NumPy array.
y1_val = np.array(y_val)

In [None]:
# Embed every validation text with get_embeddings, writing one row of `x1_val` per text.
for row, text in enumerate(tqdm(X_val)):
    x1_val[row] = get_embeddings(text)

In [None]:
# Fit a logistic regression on the training embeddings `x1` and labels `y1`.
# NOTE(review): this rebinds `logistic_reg`, replacing the model fitted on `x` earlier.
logistic_reg = LogisticRegression(max_iter=2000)
logistic_reg.fit(x1, y1)

In [None]:
# Predict test labels from the `x1_test` embeddings (rebinds `pred`).
pred = logistic_reg.predict(x1_test)

In [None]:
# Test-set accuracy, weighted F1 and balanced accuracy for the logistic regression fitted on `x1`.
print("Accuracy for Logistic Regression:", accuracy_score(y1_test, pred))
print("F1 score for Logistic Regression:", f1_score(y1_test, pred, average='weighted'))
print("Balanced accuracy for Logistic Regression:",balanced_accuracy_score(y1_test,pred))

In [None]:
# Confusion matrix for the current `pred`; y_test holds the same labels that y1_test was built from.
conf_matrix(y_test, pred)

In [None]:
# Per-class accuracy and F1 for the current `pred`; displayed as the cell output.
get_metrics_per_class(y_test, pred)

In [None]:
def objective(trial):
    """Optuna objective: validation accuracy of an XGBoost classifier on `x1`.

    Samples hyper-parameters from `trial`, fits an XGBClassifier on the
    notebook-level arrays `x1` / `y1` and scores it on `x1_val` / `y1_val`.
    NOTE(review): this redefines the `objective` declared earlier in the
    notebook, which trained on `x` / `y`.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    params = {
        "objective": "multi:softmax",
        "num_class": 9,
        "eval_metric": "mlogloss",
        # suggest_loguniform is deprecated; suggest_float(..., log=True)
        # samples the same log-uniform range.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 100),
        "n_estimators": trial.suggest_int("n_estimators", 50, 2000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0)
    }

    model = XGBClassifier(**params)
    model.fit(x1, y1)

    y1_pred = model.predict(x1_val)
    return accuracy_score(y1_val, y1_pred)

# Run the hyper-parameter search, maximising validation accuracy.
# The sampler is seeded so the sequence of sampled trials is repeatable;
# the default sampler is unseeded and gives a different search on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=50)

# Best parameters found by the search
print("Лучшие параметры:", study.best_params)

In [None]:
# XGBoost with fixed hyper-parameters, trained on the `x1` embeddings.
# NOTE(review): the values look like the output of an earlier search run — confirm.
xgb1 = XGBClassifier(
    objective="multi:softmax",
    num_class=9,
    eval_metric="mlogloss",
    learning_rate=0.07078025811321921,
    max_depth=45,
    n_estimators=700,
    subsample=0.5458605221291573,
    colsample_bytree=0.15268221810604315,
)
xgb1.fit(x1, y1)

# Predictions and evaluation on the test set
y_pred2 = xgb1.predict(x1_test)
print("Accuracy for XGB:", accuracy_score(y1_test, y_pred2))
print("F1 score for XGB:", f1_score(y1_test, y_pred2, average='weighted'))
print("Balanced accuracy for XGB:", balanced_accuracy_score(y1_test, y_pred2))

In [None]:
# Confusion matrix for the predictions of the XGBoost model trained on `x1`.
conf_matrix(y_test, y_pred2)

In [None]:
# Per-class accuracy and F1 for the predictions of the XGBoost model trained on `x1`; displayed as the cell output.
get_metrics_per_class(y_test, y_pred2)