In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline



In [None]:
# Directory that holds the input CSV files and the pre-trained model.
DATA_PATH = "../../data/task_2/"
# Notebook-wide "random_state" value.
STATE = 42

In [None]:
# Load the train and test splits from DATA_PATH.
train_data = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
test_data = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))
# Report the (rows, columns) shape of each split; the second frame is the
# test set, so it is labelled as such.
print("Number of rows and columns in the train data set:", train_data.shape)
print("Number of rows and columns in the test data set:", test_data.shape)
train_data.head()

In [None]:
# Summarize column dtypes and non-null counts of the freshly loaded training frame.
train_data.info()

In [None]:
# List the distinct target values found in the 'rate' column.
train_data['rate'].unique()

In [None]:
# Histogram of the target column; the trailing semicolon hides the plot object's repr.
train_data['rate'].hist(); 

In [None]:
# Keep the untouched labels in 'rate_raw' so this cell is safe to re-run.
# Without it, a second run would fit the encoder on already-encoded integers
# and le.inverse_transform() would no longer return the original labels.
if 'rate_raw' not in train_data.columns:
    train_data['rate_raw'] = train_data['rate']

le = LabelEncoder()

# Encode the labels as integer class ids and store them compactly as uint8
# (data type optimization).
train_data['rate'] = le.fit_transform(train_data['rate_raw']).astype('uint8')

train_data.head()

In [None]:
# Re-inspect the frame after label encoding ('rate' has been cast to uint8).
train_data.info()

## Препроцессинг


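Pre-trained word2vec models considered (column names are reconstructed from the flattened repository table — verify against the source; download links omitted):

| ID | Vector size | Window | Corpus | Vocabulary size | Algorithm | Lemmatization |
|---|---|---|---|---|---|---|
| 65 | 100 | 10 | Russian CoNLL17 corpus | 3338424 | Word2Vec Continuous Skipgram | False |
| 204 | 300 | 2 | Russian National Corpus; Russian Wikipedia dump of December 2018; Russian News from Dialogue Evaluation 2020; Araneum Russicum Maximum | 998459 | Gensim Continuous Bag-of-Words | True (all four corpora) |

This notebook loads model 65 (`65/model.bin`).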

In [None]:
import re
import nltk
import pymorphy2
from nltk.corpus import stopwords

# Path to the pre-trained model. os.path.join matches how the CSV paths are
# built and works whether or not DATA_PATH ends with a separator.
path_to_model = os.path.join(DATA_PATH, '65', 'model.bin')

# Load the pre-trained vectors (may take a lot of time and memory).
word_vectors = KeyedVectors.load_word2vec_format(path_to_model, binary=True)

# Fetch the NLTK resources used by this notebook: the stop-word lists and the
# Punkt tokenizer data required by nltk.word_tokenize ('punkt_tab' is the
# resource name used by newer NLTK releases).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Russian stop words, stored as a set so that the per-token membership test
# in preprocess_text is O(1) instead of a list scan.
russian_stopwords = set(stopwords.words('russian'))

# pymorphy2 analyzer used for lemmatization.
morph = pymorphy2.MorphAnalyzer()

def preprocess_text(text):
    """Normalize a raw text into a space-separated string of lemmas.

    Steps: remove punctuation, lowercase, tokenize with NLTK (Russian),
    drop Russian stop words, lemmatize each remaining token with pymorphy2.

    Args:
        text: Raw text. Any non-string value (for example None or a float
            NaN standing in for a missing value) is treated as empty text
            instead of raising TypeError.

    Returns:
        str: Lemmas joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    # Remove every character that is neither a word character nor whitespace,
    # then normalize the case.
    text = re.sub(r'[^\w\s]', '', text).lower()

    # Tokenization
    words = nltk.word_tokenize(text, language="russian")

    # Stop words are filtered on the surface form (before lemmatization);
    # the first pymorphy2 parse supplies the normal form.
    return ' '.join(
        morph.parse(word)[0].normal_form
        for word in words
        if word not in russian_stopwords
    )

## Получение вектора для слова

In [None]:
# Look up the embedding of a sample word and show its first components.
word = 'компьютер'
if word not in word_vectors:
    print(f"Слово '{word}' не найдено в модели.")
else:
    vector = word_vectors[word]
    # Only the first 10 elements are printed, as an example.
    print(f"Вектор для слова '{word}': {vector[:10]}")

## Поиск похожих слов

In [None]:
# Find the words most similar to `word`. The lookup is guarded the same way
# as the single-vector lookup, so an out-of-vocabulary word produces a
# message instead of a KeyError.
if word in word_vectors:
    similar_words = word_vectors.most_similar(word)
    print(f"Слова, похожие на слово '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity}")
else:
    print(f"Слово '{word}' не найдено в модели.")


## Векторизация текста с использованием Word2Vec

In [None]:
def document_vector(word_vectors, doc):
    """Build a document embedding as the mean of its known word vectors.

    Args:
        word_vectors: Embedding container exposing ``key_to_index``,
            item lookup by word and ``vector_size``.
        doc: Whitespace-separated text.

    Returns:
        The element-wise mean of the vectors of all tokens present in the
        vocabulary, or a zero vector of length ``vector_size`` when no token
        of the document is known.
    """
    vocabulary = word_vectors.key_to_index
    known_tokens = [token for token in doc.split() if token in vocabulary]

    # Nothing to average: fall back to an all-zero vector.
    if not known_tokens:
        return np.zeros(word_vectors.vector_size)

    return np.mean([word_vectors[token] for token in known_tokens], axis=0)


## Предварительная обработка текста

In [None]:
# Clean every training text. The function is passed directly (not wrapped in
# a list) so that apply() returns a Series rather than a one-column DataFrame.
train_data['clear_text'] = train_data['text'].apply(preprocess_text)

# Drop rows whose cleaned text repeats, keeping the last occurrence. The
# explicit copy makes the result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
train_data = train_data.drop_duplicates(subset='clear_text', keep='last').copy()

# One averaged word2vec vector per cleaned document.
train_data['vector'] = train_data['clear_text'].apply(lambda doc: document_vector(word_vectors, doc))

In [None]:
# Stack the per-document vectors into one feature array and pull out the
# encoded targets as a NumPy array.
train_vectors = np.array(train_data['vector'].tolist())
train_labels = train_data['rate'].to_numpy()

In [None]:
# Apply the same cleaning and vectorization to the test set. The function is
# passed directly (not wrapped in a list) so that apply() returns a Series
# rather than a one-column DataFrame.
test_data['clear_text'] = test_data['text'].apply(preprocess_text)
test_data['vector'] = test_data['clear_text'].apply(lambda doc: document_vector(word_vectors, doc))

In [None]:
# Stack the per-document test vectors into one feature array.
test_vectors = np.array(test_data['vector'].tolist())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 2.5% of the training vectors for validation. The split is seeded
# with the notebook-wide STATE instead of a repeated literal.
# NOTE: the split is not stratified; pass stratify=train_labels to keep the
# class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    train_vectors,
    train_labels,
    test_size=0.025,
    random_state=STATE,
    shuffle=True,
)

In [None]:
from catboost import Pool

# Wrap the train and hold-out splits into CatBoost data pools
# (features as a DataFrame, targets as labels).
train_pool, valid_pool = (
    Pool(data=pd.DataFrame(features), label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)


# Модель

In [None]:
from catboost import CatBoostClassifier
def fit_model(train_pool, test_pool, **kwargs):
    """Train a CatBoost multi-class classifier and return the result of fit().

    Args:
        train_pool: Training data passed to ``model.fit``.
        test_pool: Evaluation data passed to ``model.fit`` as ``eval_set``.
        **kwargs: Extra ``CatBoostClassifier`` parameters. They are merged
            over the defaults below, so a caller can both add parameters
            (e.g. ``learning_rate``) and override a default (e.g.
            ``iterations``) without a duplicate-keyword TypeError.

    Returns:
        Whatever ``model.fit`` returns for the configured classifier.
    """
    params = {
        'task_type': 'CPU',
        'iterations': 500,
        'eval_metric': 'TotalF1',
        'od_type': 'Iter',
        # NOTE(review): od_wait equals iterations, so the overfitting
        # detector presumably never stops training early — confirm intent.
        'od_wait': 500,
        'depth': 10,
        'loss_function': 'MultiClass',
        'random_seed': 55,
        'l2_leaf_reg': 5.0,
        'border_count': 32,
    }
    # Caller-supplied values take precedence over the defaults.
    params.update(kwargs)

    model = CatBoostClassifier(**params)

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=5,
        plot=True,
        use_best_model=True,
    )

# Обучение

In [None]:
# Train on the train pool and evaluate on the hold-out pool, with a learning
# rate of 0.35 and extra options forwarded to CatBoostClassifier via **kwargs.
# NOTE(review): `dictionaries` and `feature_calcers` look like text-feature
# settings, while the pools here are built from numeric vectors — confirm
# these options have any effect.
model = fit_model(train_pool, valid_pool, learning_rate=0.35,
                  dictionaries = [{
                      'dictionary_id':'Word',
                      'max_dictionary_size': '50000'
                  }],
                 feature_calcers = ['BoW:top_tokens_count=10000'])

# Предикт

# Формирование решения

In [None]:
# Predict encoded class ids for the test vectors. The result is flattened to
# a 1-D integer array because multi-class predictions may come back
# column-shaped (n, 1), which LabelEncoder.inverse_transform only accepts
# with a DataConversionWarning.
y_pred = np.asarray(model.predict(test_vectors)).ravel().astype(int)
# Map the encoded ids back to the original 'rate' labels.
pred_labels = le.inverse_transform(y_pred)

print(pred_labels)

# Fill the submission template with the predicted labels.
sample_submission = pd.read_csv(os.path.join(DATA_PATH, "sample_submission.csv"))
sample_submission["rate"] = pred_labels
sample_submission.head()


In [None]:
# Write the submission next to the input data; the path is built with
# os.path.join, like every other CSV path in this notebook.
sample_submission.to_csv(os.path.join(DATA_PATH, "submission.csv"), index=False)