In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Dense, RepeatVector, LSTM, Activation
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem.porter import *
import nltk
from catboost import CatBoostClassifier
from nltk.stem import WordNetLemmatizer
from itertools import permutations, product
import pickle

In [None]:
# !python -m spacy download ru_core_news_lg

In [None]:
import subprocess

# Download and unzip wordnet unless nltk can already locate the archive.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find reports a missing resource with LookupError. Catching only
    # that lets interrupts and unrelated failures surface instead of silently
    # triggering a download.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    subprocess.run(['unzip', '/kaggle/working/corpora/wordnet.zip',
                    '-d', '/kaggle/working/corpora'])
    # Register the download location so nltk searches it for resources.
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

In [None]:
# Lowercase Russian alphabet: the contiguous code-point range 'а'..'я',
# followed by 'ё', which lies outside that range.
ALPHABET = list(map(chr, range(ord('а'), ord('я') + 1)))
ALPHABET.append('ё')
num_classes = 31
# MAXLEN = 30

In [None]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` with the trailing newline removed.

    The file is decoded as UTF-8 explicitly so that Cyrillic content is read
    the same way regardless of the platform's default encoding.
    NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


test = _read_lines('/kaggle/input/rucode-d-data/public_test_stresses.txt')
train = _read_lines('/kaggle/input/rucode-d-data/train_stresses_labels.txt')
sample = _read_lines('/kaggle/input/rucode-d-data/sample_submission.txt')

In [None]:
# Class label of each training string: the index of its '^' marker.
target = [word.index('^') for word in train]


In [None]:
# Show the distinct training strings together with the shape of that array.
np.unique(train), np.unique(train).shape

In [None]:
# x: distinct target values in sorted order; y: how many times each occurs.
x, y = np.unique(target, return_counts=True)
x, y

In [None]:
# Occurrence count (y) of every target value (x), drawn as a line with point markers.
plt.plot(x, y, '-o')
plt.xticks(list(range(1, 32, 2)))  # ticks on the odd positions 1..31
plt.show()

In [None]:
# One row per training string: the raw string and its target class (index of '^').
df = pd.DataFrame({'sample': train, 'class': target})
df

In [None]:
def to_slogi(word, gl = ['а', 'у', 'о', 'и', 'э', 'ы', 'я', 'ю', 'е', 'ё']):
    """Split ``word`` into chunks that each end on a vowel.

    Characters are accumulated until one of them is found in ``gl``; that
    closes the current chunk. Characters left over after the last vowel are
    appended to the final chunk. A word without any vowel yields a single
    chunk holding the whole word, and an empty word yields an empty list.

    Args:
        word: sequence of characters to split.
        gl: characters treated as vowels.

    Returns:
        List of chunk strings in their original order.
    """
    chunks = []
    pending = []
    for ch in word:
        pending.append(ch)
        if ch in gl:
            chunks.append(''.join(pending))
            pending = []

    tail = ''.join(pending)
    if tail:
        if chunks:
            # Trailing non-vowel characters belong to the last chunk.
            chunks[-1] += tail
        else:
            chunks.append(tail)

    return chunks

In [None]:
def _strip_stem(word, stem):
    """Return the part of ``word`` that follows its leading ``stem``.

    If ``word`` does not start with ``stem`` the whole word is returned.
    """
    return word[len(stem):] if word.startswith(stem) else word


def preproc(df, x_name = 'sample'):
    """Add hand-crafted word features to ``df`` and return it.

    The frame is modified in place. Columns are appended in this order (code
    that consumes the frame slices it by position, so the order matters):
    ``clear_<x_name>``, ``prefix``, ``suffix``, ``len_sample``, ``is_yo``,
    ``first_gl``, ``cnt_gl``, ``cnt_sogl``, ``ind_glgl``, one ``<k>_cnt_alf``
    column per letter of the internal alphabet, and ``slogi``.

    Args:
        df: frame with a string column ``x_name``; a '^' marker inside the
            strings is ignored for feature purposes.
        x_name: name of the column holding the raw words.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    gl = ['а', 'у', 'о', 'и', 'э', 'ы', 'я', 'ю', 'е', 'ё']
    sogl = ['б', 'в', 'г', 'д', 'ж', 'з', 'й', 'к', 'л', 'м', 'н', 'п', 'р', 'с', 'т', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ь', 'ъ']
    alphabet = gl + sogl
    stemmer = nltk.SnowballStemmer('russian')
    clear_name = f'clear_{x_name}'

    # Stemming replaces 'ё' with 'е', so the cleaned word must not contain 'ё'
    # either or the stem would not match it. Lowercasing happens first so that
    # an uppercase 'Ё' is normalised as well.
    df[clear_name] = df[x_name].apply(lambda x: x.replace('^', '').lower().replace('ё', 'е'))
    df['prefix'] = df[clear_name].apply(stemmer.stem)
    # Only the leading stem is cut off; a second occurrence of the stem later
    # in the word stays part of the suffix.
    df['suffix'] = [_strip_stem(word, stem) for word, stem in zip(df[clear_name], df['prefix'])]
    df['len_sample'] = df[clear_name].apply(len)
    # Uses the raw column: 'ё' has already been removed from the cleaned one.
    df['is_yo'] = df[x_name].apply(lambda x: int('ё' in x))
    # x[:1] instead of x[0] so that an empty string gives 0 rather than raising.
    df['first_gl'] = df[x_name].apply(lambda x: int(x[:1] in gl))
    df['cnt_gl'] = df[clear_name].apply(lambda x: sum(int(el in gl) for el in x))
    df['cnt_sogl'] = df[clear_name].apply(lambda x: sum(int(el in sogl) for el in x))
    # Index of the first pair of adjacent vowels, or -1 when there is none.
    df['ind_glgl'] = df[clear_name].apply(
        lambda x: next((i for i in range(len(x) - 1) if x[i] in gl and x[i + 1] in gl), -1)
    )
    for letter_ind, letter in enumerate(alphabet):
        df.loc[:, f'{letter_ind}_cnt_alf'] = df[clear_name].apply(lambda x, letter=letter: x.count(letter))

    df['slogi'] = df[clear_name].apply(to_slogi)

    return df

In [None]:
# Add the hand-crafted feature columns to the training frame.
df = preproc(df)
df

In [None]:
# Every distinct chunk that occurs in any row of the 'slogi' column.
all_slogs = {chunk for chunks in df['slogi'] for chunk in chunks}

In [None]:
class MyEncoder:
    """Integer encoders for whole strings, syllables and single letters.

    Every encoding reserves 0 for "unknown or out of range"; known items are
    numbered from 1.
    """

    def __init__(self, vocab, vocab_slogi):
        """Build the lookup tables.

        Args:
            vocab: strings indexed by ``encode`` / ``decode``; duplicates are collapsed.
            vocab_slogi: syllable strings indexed by ``encode_slogi``.
        """
        self.vocab_slogi = {name: i + 1 for i, name in enumerate(sorted(np.unique(list(vocab_slogi))))}
        # Indices follow sorted order, so close numbers mean alphabetically close strings.
        self.words_to_ind = {name: i + 1 for i, name in enumerate(sorted(np.unique(vocab)))}
        self.ind_to_words = {ind: name for name, ind in self.words_to_ind.items()}
        self.alphabet = [chr(i) for i in range(ord('а'),ord('я')+1)] + ['ё']
        # Number of positions encoded by encode_slogi / encode_letters.
        self.str_maxlen = 30

    def encode_slogi(self, df: pd.DataFrame, column_name: str) -> pd.DataFrame:
        """Add columns ``1_slog`` .. ``<str_maxlen>_slog`` to ``df`` and return it.

        Column ``k`` holds the vocabulary index of the k-th element of
        ``df[column_name]``, or 0 when the element is missing or unknown.
        """
        for i in range(self.str_maxlen):
            df.loc[:, f'{i + 1}_slog'] = df[column_name].apply(
                lambda x, i=i: self.vocab_slogi.get(x[i], 0) if i < len(x) else 0
            )
        return df

    def encode(self, arr: pd.Series) -> pd.Series:
        """Map each string to its vocabulary index; strings not in the vocabulary map to 0."""
        return arr.apply(lambda x: self.words_to_ind.get(x, 0))

    def decode(self, arr: pd.Series) -> pd.Series:
        """Map vocabulary indices back to their strings.

        Raises:
            KeyError: for an index that is not in the vocabulary, including
                the 0 that ``encode`` produces for unknown strings.
        """
        return arr.apply(lambda x: self.ind_to_words[x])

    def encode_letters(self, df: pd.DataFrame, column_name: str) -> pd.DataFrame:
        """Add columns ``1_letter`` .. ``<str_maxlen>_letter`` to ``df`` and return it.

        Column ``k`` holds the 1-based alphabet position of the k-th character
        of ``df[column_name]``, or 0 when the string is shorter or the
        character is not in the alphabet.
        """
        # Built per call from self.alphabet, so instances keep the same attributes.
        letter_to_ind = {letter: ind + 1 for ind, letter in enumerate(self.alphabet)}
        for i in range(self.str_maxlen):
            df.loc[:, f'{i + 1}_letter'] = df[column_name].apply(
                lambda x, i=i: letter_to_ind.get(x[i], 0) if i < len(x) else 0
            )
        return df

In [None]:
# Vocabulary for encode(): every 'suffix' and 'prefix' value of the training frame plus the empty string.
enc = MyEncoder(list(df['suffix']) + list(df['prefix']) + [''], all_slogs) # do not forget the empty string

In [None]:
# NOTE(review): the column names look swapped relative to their sources:
# 'enc_pref' receives the encoded 'suffix' column and 'enc_suff' the encoded
# 'prefix' column. The test frame is encoded with the same pairing, so any
# rename has to be applied to both places together.
df['enc_pref'] = enc.encode(df['suffix'])
df['enc_suff'] = enc.encode(df['prefix'])
df

In [None]:
# NOTE(review): 'clear_sample' holds plain strings, so encode_slogi looks up
# single characters here rather than the syllable lists stored in 'slogi'.
# Confirm whether 'slogi' was the intended column; the test frame is encoded
# the same way, so both calls would have to change together.
df = enc.encode_slogi(df, 'clear_sample')
df

In [None]:
# Add the per-position letter codes ('1_letter', '2_letter', ...) of the cleaned word.
df = enc.encode_letters(df, 'clear_sample')
df

In [None]:
# Feature columns removed from both the training and the test frame before
# they are passed to the model.
features_to_drop = [
    '19_letter', '22_slog', '24_letter', '23_letter',
    '27_letter', '25_letter', '26_letter', '27_slog',
    '28_letter', '28_slog', '29_letter', '26_slog',
    '24_slog', '25_slog', '29_slog', '30_slog',
    '22_letter', '21_letter', '9_cnt_alf', '20_slog',
    '21_slog', '23_slog', '30_letter',
]

In [None]:
# Remove the syllable-list column and the excluded feature columns.
df = df.drop(columns=['slogi'] + features_to_drop)

In [None]:
# Inspect the training frame after the columns were dropped.
df

In [None]:
# Names of all columns currently in the training frame.
df.columns.tolist()

In [None]:
# df['sample_len'] = df['prefix'].apply(lambda x: len(str(x)))
# df

In [None]:
# Preview the columns passed to the model: everything from the sixth column onward.
df.iloc[:, 5:]

In [None]:
# df[df['sample_len'] < df['class']]

In [None]:
# Tree depth and tree count are tied together by the size budget
#   number of model parameters = nt * 2 ** (dh + 1) <= 1_000_000
dh = 10
nt = 1_000_000 // 2 ** (dh + 1)
model = CatBoostClassifier(
    loss_function='MultiClass',
    learning_rate=0.35,
    depth=dh,
    num_trees=nt,
    random_state=7575,
)

# Features are every column from the sixth onward; the label is 'class'.
model.fit(df.iloc[:, 5:], df['class'])

In [None]:
# Wrap the test strings in a frame and add the same hand-crafted features as for training.
test_df = preproc(pd.DataFrame({'sample': test}))

In [None]:
# NOTE(review): the column names look swapped relative to their sources:
# 'enc_pref' receives the encoded 'suffix' column and 'enc_suff' the encoded
# 'prefix' column. The training frame uses the same pairing, so any rename
# has to be applied to both places together.
test_df['enc_pref'] = enc.encode(test_df['suffix'])
test_df['enc_suff'] = enc.encode(test_df['prefix'])
test_df = enc.encode_letters(test_df, 'clear_sample')
test_df

In [None]:
# NOTE(review): as for the training frame, the string column 'clear_sample' is
# passed, so encode_slogi looks up single characters rather than the syllable
# lists stored in 'slogi'. Confirm which column was intended.
test_df = enc.encode_slogi(test_df, 'clear_sample')
# Remove the syllable-list column and the excluded feature columns.
test_df = test_df.drop(columns=['slogi'] + features_to_drop)

In [None]:
# Select the model's features by name, in the order recorded during training.
# The test frame creates its letter columns before its syllable columns, the
# reverse of the training frame, so a positional slice would not line the
# columns up with the training layout.
ans = model.predict(test_df[model.feature_names_])
ans

In [None]:
# Show the importance scores reported by the fitted model.
model.get_feature_importance()

In [None]:
def save_res(test, ans):
    """Insert a '^' marker into every string at its predicted position.

    Despite the name, nothing is written to disk; the marked strings are
    returned.

    Args:
        test: iterable of strings.
        ans: iterable of rows aligned with ``test``; element ``[0]`` of each
            row is the insertion index. An index past the end of the string
            is clamped to the string length.

    Returns:
        List of strings with '^' inserted.
    """
    marked = []
    for word, row in zip(test, ans):
        pos = min(row[0], len(word))
        marked.append(word[:pos] + '^' + word[pos:])
    return marked

In [None]:
# Build the marked strings for the test set and preview the first ten.
final_res = save_res(test, ans)
final_res[:10]

In [None]:
# Write one marked string per line. UTF-8 is requested explicitly so that the
# Cyrillic output does not depend on the platform's default encoding.
with open('output_10.txt', 'w', encoding='utf-8') as f:
    f.writelines(el + '\n' for el in final_res)

In [None]:
# Persist the fitted model to disk.
model.save_model('model_enc_let_10.cbm')

In [None]:
# Persist the fitted encoder next to the model using the highest pickle protocol.
with open('myencoder_10.pkl', 'wb') as outp:
    pickle.dump(enc, outp, pickle.HIGHEST_PROTOCOL)


In [None]:
# Round-trip check: reload the pickled encoder and print a few vocabulary entries.
# pickle.load can execute code embedded in the file, so only load files that
# this notebook produced itself.
with open('myencoder_10.pkl', 'rb') as inp:
    x = pickle.load(inp)
    print(list(x.words_to_ind.items())[:5])