In [None]:
import os
import pickle

import lightgbm
import numpy as np
import pandas as pd
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw table and derive a single class label per row.
# Columns from position 2 onward are treated as per-emoji score columns
# (assumes the first two columns are non-emoji fields such as an id and
# the text -- TODO confirm against data_all.csv).
data = pd.read_csv("data_all.csv")
emoji_columns = data.columns[2:]
# The label is the name of the emoji column holding the row-wise maximum;
# np.argmax picks the first column on ties.
label_positions = np.argmax(data[emoji_columns].to_numpy(), axis=1)
data["label"] = emoji_columns.to_numpy()[label_positions]

# Count every label. The counts must cover ALL labels (not only the top few),
# otherwise rare labels below the threshold are never detected or removed.
min_label_count = 200
emoji_counts = data["label"].value_counts()
poor_presented_emojis = pd.Series(emoji_counts.index[emoji_counts < min_label_count])

# Remove rows labelled with an under-represented emoji together with the
# corresponding emoji columns, without mutating a filtered frame in place.
data = data[~data["label"].isin(poor_presented_emojis)].drop(
    columns=list(poor_presented_emojis)
)
# Keep the counts in sync with the filtered data; value_counts() order
# (most frequent first) is preserved by the boolean mask.
emoji_counts = emoji_counts[~emoji_counts.index.isin(poor_presented_emojis)]

In [None]:
# Text normalisation used for training: every character outside the А-я
# range becomes a space, the result is lower-cased, and each whitespace
# separated token is passed through the stemmer before being re-joined.
# NOTE(review): 'ё'/'Ё' lie outside the А-я code-point range, so they are
# replaced by spaces too -- confirm this is intended.
# NOTE(review): PorterStemmer targets English; confirm it has the intended
# effect on Cyrillic tokens. Any change here must be mirrored wherever the
# saved vectorizer/models are used for inference.
vectorizer = TfidfVectorizer()
stemmer = PorterStemmer()
cleaned_texts = data["text"].str.replace("[^А-я]", " ", regex=True).str.lower()
texts = [
    " ".join(stemmer.stem(token) for token in document.split())
    for document in cleaned_texts
]

In [None]:
# Learn the TF-IDF vocabulary/IDF weights and vectorize the corpus in one pass.
# NOTE(review): the vectorizer is fitted on the full `texts` corpus, i.e.
# before any train/test split, so held-out rows contribute to the IDF
# statistics -- confirm this is acceptable for the reported metrics.
train_vectors = vectorizer.fit_transform(texts)

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore every metric computed from it) reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_vectors, data["label"], test_size=0.2, random_state=42
)

In [None]:
# The "default" emoji is the first entry of emoji_counts; since those counts
# come from value_counts() (sorted by descending frequency), this is the
# most frequent remaining label.
default_emoji = emoji_counts.index[0]

In [None]:
# Train two LightGBM classifiers on the same features:
#   * `model`         -- multiclass, predicts the emoji label directly;
#   * `model_default` -- binary, predicts whether the label is the default
#                        emoji (class weights balanced).
n_estimators = 25
learning_rate = 0.03
model = lightgbm.LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
model_default = lightgbm.LGBMClassifier(class_weight="balanced")
y_train_default = y_train == default_emoji
model.fit(X_train, y_train)
model_default.fit(X_train, y_train_default)

# Evaluate on the held-out split.
preds = model.predict(X_test)
preds_default = model_default.predict(X_test)
# Binary "is default emoji" view of the multiclass predictions and the truth.
preds_result = preds == default_emoji
true_result = y_test == default_emoji

# Line 1: multiclass accuracy, then default-vs-rest accuracy of `model`.
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_test, preds), accuracy_score(true_result, preds_result))
# Line 2: accuracy and recall of the dedicated binary model, which was
# previously trained and saved without ever being evaluated.
print(
    accuracy_score(true_result, preds_default),
    recall_score(true_result, preds_default, pos_label=True),
)

In [None]:
# Build a one-message sample and normalise it with the same steps as the
# training corpus: non А-я characters -> spaces, lower-case, stem each token.
sample_messages = pd.Series(["тестовое сообщение"])
cleaned_messages = sample_messages.str.replace("[^А-я]", " ", regex=True).str.lower()
texts_test = [
    " ".join(map(stemmer.stem, message.split())) for message in cleaned_messages
]

In [None]:
# Vectorize the sample with the already-fitted TF-IDF vocabulary and show
# what each classifier predicts for it (multiclass label first, then the
# binary "is default emoji" flag).
test_vectors = vectorizer.transform(texts_test)
predictions_test = model.predict(test_vectors)
preds_default = model_default.predict(test_vectors)
for prediction in (predictions_test, preds_default):
    print(prediction)

In [None]:
# Persist the fitted artifacts so they can be loaded for inference.
# The vectorizer is saved alongside the models because predictions are only
# valid for features produced by this exact fitted vocabulary.
artifacts = {
    "models/LGBM_model_vectors": model,
    "models/LGBM_model_vectors_default": model_default,
    "models/vectorizer": vectorizer,
}
# Create the target directory up front; otherwise a fresh checkout fails
# with FileNotFoundError only after all the training work has been done.
os.makedirs("models", exist_ok=True)
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)