In [None]:
import pandas as pd
import numpy as np
from nltk.stem.porter import *

In [None]:
# Load the full dataset. Later cells read its 'text' column as model input and
# use the remaining columns (minus 'Unnamed: 52' and 'total') as per-class targets.
data = pd.read_csv('data_all.csv')
# (emoji, count) pairs. The position of a pair in this list is used as the class id
# below. NOTE(review): presumably the order matches the target columns of the
# dataset and the counts are per-emoji reaction totals -- confirm against the source.
emoji_stats = [
    ('👍', 14189),
    ('\U0001fae1', 76),
    ('🙏', 157),
    ('🔥', 5659),
    ('🥰', 1186),
    ('👌', 69),
    ('🤓', 58),
    ('🤝', 3),
    ('👎', 2049),
    ('🤬', 182),
    ('😁', 2405),
    ('🤡', 1453),
    ('🏆', 290),
    ('👏', 320),
    ('💯', 160),
    ('❤', 4263),
    ('🌚', 254),
    ('🤨', 293),
    ('🥴', 274),
    ('🤩', 238),
    ('😍', 64),
    ('🤣', 348),
    ('😢', 2800),
    ('💩', 1169),
    ('🤯', 598),
    ('❤\u200d🔥', 596),
    ('🐳', 457),
    ('🤮', 1203),
    ('🤗', 2),
    ('😇', 20),
    ('🤔', 964),
    ('🖕', 114),
    ('🥱', 93),
    ('😈', 213),
    ('🕊', 118),
    ('🍌', 476),
    ('🌭', 234),
    ('💋', 219),
    ('⚡', 83),
    ('🍓', 113),
    ('🍾', 287),
    ('💔', 38),
    ('😱', 442),
    ('🎉', 731),
    ('😐', 76),
    ('✍', 34),
    ('😭', 116),
    ('🆒', 31),
    ('🗿', 7),
    ('👀', 48),
    ('💅', 6),
    ('🎄', 66),
    ('☃', 3),
    ('👨\u200d💻', 2),
    ('👻', 7),
    ('🙊', 1),
    ('🤪', 4),
    ('😨', 2),
    ('💊', 2),
    ('😴', 2),
]

# Map class id (list position) -> count. Built from the list itself rather than a
# hard-coded length, so adding or removing an entry above cannot silently drop a
# class or raise IndexError.
class_weights = {
    emoji_id: count for emoji_id, (_emoji, count) in enumerate(emoji_stats)
}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Text pipeline objects; `stemmer` is reused by the inference cell further down.
vectorizer = TfidfVectorizer()
stemmer = PorterStemmer()

# Replace every character outside the А-я range with a space, then lowercase.
# NOTE(review): the А-я range (U+0410..U+044F) does not contain Ё/ё, so those
# letters are turned into spaces and split the words they occur in.
cleaned_texts = data['text'].str.replace('[^А-я]', ' ', regex=True).str.lower()

# Stem each whitespace-separated token and glue the document back together.
# NOTE(review): PorterStemmer implements English suffix rules; on Cyrillic-only
# tokens it presumably leaves words unchanged -- consider a Russian stemmer.
texts = [
    ' '.join(map(stemmer.stem, document.split()))
    for document in cleaned_texts
]

In [None]:
# Learn the TF-IDF vocabulary from the preprocessed texts and vectorize them in one call.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# below, so the held-out rows also contribute to the vocabulary and IDF statistics.
train_vectors = vectorizer.fit_transform(texts)

In [None]:
import lightgbm
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [None]:
# Target matrix: every column of the dataset except the raw text and the
# 'Unnamed: 52' / 'total' columns.
target = data.drop(columns=['text', 'Unnamed: 52', 'total'])

# The label of a row is the position of its largest target value
# (argmax returns the first position when several values tie).
target_np = np.array(target).argmax(axis=1)


In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that
# Restart Kernel -> Run All reproduces the same split and the same metrics.
X_train, X_test, y_train, y_test = train_test_split(
    train_vectors, target_np, test_size=0.2, random_state=42
)

In [None]:
# Drop the weights of classes that never occur in the training labels.
# NOTE(review): presumably needed because the classifier rejects a class_weight
# dict that mentions labels absent from y_train -- confirm.
labels_in_train = set(y_train.tolist())
bad_ids = [
    class_id for class_id in class_weights if class_id not in labels_in_train
]
for class_id in bad_ids:
    class_weights.pop(class_id)

In [None]:
# Hyperparameters of the multiclass model.
n_estimators = 25
learning_rate = 0.03

# Multiclass model: predicts the class index, with per-class weights taken
# from `class_weights`.
model = lightgbm.LGBMClassifier(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    class_weight=class_weights,
)
model.fit(X_train, y_train)

# Binary "class 0 vs. everything else" model with library-default
# hyperparameters and balanced class weights.
model_default = lightgbm.LGBMClassifier(class_weight='balanced')
y_train_default = (y_train == 0)
model_default.fit(X_train, y_train_default)

preds = model.predict(X_test)
preds_default = model_default.predict(X_test)

# Collapse the multiclass predictions to the same "class 0 vs. rest" question.
preds_result = preds == 0
true_result = y_test == 0

# scikit-learn metrics take (y_true, y_pred). The previous call passed the
# predictions first, which made recall_score report precision instead of recall.
print(accuracy_score(y_test, preds), recall_score(true_result, preds_result))

In [None]:
# Ad-hoc messages to classify; they go through the same cleaning and stemming
# steps as the training texts (non А-я characters -> space, lowercase, stem).
raw_messages = pd.Series(['тестовое сообщение'])
cleaned_messages = raw_messages.str.replace('[^А-я]', ' ', regex=True).str.lower()
texts_test = [
    ' '.join(map(stemmer.stem, message.split()))
    for message in cleaned_messages
]

In [None]:
# Vectorize the ad-hoc messages with the already-fitted vectorizer and run both models.
test_vectors = vectorizer.transform(texts_test)
preds_default = model_default.predict(test_vectors)
predictions_test = model.predict(test_vectors)

# Translate predicted class indices into the corresponding target column names.
print([target.columns[class_index] for class_index in predictions_test])
print(preds_default)

In [None]:
import os
import pickle

# Persist both models and the fitted vectorizer for later inference.
# NOTE: pickle files can execute arbitrary code when loaded -- only load these
# artifacts from a trusted location.
os.makedirs('models', exist_ok=True)  # a fresh checkout may not have the directory yet

artifacts = {
    'models/LGBM_model_vectors': model,
    'models/LGBM_model_vectors_default': model_default,
    'models/vectorizer': vectorizer,
}
for artifact_path, artifact in artifacts.items():
    # `with` closes the handle even if pickling fails, and each file is only
    # opened (and truncated) right before it is written.
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)