In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, learning_curve, cross_val_score, train_test_split
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.auto import tqdm

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Preparing

In [3]:
# Training table; the first CSV column becomes the index.
df = pd.read_csv("datasets/hse_data_science_hack/train.csv", index_col=[0])

# Pre-lemmatized sentences, read line by line (each entry keeps its trailing newline).
# Presumably line i corresponds to row i of `df` -- verify against how the corpus was produced.
with open("corpus_lemmatized.txt", encoding="utf-8") as corpus_file:
    lemmatized = list(corpus_file)

### Data Preprocessing

In [None]:
# Russian stop-word list from the NLTK corpus; later cells use it to drop tokens
# from both the training sentences and the submission texts.
stop_words = stopwords.words("russian")

In [None]:
# Build the cleaned text column: tokenize every lemmatized line, drop stop words,
# and join the surviving tokens back with single spaces.
#
# Lines come from `readlines()`, so the last token of each line still carries "\n".
# Splitting on arbitrary whitespace removes it *before* the stop-word test; the
# previous version stripped tokens only after the test, so a stop word at the end
# of a line was never filtered out.
assert len(lemmatized) == len(df), "corpus_lemmatized.txt must contain one line per row of df"

stop_word_set = set(stop_words)  # constant-time membership instead of scanning the list per token
df["sentence_lemmatized"] = [
    " ".join(token for token in line.split() if token not in stop_word_set)
    for line in lemmatized
]

### Target Encoding (Categories)

In [None]:
# Keep only rows whose primary category is not the "?" placeholder,
# then display how many rows each remaining primary category has.
has_known_primary = df["1category"] != "?"
df = df[has_known_primary]
df["1category"].value_counts()

In [None]:
# One label list per row: always the primary category, plus the secondary category
# when it is present.
#
# The previous test `y in df["2category"].value_counts()` is an index lookup on the
# counts Series. `value_counts()` drops missing values by default, so the lookup was
# true exactly when `y` was not missing -- but it rebuilt the whole counts table on
# every iteration. `pd.notna` expresses the same condition directly.
y_categories = [
    [primary, secondary] if pd.notna(secondary) else [primary]
    for primary, secondary in zip(df["1category"].values, df["2category"].values)
]

In [None]:
# Learn the set of labels from the per-row label lists, then encode every row
# with the fitted binarizer. The last line displays the learned class order,
# which fixes the column order of `categories_encoded`.
mlb = MultiLabelBinarizer().fit(y_categories)
categories_encoded = mlb.transform(y_categories)
mlb.classes_

### Vectorizing

In [None]:
# Hold out 10% of the rows, stratified on the encoded label rows.
# The split is seeded: the fitted model is pickled and reloaded further down, and
# with an unseeded split a fresh kernel would produce a different test set, so the
# reloaded model could be evaluated on rows it was trained on.
RANDOM_STATE = 42
train_x, test_x, train_y, test_y = train_test_split(
    df["sentence_lemmatized"],
    categories_encoded,
    test_size=0.1,
    stratify=categories_encoded,
    random_state=RANDOM_STATE,
)

In [None]:
# Fit the TF-IDF vectorizer on the training sentences only, then reuse the fitted
# vectorizer to transform the hold-out sentences (nothing is fitted on test data).
vectorizer = TfidfVectorizer()
train_x_vct = vectorizer.fit_transform(train_x)
test_x_vct = vectorizer.transform(test_x)

### Experiments

**При каком количестве деревьев лучший результат?**

In [None]:
# Sweep the number of trees and score each setting with K-fold cross-validation
# on the training split only.
#
# The previous version fitted once per setting and scored on the hold-out test
# split while labelling the result "cross-validation": the test set was used for
# model selection, and every row held a single score, so the std bands were empty.
# Scoring inside the training split keeps the hold-out set untouched for the final
# metric and makes the mean/std below meaningful.
estimators_grid = [5, 10, 15, 20, 30, 50, 75, 100, 130]
sweep_seed = 42  # seeds both the fold assignment and the forests so the sweep is repeatable
cv_folds = KFold(n_splits=5, shuffle=True, random_state=sweep_seed)

train_acc = []
test_acc = []
for n in tqdm(estimators_grid):
    model = OneVsRestClassifier(RandomForestClassifier(n_estimators=n, random_state=sweep_seed))
    # No `scoring` is passed, so each fold is scored with the estimator's own
    # `score` method -- the same metric `model.score(...)` reported before.
    fold_scores = cross_validate(model, train_x_vct, train_y, cv=cv_folds, return_train_score=True)
    train_acc.append(fold_scores["train_score"])
    test_acc.append(fold_scores["test_score"])

# Rows follow `estimators_grid`; columns are the CV folds.
train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
train_mean, train_std = train_acc.mean(axis=1), train_acc.std(axis=1)
test_mean, test_std = test_acc.mean(axis=1), test_acc.std(axis=1)

print('Best score on cross-validation:', test_mean.max())
print('Best performance:', estimators_grid[int(np.argmax(test_mean))])

fig, ax = plt.subplots()
ax.grid()
ax.plot(estimators_grid, train_mean, 'o-', color='#f4777f', label='Training accuracy')
ax.plot(estimators_grid, test_mean, 'o-', color='#7eb19c', label='Cross-validation accuracy')
# Shaded bands: one standard deviation across folds.
ax.fill_between(estimators_grid, train_mean - train_std, train_mean + train_std, color='#f4777f', alpha=0.1)
ax.fill_between(estimators_grid, test_mean - test_std, test_mean + test_std, color='#7eb19c', alpha=0.1)
ax.set_ylabel('Score')
ax.set_xlabel('Number of estimators')
ax.legend(loc='best')
plt.show()

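Наилучший скор при n_estimators=300 (это значение лежит вне сетки из ячейки выше, где максимум — 130; по-видимому, оно получено в отдельном запуске).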

In [None]:
# Final classifier: a 300-tree random forest wrapped in one-vs-rest,
# fitted on the vectorized training split.
forest = RandomForestClassifier(n_estimators=300)
model = OneVsRestClassifier(forest)
model.fit(train_x_vct, train_y)

In [None]:
import pickle

# Serialize the fitted classifier to disk so it can be reloaded without retraining.
model_path = "/home/jupyter/mnt/s3/cheliki/categories_predictor/rfc.pickle"
with open(model_path, mode="wb") as model_file:
    pickle.dump(model, file=model_file)

In [None]:
# Read the serialized classifier back from `model_path`.
# NOTE(review): unpickling can execute arbitrary code, so this path must only
# ever point at a file that was written by a trusted run of this notebook.
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

Считаем метрику ROC_AUC

In [None]:
# Per-class probabilities for the hold-out split, scored against the encoded
# hold-out labels; the ROC AUC value is displayed as the cell output.
test_pred_proba = model.predict_proba(test_x_vct)
roc_auc_score(test_y, test_pred_proba)

Инференс для submission

In [None]:
import string
from pymorphy2 import MorphAnalyzer
# Single shared analyzer instance; `string_preparation` below uses it to lemmatize words.
morph = MorphAnalyzer()

def string_preparation(text):
    """Remove ASCII punctuation from ``text`` and lemmatize it word by word.

    Every character listed in ``string.punctuation`` is deleted, the remainder is
    split on whitespace, and each word is replaced by the ``normal_form`` of the
    first parse returned by the module-level ``morph`` analyzer.

    Args:
        text: Raw input string.

    Returns:
        The lemmas joined with single spaces (an empty string if no words remain).
    """
    # Mapping a code point to None makes str.translate delete that character.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    cleaned = text.translate(punctuation_table)
    return ' '.join(morph.parse(token)[0].normal_form for token in cleaned.split())

# Submission inference: lemmatize the raw texts, drop stop words, vectorize with
# the fitted TF-IDF vectorizer and read per-class probabilities from the model.
#
# NOTE(review): `test_df` is not defined in any visible cell of this notebook;
# it has to be loaded before this cell can run on a fresh kernel.
test_prepared = [string_preparation(text) for text in test_df["texts"].values]

stop_word_set = set(stop_words)  # constant-time membership instead of scanning the list per token
tokenized = [x.split(" ") for x in test_prepared]
preprocessed = [
    " ".join(word for word in sent if word not in stop_word_set)
    for sent in tokenized
]

test_vct = vectorizer.transform(preprocessed)
predictions_proba = model.predict_proba(test_vct)

# The second-highest class probability must exceed this for a row to be flagged
# as having a second category.
SECOND_CATEGORY_THRESHOLD = 0.4

# Column positions follow the binarizer's class order; look each one up once
# instead of four times per row.
class_names = mlb.classes_.tolist()
communication = predictions_proba[:, class_names.index("Communication")].tolist()
quality = predictions_proba[:, class_names.index("Quality")].tolist()
price = predictions_proba[:, class_names.index("Price")].tolist()
safety = predictions_proba[:, class_names.index("Safety")].tolist()

# After an ascending sort along each row, the next-to-last column holds the
# second-highest probability.
runner_up_proba = np.sort(predictions_proba, axis=1)[:, -2]
second_category = (runner_up_proba > SECOND_CATEGORY_THRESHOLD).astype(int).tolist()