### **Импорт библиотек**

In [28]:
# Mount Google Drive inside the Colab runtime; the CSV files loaded later in
# this notebook are read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
!pip install catboost
!pip install sktime
!pip install tqdm
!pip install pymorphy2[fast]



In [30]:
import numpy as np
import pandas as pd
import re

from collections import Counter
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sktime.transformations.panel.rocket import MiniRocket
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import RidgeClassifier

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from pymorphy2 import MorphAnalyzer

import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **Считываем и смотрим на данные**

In [31]:
# Load the two labelled, tab-separated exports from the mounted Drive.
# comment='#' makes pandas ignore everything from a '#' to the end of a line.
# The 'Unnamed: 0' column is dropped (presumably a saved index -- confirm).
# Fix: the first path previously contained a stray space ('.c sv').
labled_train_data = pd.read_csv('/content/drive/MyDrive/labled_train_data.csv', comment='#', sep='\t').drop(columns='Unnamed: 0')
labled_train_comments = pd.read_csv('/content/drive/MyDrive/labled_train_comments.csv', comment='#', sep='\t').drop(columns='Unnamed: 0')


### **Модель**

In [32]:
# Split the labelled frame into predictors and target.
# The target is removed by name rather than by position, so the split stays
# correct even if 'is_aggressive' is not the last column.
y = labled_train_data['is_aggressive']
X = labled_train_data.drop(columns='is_aggressive')

In [91]:
class Model:
  """Logistic-regression classifier over ride features plus comment-based NLP features.

  The NLP features are, per comment, the share of its verbs / adjectives /
  nouns that appear in a vocabulary collected from comments labelled as
  aggressive (see ``make_agressive_vocab``).
  """

  # Columns fed to the estimator, in order (the last three are generated here).
  FEATURE_COLUMNS = [
      'arrived_distance', 'arrived_duration', 'distance', 'duration',
      'from_latitude', 'from_longitude', 'to_latitude', 'to_longitude',
      'client_rate_ride', 'client_rides_cnt', 'driver_rides_cnt',
      'agg_verbs_rate', 'agg_adjs_rate', 'agg_nouns_rate',
  ]

  def __init__(self):
    self.model = None
    self.aggressive_words = {'verb': set(), 'adj': set(), 'all_words': set(), 'noun': set()}
    self.morph_analyzer = MorphAnalyzer()
    # Built once here instead of on every vocabulary / feature pass.
    self.stop_words = set(stopwords.words('russian'))

  def lemmatize_and_word_class(self, word):
    """Return ``(normal_form, tag)`` of the first parse the analyzer yields for ``word``."""
    parsed = self.morph_analyzer.parse(word)[0]
    return parsed.normal_form, parsed.tag

  @staticmethod
  def _pos_key(tag):
    """Map a morphological tag to 'verb' / 'adj' / 'noun', or None for anything else."""
    if 'VERB' in tag:
      return 'verb'
    if 'ADJF' in tag or 'ADJS' in tag:
      return 'adj'
    if 'NOUN' in tag:
      return 'noun'
    return None

  def _tokenize(self, comment):
    """Split a comment on spaces, strip non-word characters, and lemmatize each token.

    Returns a list of ``(normal_form, tag)`` pairs. Anything that is not a
    string (e.g. NaN for a missing comment) yields an empty list, and tokens
    that become empty after stripping are skipped.
    """
    if not isinstance(comment, str):
      return []
    stripped = (re.sub(r'\W', '', raw) for raw in comment.split(' '))
    return [self.lemmatize_and_word_class(token) for token in stripped if token]

  def make_agressive_vocab(self, X, y):
    """Add non-stop-word lemmas from aggressive comments to ``self.aggressive_words``.

    Args:
      X: DataFrame with a 'comment' column.
      y: Series named 'is_aggressive', joined to ``X`` on the index; rows where
        it equals 1 contribute their words.

    Every kept lemma goes into 'all_words'; verbs, adjectives and nouns are
    additionally stored under their own key. Repeated calls accumulate.
    """
    dataset = X.join(y)
    for comment, label in zip(dataset['comment'], dataset['is_aggressive']):
      if label != 1:
        continue
      for lemma, tag in self._tokenize(comment):
        if lemma in self.stop_words:
          continue
        pos_key = self._pos_key(tag)
        if pos_key is not None:
          self.aggressive_words[pos_key].add(lemma)
        self.aggressive_words['all_words'].add(lemma)

  def NLP_feature_extract(self, X, y=None):
    """Compute per-row aggressive-word rates for verbs, adjectives and nouns.

    Args:
      X: DataFrame with a 'comment' column.
      y: Unused; kept for interface compatibility.

    Returns:
      Three lists ``(agg_verbs_rate, agg_adjs_rate, agg_nouns_rate)``, each with
      exactly one entry per row of ``X``. A row with no words of a given part
      of speech (including empty or missing comments) gets 0 for that rate.
    """
    rates = {'verb': [], 'adj': [], 'noun': []}
    for comment in X['comment']:
      words_by_pos = {'verb': [], 'adj': [], 'noun': []}
      for lemma, tag in self._tokenize(comment):
        pos_key = self._pos_key(tag)
        if pos_key is not None and lemma not in self.stop_words:
          words_by_pos[pos_key].append(lemma)
      for pos_key, words in words_by_pos.items():
        if words:
          # NOTE(review): numerator counts distinct matching lemmas while the
          # denominator counts all occurrences; kept as-is so scores stay comparable.
          matched = len(set(words) & self.aggressive_words[pos_key])
          rates[pos_key].append(matched / len(words))
        else:
          rates[pos_key].append(0)
    return rates['verb'], rates['adj'], rates['noun']

  def features(self, X):
    """Return the model's feature frame for ``X`` (which is left unmodified).

    Adds the three NLP rate columns, selects ``FEATURE_COLUMNS`` and fills
    missing values in each of them with that column's mean over ``X`` itself.
    NOTE(review): the means come from whichever frame is passed in, so a test
    frame is imputed with its own statistics rather than the training ones.
    """
    data = X.copy()
    agg_verbs_rate, agg_adjs_rate, agg_nouns_rate = self.NLP_feature_extract(data)
    data['agg_verbs_rate'] = agg_verbs_rate
    data['agg_adjs_rate'] = agg_adjs_rate
    data['agg_nouns_rate'] = agg_nouns_rate
    column_means = {name: data[name].mean() for name in self.FEATURE_COLUMNS}
    data = data.fillna(column_means)
    return data[self.FEATURE_COLUMNS]

  def train_eval(self, X, y):
    """Hold out 10% of the data, cross-validate on the rest, fit, and print ROC AUC.

    Prints the mean and std of the 5-fold ROC AUC on the training part and the
    ROC AUC on the held-out part; the fitted estimator is kept in ``self.model``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # NOTE(review): the vocabulary is built from all of X_train before
    # cross-validation, so every CV validation fold has contributed its own
    # words to the features it is scored on; the CV estimate is optimistic.
    self.make_agressive_vocab(X_train, y_train)

    self.model = LogisticRegression(random_state=42)

    # Computed once and reused for both cross-validation and the final fit.
    train_features = self.features(X_train)

    cv_score = cross_val_score(self.model, train_features, y_train, cv=5, scoring='roc_auc')
    print(f"CV_mean: {np.mean(cv_score)}, CV_std: {np.std(cv_score)}")

    self.model.fit(train_features, y_train)
    print(roc_auc_score(y_test, self.model.predict_proba(self.features(X_test))[:, 1]))

  def predict(self, X):
    """Predict class labels for ``X``.

    ``X`` may be a raw frame (it has a 'comment' column), in which case the
    same feature transformation as in training is applied first, or an
    already prepared feature matrix, which is passed to the estimator as is.

    Raises:
      RuntimeError: if the model has not been trained yet.
    """
    if self.model is None:
      raise RuntimeError("Model is not trained; call train_eval() first.")
    if 'comment' in getattr(X, 'columns', ()):
      X = self.features(X)
    return self.model.predict(X)

In [92]:
# Build the classifier wrapper and run its train/evaluate routine, which
# prints the cross-validation ROC AUC summary followed by the hold-out ROC AUC.
model = Model()
model.train_eval(X, y)

CV_mean: 0.7213110197676796, CV_std: 0.036754591689793115
0.6965943062584263


### **Отчёт по фичам**

Скор без NLP фич - 0.5644
Скор с NLP фичами:

  - эмпирически: 0.6965 (если добавить все слова, а не по типу слов - результат хуже на 0.005)

[0.67311508 0.54761905 0.58581349 0.63194444 0.62690005] только глаголы и прилагательные

[0.67460317 0.54513889 0.57242063 0.62400794 0.61953017] все слова

[0.66617063 0.54662698 0.56150794 0.67311508 0.59465684] глаг+прил, но учитываем наречия как прилагательные

CV_mean: 0.7213110197676796, CV_std: 0.036754591689793115 - глаг+прил+нареч

  