In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import re
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
import pickle
from multiprocessing import Pool
from tqdm import tqdm
# Fix both the stdlib and the NumPy global random generators to one seed so
# that any randomness drawn from them is repeatable across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Raw training table; later cells read its 'feeds', 'grades', 'bank' and
# 'date' columns.
TRAIN_CSV = 'train_ml.csv'
df = pd.read_csv(TRAIN_CSV)

In [3]:
# Runs of four or more upper-case Cyrillic letters, or the short words НЕ / ДА.
# Ё is listed explicitly because it lies outside the contiguous А-Я range.
# Compiled once at import time instead of on every call.
_CAPS_WORD_RE = re.compile("([А-ЯЁ]{4,}|НЕ|ДА)")


def get_capitalized_words(_text):
    """Return the upper-case Cyrillic fragments found in ``_text``.

    A fragment is a run of at least four consecutive upper-case Cyrillic
    letters, or one of the two-letter sequences ``НЕ`` / ``ДА``.

    Args:
        _text: The text to scan. Non-string values (for example ``None`` or a
            float ``NaN`` coming from a DataFrame cell) are tolerated.

    Returns:
        list[str]: The matches in order of appearance; an empty list when
        nothing matches or when ``_text`` is not a string.
    """
    try:
        return _CAPS_WORD_RE.findall(_text)
    except TypeError:
        # Only the "not a string" case is absorbed; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return []

# Emotional punctuation: runs of '!' or '?', a three-dot ellipsis, the ':(' and
# ':)' smileys, and runs of two or more ')' or '('.
# Written as a raw string so the backslashes reach the regex engine without
# Python's "invalid escape sequence" warning; the pattern text is unchanged.
_EXCLAMATION_RE = re.compile(r"(\!+|\?+|\.{3}|\:\(|\:\)|\){2,}|\({2,})")


def get_exclamations(_text):
    """Return the emotional punctuation marks found in ``_text``.

    Args:
        _text: The text to scan. Non-string values (for example ``None`` or a
            float ``NaN`` coming from a DataFrame cell) are tolerated.

    Returns:
        list[str]: The matched punctuation fragments in order of appearance;
        an empty list when nothing matches or when ``_text`` is not a string.
    """
    try:
        return _EXCLAMATION_RE.findall(_text)
    except TypeError:
        # Only the "not a string" case is absorbed; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return []

# The text is lower-cased before matching, so only lower-case ranges are
# needed. 'ё' is listed explicitly because it lies outside а-я, and the Latin
# range is a-z (an A-z range would also match the characters [ \ ] ^ _ `).
_WORD_RE = re.compile("[а-яёa-z]+")


def words_only(_text):
    """Lower-case ``_text`` and split it into alphabetic tokens.

    Args:
        _text: The text to tokenize. Non-string values (for example ``None``
            or a float ``NaN`` coming from a DataFrame cell) are tolerated.

    Returns:
        list[str]: Runs of Cyrillic or Latin letters, lower-cased, in order of
        appearance; an empty list when ``_text`` is not a string.
    """
    try:
        return _WORD_RE.findall(_text.lower())
    except (AttributeError, TypeError):
        # AttributeError: the value has no .lower(); TypeError: it is not a
        # str. Other exceptions (e.g. KeyboardInterrupt) are not swallowed.
        return []


@lru_cache(maxsize=1)
def _get_morph_analyzer():
    """Create the MorphAnalyzer on first use and hand back the same instance afterwards."""
    return MorphAnalyzer()


# The cache is keyed by token, so it has to be large enough to hold a corpus
# vocabulary; with only a handful of slots nearly every lookup would be a miss.
@lru_cache(maxsize=100000)
def lemmatize_word(_token):
    """Return the normal form of ``_token`` according to its first parse.

    The analyzer is built once per process and shared between calls instead of
    being constructed again on every cache miss.

    Args:
        _token: A single word.

    Returns:
        The ``normal_form`` attribute of the first parse returned by the
        analyzer for ``_token``.
    """
    return _get_morph_analyzer().parse(_token)[0].normal_form

def lemmatize_text(_text):
    """Apply ``lemmatize_word`` to every token of ``_text``.

    Args:
        _text: An iterable of tokens.

    Returns:
        list: One lemma per input token, in the same order.
    """
    return list(map(lemmatize_word, _text))

@lru_cache(maxsize=1)
def _load_stopwords():
    """Unpickle the stop-word collection from ``stopwords_filtered`` once and cache it.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file, so
    the file must come from a trusted source.
    """
    with open("stopwords_filtered", "rb") as _fp:
        return pickle.load(_fp)


def remove_stopwords(_lemmas):
    """Drop stop words and one-character tokens from ``_lemmas``.

    The stop-word file is read on the first call only and reused afterwards,
    so edits to the file are not picked up until the process restarts.

    Args:
        _lemmas: An iterable of lemma strings.

    Returns:
        list[str]: The lemmas that are not in the stop-word collection and are
        longer than one character, in their original order.

    Raises:
        FileNotFoundError: If ``stopwords_filtered`` is missing on first use.
    """
    _stopwords = _load_stopwords()
    return [w for w in _lemmas if w not in _stopwords and len(w) > 1]


def clean_text(_text):
    """Turn raw text into a space-separated string of filtered lemmas.

    The text goes through ``words_only``, ``lemmatize_text`` and
    ``remove_stopwords`` in that order, and the surviving lemmas are joined
    with single spaces.

    Args:
        _text: The raw text to clean.

    Returns:
        str: The joined lemmas; an empty string when nothing survives.
    """
    kept = remove_stopwords(lemmatize_text(words_only(_text)))
    return ' '.join(kept)

def add_lemmas():
    """Clean every entry of ``df['feeds']`` in a 4-worker pool and store it in ``df['lemmas']``.

    Reads and mutates the module-level ``df``. Results are consumed through
    ``imap``, so they come back in input order, and ``tqdm`` reports progress
    against ``len(df)``.
    """
    with Pool(4) as workers:
        cleaned_iter = workers.imap(clean_text, df['feeds'])
        progress = tqdm(cleaned_iter, total=len(df))
        lemmas = list(progress)
    df['lemmas'] = lemmas

def merge_lemmas():
    """Build ``df['lemmas_full']`` from lemmas, capitalized words and exclamations.

    For every row the space-separated ``lemmas`` string is split into tokens,
    the ``capswords`` and ``exclamation`` lists are appended to it, and the
    combined list is joined back into one space-separated string. Reads and
    mutates the module-level ``df``.
    """
    lemma_tokens = df['lemmas'].apply(lambda text: text.split())
    combined = lemma_tokens + df['capswords'] + df['exclamation']
    df['lemmas_full'] = combined
    df['lemmas_full'] = df['lemmas_full'].apply(lambda parts: ' '.join(parts))

def add_words_counter():
    """Add length and count features to the module-level ``df``.

    New columns:
        sym_len: ``len`` of each ``feeds`` entry.
        word_len: number of whitespace-separated pieces in each ``feeds`` entry.
        caps_count: ``len`` of each ``capswords`` entry.
        exclaim_count: ``len`` of each ``exclamation`` entry.
    """
    feeds = df['feeds']
    df['sym_len'] = feeds.apply(len)
    df['word_len'] = feeds.apply(lambda text: len(text.split()))
    # The two list-valued columns are counted the same way.
    for target, source in (('caps_count', 'capswords'),
                           ('exclaim_count', 'exclamation')):
        df[target] = df[source].apply(len)

def bank_freq():
    """Add ``df['bank_freq']``: the share of rows that carry each row's ``bank`` value.

    The share is the number of rows per bank divided by the total number of
    rows in the module-level ``df``, which is mutated in place.
    """
    rows_per_bank = df.groupby('bank').size()
    share_per_bank = rows_per_bank / len(df)
    df.loc[:, 'bank_freq'] = df['bank'].map(share_per_bank)

def drop_columns():
    """Remove the raw and intermediate columns from the module-level ``df`` in place."""
    obsolete = ['date', 'exclamation', 'capswords', 'feeds', 'lemmas', 'bank']
    df.drop(columns=obsolete, inplace=True)


# Discard rows that have a missing value in any column.
df = df.dropna()
# astype returns a new frame rather than modifying df, so the result has to be
# bound back to df for the int32 cast of 'grades' to take effect.
df = df.astype({'grades': 'int32'})
# Per-row lists of upper-case words and of emotional punctuation.
df['capswords'] = df.loc[:, 'feeds'].apply(get_capitalized_words)
df['exclamation'] = df.loc[:, 'feeds'].apply(get_exclamations)

# Remaining pipeline steps, currently disabled:
# add_lemmas()
# merge_lemmas()
# add_words_counter()
# df.to_csv('train_lemmas_new.csv')
# drop_columns()

In [5]:
# Lemmatize every review in this process; add_lemmas() holds the
# multiprocessing variant of the same step.
lemmas = df['feeds'].apply(clean_text)
# Store the result and build 'lemmas_full': the TF-IDF cell below reads that
# column, and it is only created by merge_lemmas().
df['lemmas'] = lemmas
merge_lemmas()

KeyboardInterrupt: 

In [None]:
%%time
# TF-IDF over n-grams of length 1 to 3 built from the merged lemma strings;
# the fitted vectorizer is saved to disk with joblib.
NGRAM_RANGE = (1, 3)
VECTORIZER_PATH = 'vectorizer_f_new.pkl'

vec = TfidfVectorizer(ngram_range=NGRAM_RANGE)
bow = vec.fit_transform(df['lemmas_full'])
joblib.dump(vec, VECTORIZER_PATH)

In [None]:
# Target as a single-column frame with a fresh 0..n-1 index: reset_index moves
# the old index into an 'index' column, which is then dropped.
grades_with_old_index = df['grades'].reset_index()
y_train = grades_with_old_index.drop(columns='index')

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the N_BEST_FEATURES TF-IDF columns with the highest chi-squared score
# against the target, then save the fitted selector to disk with joblib.
N_BEST_FEATURES = 5000
SELECTOR_PATH = 'selector_f_new.pkl'

selector = SelectKBest(chi2, k=N_BEST_FEATURES)
X_train_sel = selector.fit_transform(bow, y_train)
joblib.dump(selector, SELECTOR_PATH)