In [198]:
import xml.etree.ElementTree as ET
import pandas as pd
import gensim
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
import spacy
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

## Load data into pandas DataFrame

In [177]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path, encoding='utf-8')
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [178]:
train = XML2DataFrame("data/tkk_train_2016.xml").process_data().fillna(0)
train_bank = XML2DataFrame("data/bank_train_2016.xml").process_data().fillna(0)

In [179]:
test = XML2DataFrame("data/tkk_test_etalon.xml").process_data().fillna(0)
test_bank = XML2DataFrame("data/banks_test_etalon.xml").process_data().fillna(0)

## Load train and test, form object vector and labels vector

In [180]:
train_text = train['text'].values
test_text = test['text'].values

In [181]:
train_labels = train[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values
test_labels = test[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values

In [182]:
train_bank_text = train_bank['text'].values
test_bank_text = test_bank['text'].values
train_bank_labels = train_bank[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values
test_bank_labels = test_bank[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values

## Preprocess text - tokenize, delete stop-words, stem

In [183]:
stop = stopwords.words('russian')
mystem = SnowballStemmer('russian')
def preprocess_text(text):
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)
    tokens = [mystem.stem(token) for token in tokens if token not in stop]
    
    #text = " ".join(tokens)
    
    return tokens

In [184]:
train_ready = [preprocess_text(text) for text in train_text]
test_ready = [preprocess_text(text) for text in test_text]

In [185]:
train_bank_ready = [preprocess_text(text) for text in train_bank_text]
test_bank_ready = [preprocess_text(text) for text in test_bank_text]

## Train Word2Vec and Tfidf models on both train and test texts

In [186]:
model = gensim.models.Word2Vec(text_ready, min_count=25, size=100)
model.train(test_ready, total_examples=model.corpus_count, epochs=model.epochs)

2018-10-15 18:04:08,369 : INFO : collecting all words and their counts
2018-10-15 18:04:08,372 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 18:04:08,420 : INFO : collected 14924 word types from a corpus of 91501 raw words and 8643 sentences
2018-10-15 18:04:08,421 : INFO : Loading a fresh vocabulary
2018-10-15 18:04:08,448 : INFO : effective_min_count=25 retains 520 unique words (3% of original 14924, drops 14404)
2018-10-15 18:04:08,450 : INFO : effective_min_count=25 leaves 56280 word corpus (61% of original 91501, drops 35221)
2018-10-15 18:04:08,460 : INFO : deleting the raw counts dictionary of 14924 items
2018-10-15 18:04:08,462 : INFO : sample=0.001 downsamples 58 most-common words
2018-10-15 18:04:08,464 : INFO : downsampling leaves estimated 36859 word corpus (65.5% of prior 56280)
2018-10-15 18:04:08,469 : INFO : estimated required memory for 520 words and 100 dimensions: 676000 bytes
2018-10-15 18:04:08,471 : INFO : resetting layer we

(51072, 130195)

In [187]:
model_bank = gensim.models.Word2Vec(train_bank_ready, min_count=25, size=100)
model_bank.train(test_bank_ready, total_examples=model.corpus_count, epochs=model.epochs)

2018-10-15 18:04:09,657 : INFO : collecting all words and their counts
2018-10-15 18:04:09,660 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-15 18:04:09,710 : INFO : collected 15035 word types from a corpus of 96267 raw words and 9392 sentences
2018-10-15 18:04:09,712 : INFO : Loading a fresh vocabulary
2018-10-15 18:04:09,733 : INFO : effective_min_count=25 retains 371 unique words (2% of original 15035, drops 14664)
2018-10-15 18:04:09,737 : INFO : effective_min_count=25 leaves 65977 word corpus (68% of original 96267, drops 30290)
2018-10-15 18:04:09,744 : INFO : deleting the raw counts dictionary of 15035 items
2018-10-15 18:04:09,748 : INFO : sample=0.001 downsamples 42 most-common words
2018-10-15 18:04:09,751 : INFO : downsampling leaves estimated 29880 word corpus (45.3% of prior 65977)
2018-10-15 18:04:09,757 : INFO : estimated required memory for 371 words and 100 dimensions: 482300 bytes
2018-10-15 18:04:09,759 : INFO : resetting layer we

(60828, 193745)

In [188]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
vect.fit([' '.join(text) for text in train_ready])
vect.fit([' '.join(text) for text in test_ready])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [189]:
vect_bank = TfidfVectorizer()
vect_bank.fit([' '.join(text) for text in train_bank_ready])
vect_bank.fit([' '.join(text) for text in test_bank_ready])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Train logistic regression on tfidf

In [190]:
X_train = vect.transform([' '.join(text) for text in train_ready])
X_test = vect.transform([' '.join(text) for text in test_ready])

In [191]:
lr = LogisticRegression().fit(X_train, train_labels)
pred_labels = lr.predict(X_test)
'Accuracy on tfidf for tkk: ', accuracy_score(test_labels, pred_labels)

('Accuracy on tfidf for tkk: ', 0.6386292834890965)

In [200]:
X_bank_train = vect_bank.transform([' '.join(text) for text in train_bank_ready])
X_bank_test = vect_bank.transform([' '.join(text) for text in test_bank_ready])

In [201]:
lr_bank = LogisticRegression().fit(X_bank_train, train_bank_labels)
pred_bank_labels = lr_bank.predict(X_bank_test)
'Accuracy on tfidf for bank: ', accuracy_score(test_bank_labels, pred_bank_labels)

('Accuracy on tfidf for bank: ', 0.7398128584364624)

## Create feature matrix. Each text is a sum of word vectors weighted by idf coefficient.

In [211]:

def get_features_of_text(text, feature_names, model):
    result_vect = np.zeros(model.layer1_size)
    for word in text:
        try:
            result_vect += model.wv[word]*vect.idf_[feature_names.index(word)]
        except:
            pass
    return result_vect

In [212]:
feature_names = vect.get_feature_names()
X_train = np.array([get_features_of_text(text, feature_names, model) for text in train_ready])
X_text = np.array([get_features_of_text(text, feature_names, model) for text in test_ready])

  This is separate from the ipykernel package so we can avoid doing imports until


In [213]:
feature_names = vect_bank.get_feature_names()

X_bank_train = np.array([get_features_of_text(text, feature_names, model_bank) for text in train_bank_ready])
X_bank_text = np.array([get_features_of_text(text, feature_names, model_bank) for text in test_bank_ready])

  This is separate from the ipykernel package so we can avoid doing imports until


## Fit logistic regression, estimate quality

In [214]:
lr = LogisticRegression()
lr.fit(X_train, train_labels)
y_pred = lr.predict(X_text)
'Accuracy on word2vec-idf for tkk: ', accuracy_score(y_pred, test_labels)

('Accuracy on word2vec-idf for tkk: ', 0.5763239875389408)

In [215]:
lr_bank = LogisticRegression()
lr_bank.fit(X_bank_train, train_bank_labels)
y_bank_pred = lr_bank.predict(X_bank_text)
'Accuracy on word2vec-idf for bank: ', accuracy_score(y_bank_pred, test_bank_labels)

('Accuracy on word2vec-idf for bank: ', 0.6939329912466042)