In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import gensim
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
import spacy
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from gensim.models import FastText

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


## Load data into pandas dataframe

In [2]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path, encoding='utf-8')
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [3]:
train = XML2DataFrame("data/tkk_train_2016.xml").process_data().fillna(0)
train_bank = XML2DataFrame("data/bank_train_2016.xml").process_data().fillna(0)
test = XML2DataFrame("data/tkk_test_etalon.xml").process_data().fillna(0)
test_bank = XML2DataFrame("data/banks_test_etalon.xml").process_data().fillna(0)

## Load train and test, form object vector and labels vector

In [4]:
train_text = train['text'].values
test_text = test['text'].values
train_labels = train[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values
test_labels = test[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values

In [5]:
def norm_labels(labels):
    for i in range(len(labels)):
        if labels[i]>=1:
            labels[i] = 1
        if labels[i]<=-1:
            labels[i] = -1
    return labels

train_labels = norm_labels(train_labels)
test_labels = norm_labels(test_labels)

In [6]:
set(train_labels)

{-1, 0, 1}

In [7]:
train_bank_text = train_bank['text'].values
test_bank_text = test_bank['text'].values
train_bank_labels = train_bank[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values
test_bank_labels = test_bank[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values

In [8]:
train_bank_labels = norm_labels(train_bank_labels)
test_bank_labels = norm_labels(test_bank_labels)

## Preprocess text - tokenize, delete stop-words, stem

In [9]:
stop = stopwords.words('russian')
mystem = SnowballStemmer('russian')
def preprocess_text(text):
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)
    tokens = [mystem.stem(token) for token in tokens if token not in stop]
    
    text = " ".join(tokens)
    
    return text

In [10]:
train_ready = [preprocess_text(text) for text in train_text]
test_ready = [preprocess_text(text) for text in test_text]
train_bank_ready = [preprocess_text(text) for text in train_bank_text]
test_bank_ready = [preprocess_text(text) for text in test_bank_text]

## Convolutional network
for ttk

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation


make data ready for network

In [12]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_ready)

x_train = tokenizer.texts_to_sequences(train_ready)
x_test = tokenizer.texts_to_sequences(test_ready)

X_train = pad_sequences(x_train, maxlen=100)
X_test = pad_sequences(x_test, maxlen=100)

word_index = tokenizer.word_index

In [13]:
X_train.shape

(8643, 100)

In [14]:
encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)

In [15]:
y_train.shape

(8643, 3)

create and train model

In [16]:
model = Sequential()

model.add(Embedding(10000, 1000, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=15)

Epoch 1/15


Epoch 2/15




Epoch 3/15


Epoch 4/15




Epoch 5/15


Epoch 6/15




Epoch 7/15


Epoch 8/15




Epoch 9/15


Epoch 10/15




Epoch 11/15


Epoch 12/15




Epoch 13/15


Epoch 14/15




Epoch 15/15




<keras.callbacks.History at 0x1bf56d575f8>

In [17]:
y_pred = model.predict_classes(X_test) - 1



In [18]:
accuracy_score(y_pred, test_labels)

0.6497552291944815

## Convolutional model with pretrained fasttext vectors
for ttk

prepare weights matrix for embedding layer

In [16]:
from gensim.models import FastText

emb_model = FastText.load('data/fasttext/araneum_none_fasttextskipgram_300_5_2018.model')

In [17]:
word_vectors = emb_model.wv

In [51]:
EMBEDDING_DIM = 300
NUM_WORDS = 20000
vocabulary_size = len(word_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

prepare and train model

In [52]:
model = Sequential()

model.add(Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=True, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=5)

Epoch 1/5


Epoch 2/5




Epoch 3/5


Epoch 4/5




Epoch 5/5




<keras.callbacks.History at 0x22557a74630>

In [53]:
y_pred = model.predict_classes(X_test) - 1
accuracy_score(y_pred, test_labels)



0.6853582554517134

## Networks for bank

In [18]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_bank_ready)

x_bank_train = tokenizer.texts_to_sequences(train_bank_ready)
x_bank_test = tokenizer.texts_to_sequences(test_bank_ready)

X_bank_train = pad_sequences(x_bank_train, maxlen=100)
X_bank_test = pad_sequences(x_bank_test, maxlen=100)

word_bank_index = tokenizer.word_index

encoder = LabelBinarizer()
encoder.fit(train_bank_labels)
y_bank_train = encoder.transform(train_bank_labels)
y_bank_test = encoder.transform(test_bank_labels)

model without pretrained vectors

In [56]:
model = Sequential()

model.add(Embedding(10000, 1000, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])

model.fit(X_bank_train, y_bank_train, batch_size=32, epochs=5)

Epoch 1/5


Epoch 2/5




Epoch 3/5


Epoch 4/5




Epoch 5/5




<keras.callbacks.History at 0x2249a7247b8>

In [57]:
y_bank_pred = model.predict_classes(X_bank_test) - 1
accuracy_score(y_bank_pred, test_bank_labels)



0.6978569272562632

with pretrained

In [19]:
EMBEDDING_DIM = 300
NUM_WORDS = 20000
vocabulary_size = len(word_bank_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_bank_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

In [20]:
model = Sequential()

model.add(Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=True, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_bank_train, y_bank_train, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5


Epoch 3/5


Epoch 4/5


Epoch 5/5




<keras.callbacks.History at 0x2e787984eb8>

In [21]:
y_bank_pred = model.predict_classes(X_bank_test) - 1
accuracy_score(y_bank_pred, test_bank_labels)



0.685481436764262