In [1]:
import re
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import TweetTokenizer
from ast import literal_eval
import numpy as np

**CONSTS**

In [2]:
VOC_SIZE = 1000

**Lodading data**

In [3]:
f=open('processed_dict.txt','r')
d=f.read()
d= literal_eval(d)


**Stemmer**

In [4]:
stemmer=RussianStemmer()
regular = re.compile('[^а-яА-Я]')
stem_cache = {}
def get_stem(token):
    stem=stem_cache.get(token, None)
    if stem:
        return stem
    token=regular.sub('',token).lower()
    stem=stemmer.stem(token)
    stem_cache[token]=stem
    return stem
    


**Vocabulary creation**

In [5]:
stem_counter=Counter()
tokenizer=TweetTokenizer()
def count_unique_tokens_in_stories(stories):
    for pair in stories.items():
        words=tokenizer.tokenize(pair[0])
        #print(words)
        for word in words:
            stem=get_stem(word)
            stem_counter[stem]+=1
        
        
        
        
count_unique_tokens_in_stories(d)
print('Unique stems number: '+ str(len(stem_counter)))

Unique stems number: 14436


In [6]:
vocabulary = sorted(stem_counter, key=stem_counter.get, reverse=True)[:VOC_SIZE]


In [12]:
word_2_ind = {vocabulary[i]:i for i in range(VOC_SIZE)}
print(word_2_ind)

{'и': 0, 'в': 1, 'я': 2, 'на': 3, 'не': 4, 'он': 5, 'что': 6, '': 7, 'с': 8, 'эт': 9, 'был': 10, 'а': 11, 'как': 12, 'так': 13, 'у': 14, 'мен': 15, 'мне': 16, 'к': 17, 'мо': 18, 'мы': 19, 'по': 20, 'за': 21, 'е': 22, 'но': 23, 'все': 24, 'когд': 25, 'ег': 26, 'то': 27, 'из': 28, 'ну': 29, 'вот': 30, 'сво': 31, 'друг': 32, 'нача': 33, 'пот': 34, 'от': 35, 'говор': 36, 'котор': 37, 'сказа': 38, 'позор': 39, 'ты': 40, 'ещ': 41, 'дом': 42, 'лет': 43, 'там': 44, 'тут': 45, 'уж': 46, 'реш': 47, 'нас': 48, 'сам': 49, 'же': 50, 'чтоб': 51, 'до': 52, 'ем': 53, 'посл': 54, 'со': 55, 'прост': 56, 'пошл': 57, 'одн': 58, 'блят': 59, 'раз': 60, 'дума': 61, 'итог': 62, 'этот': 63, 'нег': 64, 'теб': 65, 'тольк': 66, 'себ': 67, 'их': 68, 'бы': 69, 'бат': 70, 'рук': 71, 'мужик': 72, 'наш': 73, 'ест': 74, 'через': 75, 'сегодн': 76, 'пиздец': 77, 'один': 78, 'мам': 79, 'пришл': 80, 'говн': 81, 'тип': 82, 'нет': 83, 'вы': 84, 'ху': 85, 'истор': 86, 'сто': 87, 'стал': 88, 'есл': 89, 'год': 90, 'минут': 91, 

In [13]:
def to_vector(story, show_unknown=False):
    vector=np.zeros(VOC_SIZE, dtype=np.int_)
    for word in tokenizer.tokenize(story):
        stem=get_stem(word)
        idx=word_2_ind.get(stem,None)
        if idx is not None:
            vector[idx]=1
        elif show_unknown:
            print('Unknown stem '+word)
    return vector  

In [14]:

Ytmp = d.values()
Y=[]

for i in Ytmp:
    Y.append(int(i>0.05))


**Stories to vectors via bag of words**

In [15]:
#proprocess storyes. Separate good ones from bad ones
goods=[]
bads=[]
for pair in d.items():
    if pair[1]>=0.05:
        goods.append(pair[0])
    else:
        bads.append(pair[0])
print(len(goods))
print(len(bads))           
vectors = np.zeros((len(d.items()), VOC_SIZE), dtype=np.int_)
stories=[]
for i in goods:
    stories.append(to_vector(i))
for i in bads:
    stories.append(to_vector(i))
X=[]
for i in stories:
    X.append(i) 
print(len(X))
print(len(Y))

371
1628
1999
1999


**Labels**

In [16]:
labels=np.append(np.zeros(len(bads), dtype=np.int_), np.ones(len(goods), dtype=np.int_))

In [12]:
f=open('X.txt', 'wt')
f.write(str(X))
f=open('Y.txt', 'wt')
f.write(str(Y))

5997

In [20]:


X_train, X_test, Y_train, Y_test=train_test_split(X,Y,test_size=0.45)


In [21]:
print(X_train[5])

[1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [61]:
import numpy as np
import random
from math import exp

class NN:

    def __init__(self, n_input=None, n_output=None, n_hidden_nodes=None):
        self.n_input = n_input  
        self.n_output = n_output  
        self.n_hidden_nodes = n_hidden_nodes  
        self.network = self._build_network()

    def train(self, X_train, y_train, l_rate=None, n_epochs=None):

        for epoch in range(n_epochs):
            for (x, y) in zip(X_train, y_train):
                
                self._forward_pass(x)
                
                y_target = np.zeros(self.n_output, dtype=np.int)
                y_target[y] = 1
              
                self._backward_pass(y_target)
                
               
                self._update_weights(x, l_rate=l_rate)

   
    def predict(self, X):

        y_predict = np.zeros(len(X), dtype=np.int)
        for i, x in enumerate(X):
            output = self._forward_pass(x)  
            y_predict[i] = np.argmax(output)  

        return y_predict


    def _build_network(self):

        
        def _build_layer(n_input, n_output):
            layer = list()
            for idx_out in range(n_output):
                weights = list()
                for idx_in in range(n_input):
                    weights.append(random.random())
                layer.append({"weights": weights,
                              "output": None,
                              "delta": None})
            return layer

        
        n_hidden_layers = len(self.n_hidden_nodes)
        network = list()
        if n_hidden_layers == 0:
            network.append(_build_layer(self.n_input, self.n_output))
        else:
            network.append(_build_layer(self.n_input, self.n_hidden_nodes[0]))
            for i in range(1,n_hidden_layers):
                network.append(_build_layer(self.n_hidden_nodes[i-1],
                                            self.n_hidden_nodes[i]))
            network.append(_build_layer(self.n_hidden_nodes[n_hidden_layers-1],
                                        self.n_output))

        return network


    def _forward_pass(self, x):

        
        def activate(weights, inputs):
            activation = 0.0
            for i in range(len(weights)):
                activation += weights[i] * inputs[i]
            return activation

  
        input = x
        for layer in self.network:
            output = list()
            for node in layer:
                # Compute activation and apply transfer to it
                activation = activate(node['weights'], input)
                node['output'] = self._transfer(activation)
                output.append(node['output'])
            input = output

        return input

    
    def _backward_pass(self, target):

        
        n_layers = len(self.network)
        for i in reversed(range(n_layers)):
            layer = self.network[i]

            
            errors = list()
            if i == n_layers - 1:
                
                for j, node in enumerate(layer):
                    error = target[j] - node['output']
                    errors.append(error)
            else:
                
                for j, node in enumerate(layer):
                    error = 0.0
                    for node in self.network[i + 1]:
                        error += node['weights'][j] * node['delta']
                    errors.append(error)

            
            for j, node in enumerate(layer):
                node['delta'] = errors[j] * self._transfer_derivative(node['output'])

  
    def _update_weights(self, x, l_rate=0.3):

        
        for i_layer, layer in enumerate(self.network):

            
            if i_layer == 0:
                inputs = x
            else:
                inputs = np.zeros(len(self.network[i_layer - 1]))
                for i_node, node in enumerate(self.network[i_layer - 1]):
                    inputs[i_node] = node['output']

            
            for node in layer:
                for j, input in enumerate(inputs):
                    dW = l_rate * node['delta'] * input
                    node['weights'][j] += dW

  
    def _transfer(self, x):
        return 1.0/(1.0+exp(-x))

    
    def _transfer_derivative(self, transfer):
        return transfer*(1.0-transfer)

In [62]:
model = NN(n_input=VOC_SIZE, n_output=2, n_hidden_nodes=[5])
model.train(X_train, Y_train, l_rate=0.65, n_epochs=1000)
y_test_predict = model.predict(X_test)

In [63]:
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, y_test_predict))

0.658888888889


In [64]:
print(y_test_predict)

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0
 1 1 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 1 0 0
 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1
 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 1 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1
 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0
 0 0 1 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1
 0 0 0 0 1 0 1 0 0 0 1 1 