In [28]:
# Matchers: predicates over a record `x` that exposes the attributes
# `Пол` (sex), `Год` (birth year), `Профессия` (profession) and `гумтех`
# (humanities/technical mindset). Each returns True when the record belongs
# to the corresponding group.

# --- sex ---
f_matcher = lambda x: x.Пол == 'ж'
m_matcher = lambda x: x.Пол == 'м'

# --- age, expressed through birth year ---
# NOTE(review): the boundaries look like ages computed relative to 2018
# (born 2000 -> 18, born 1978 -> 40) — confirm.
# The four ranges are mutually exclusive and cover every year.
yl18_matcher = lambda x: x.Год > 2000              # younger than 18
y18_27_matcher = lambda x: 1991 <= x.Год <= 2000   # 18..27
# Upper bound is 1990: 1991 already belongs to the 18..27 bucket.
y28_40_matcher = lambda x: 1978 <= x.Год <= 1990   # 28..40
# Older than 40 means born *before* 1978 (the comparison used to be inverted).
yg40_matcher = lambda x: x.Год < 1978

# --- profession ---
no_level_matcher = lambda x: x.Профессия == '-'
teacher_matcher = lambda x: x.Профессия == 'учитель'
writer_matcher = lambda x: x.Профессия == 'писатель'

# --- mindset ---
no_mentality_matcher = lambda x: x.гумтех == 'без'
# Misspelled historical name, kept so existing references keep working.
no_mentality_mantcher = no_mentality_matcher
humanist_matcher = lambda x: x.гумтех == 'гум'
techie_matcher = lambda x: x.гумтех == 'тех'


# Registry of author tags. Every entry has the same three keys:
#   'file'    - corpus file name (the training cell prefixes it with
#               'splitted_corpora/')
#   'matcher' - predicate selecting the records that carry this tag
#   'value'   - label vector the training cell assigns to samples of this tag
#               (a single 0/1 for sex, a one-hot vector for the other groups)
# NOTE(review): file names such as 'femalies.txt' and 'from_27_to_40.txt' are
# kept verbatim because they must match the files on disk — confirm spelling.
tag_dict = {
    # sex
    'м': {
        'file': 'males.txt',
        'matcher': m_matcher,
        'value': [0],
    },
    'ж': {
        'file': 'femalies.txt',
        'matcher': f_matcher,
        'value': [1],
    },
    # age
    'моложе_18': {
        'file': 'younger_18.txt',
        'matcher': yl18_matcher,
        'value': [0, 0, 0, 1],
    },
    '18_27': {
        'file': 'from_18_to_27.txt',
        'matcher': y18_27_matcher,
        'value': [0, 0, 1, 0],
    },
    '28_40': {
        'file': 'from_27_to_40.txt',
        'matcher': y28_40_matcher,
        'value': [0, 1, 0, 0],
    },
    'старше_40': {
        'file': 'older_41.txt',
        'matcher': yg40_matcher,
        'value': [1, 0, 0, 0],
    },
    # profession
    'неизв_проф': {
        'file': 'unknown_level.txt',
        'matcher': no_level_matcher,
        'value': [0, 0, 1],
    },
    'учитель': {
        'file': 'teachers.txt',
        'matcher': teacher_matcher,
        # Key was misspelled 'value0', which made the label lookup fail.
        'value': [0, 1, 0],
    },
    'писатель': {
        'file': 'writers.txt',
        'matcher': writer_matcher,
        'value': [1, 0, 0],
    },
    # mindset
    'неизв_склад': {
        'file': 'unknown_mentality.txt',
        'matcher': no_mentality_mantcher,
        'value': [0, 0, 1],
    },
    'гуманитарий': {
        'file': 'humanists.txt',
        'matcher': humanist_matcher,
        'value': [0, 1, 0],
    },
    'технарь': {
        'file': 'techies.txt',
        'matcher': techie_matcher,
        'value': [1, 0, 0],
    },
}

# Tags used to build the dataset (currently the binary sex classification).
tags = ['м', 'ж']


In [29]:
# Создаем dataset
import numpy as np
import nltk
import pymorphy2
from math import log
from re import sub

# Morphological analyzer; morph_sentence() uses its parse() results to obtain
# the normal form of each word.
morph = pymorphy2.MorphAnalyzer()
# Tokenizer object loaded from the 'russian.pickle' NLTK resource; get_feats()
# calls its tokenize() method on a whole review.
# NOTE(review): presumably a Punkt sentence tokenizer, and this assumes the
# resource name resolves on NLTK's data path — confirm.
rutokenizer = nltk.data.load('russian.pickle')

# utils
def total_sum_word(sentences):
    """Count characters-in-words and words over a collection of sentences.

    Args:
        sentences: iterable of sentences. A sentence may be a plain string
            (split on whitespace into words) or an already tokenized
            sequence of words.

    Returns:
        Tuple ``(total_sum, total_word)``: the summed length of all words and
        the number of words. ``(0, 0)`` for empty input.
    """
    total_sum = 0
    total_word = 0
    for s in sentences:
        # Iterating a raw string yields single characters, which would make
        # every "word" one character long; split it into words first.
        words = s.split() if isinstance(s, str) else s
        total_sum += sum(len(word) for word in words)
        total_word += len(words)
    return total_sum, total_word

def morph_sentence(sentence):
    """Return the list of lower-cased, normalized word forms of a sentence.

    Listed punctuation, newlines and tabs are replaced with spaces, the text
    is split on whitespace, and each token is reduced to the ``normal_form``
    of the first parse returned by the module-level ``morph`` analyzer.
    """
    without_punct = sub(r'[\.\—\?\!\n\t\,\«\»\)\(\:\/\=]', ' ', sentence)
    lemmas = []
    for token in without_punct.split():
        first_parse = morph.parse(token.lower())[0]
        lemmas.append(first_parse.normal_form)
    return lemmas


# Выделяем признаки
def review_length(sentences):
    """Return the natural logarithm of the word count reported by total_sum_word().

    Raises:
        ValueError: if the word count is zero (logarithm of 0).
    """
    word_count = total_sum_word(sentences)[1]
    return log(word_count)

def avg_word_len(sentences):
    """Return the mean word length: summed word lengths divided by word count.

    Both totals come from total_sum_word().

    Raises:
        ZeroDivisionError: if the word count is zero.
    """
    length_sum, word_count = total_sum_word(sentences)
    mean_length = length_sum / word_count
    return mean_length

def avg_sent_len(sentences):
    """Return the mean sentence length: word count divided by number of sentences.

    The word count comes from total_sum_word().

    Raises:
        ZeroDivisionError: if ``sentences`` is empty.
    """
    word_count = total_sum_word(sentences)[1]
    sentence_count = len(sentences)
    return word_count / sentence_count

def avg_use_of(words, sentences):
    """Return how often the given words are used, relative to the text size.

    Each sentence is normalized with morph_sentence(); a word scores one
    point for every sentence whose normalized form contains it (presence per
    sentence, not number of occurrences).

    Args:
        words: a single word (str) or an iterable of words, expected in
            normal form since they are compared with normalized tokens.
        sentences: iterable of sentence strings.

    Returns:
        ``use_count / (1 + total_word)``, where ``total_word`` comes from
        total_sum_word(); the ``+ 1`` keeps the division defined for empty
        input.
    """
    # A bare string would be iterated letter by letter below, so the function
    # would look for single characters instead of the word itself.
    if isinstance(words, str):
        words = [words]

    _, total_word = total_sum_word(sentences)
    use_count = 0
    for sent in sentences:
        msent = morph_sentence(sent)
        for w in words:
            if w in msent:
                use_count += 1
    return use_count / (1 + total_word)

def get_flesch_readability_score(sentences):
    """Compute a Flesch-style readability score for Russian text.

    The score is ``206.836 - 65.14 * W - 1.52 * S``, where ``W`` is the
    average number of syllables per word and ``S`` is the average number of
    words per sentence. Syllables are approximated by counting Russian vowel
    letters, with a minimum of one per word.

    NOTE(review): the coefficients differ from the classic English Flesch
    formula (1.015 / 84.6) — confirm the source of 65.14 and 1.52.

    Args:
        sentences: iterable of sentences. A sentence may be a plain string
            (split on whitespace into words) or an already tokenized
            sequence of words.

    Returns:
        The readability score as a float.

    Raises:
        ZeroDivisionError: if there are no sentences or no words at all.
    """
    vowels = 'аоиеёэыуюя'

    def nsyl(word):
        # Every token counts as at least one syllable, even without vowels.
        return max(1, sum(1 for ch in word.lower() if ch in vowels))

    syllables_per_sentence = []
    for s in sentences:
        # Iterating a raw string yields characters, which would turn every
        # character into a one-syllable "word"; split it into words first.
        words = s.split() if isinstance(s, str) else s
        syllables_per_sentence.append([nsyl(w) for w in words])

    total_words = sum(len(counts) for counts in syllables_per_sentence)
    total_syl = sum(sum(counts) for counts in syllables_per_sentence)

    W = total_syl / total_words                    # syllables per word
    S = total_words / len(syllables_per_sentence)  # words per sentence
    return 206.836 - 65.14 * W - 1.52 * S

def get_feats(review):
    """Build the feature vector for a single review.

    The review is tokenized with the module-level ``rutokenizer`` and the
    result is passed to each feature function.

    Returns:
        A list of four values, in this order: average word length, review
        length (log of the word count), relative use of the particle 'не',
        and the Flesch-style readability score.
    """
    sentences = rutokenizer.tokenize(review)
    return [avg_word_len(sentences),
            review_length(sentences),
            # Pass a list: avg_use_of iterates `words`, so a bare string
            # would be searched for letter by letter.
            avg_use_of(['не'], sentences),
            get_flesch_readability_score(sentences)]



In [None]:
from keras.models import Sequential
from keras.layers import Dense
import codecs
import pprint

pp = pprint.PrettyPrinter(indent=4)

# Fix the NumPy seed for reproducibility. NumPy is imported as `np`; the bare
# name `numpy` is not imported in the visible cells.
# NOTE(review): this seeds NumPy only — the Keras backend may need its own
# seed for fully reproducible training; confirm.
np.random.seed(2)

# Build the dataset: read one corpus file per tag, one review per line.
filecontents = []
for tag in tags:
    with codecs.open('splitted_corpora/' + tag_dict[tag]['file'], 'r', encoding='utf-8') as f:
        filecontents.append(f.read().splitlines())

# Balance the classes: take the same number of reviews from every file,
# capped at 100 per class.
min_length = min(min(map(len, filecontents)), 100)
print('min_length ' + str(min_length))
train_data = []
label_data = []
for idx, content in enumerate(filecontents):
    label = tag_dict[tags[idx]]['value']
    for row in range(0, min_length):
        feats = get_feats(content[row])
        train_data.append(feats)
        label_data.append(label)


# Build the model layer by layer.
model = Sequential()
model.add(Dense(12, input_dim=4, activation='relu'))  # input_dim matches the 4 features from get_feats
model.add(Dense(15, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # sigmoid instead of relu to output a probability

print('COMPILE')
# Compile the model with the adam optimizer.
model.compile(loss="binary_crossentropy",
              optimizer="adam", metrics=['accuracy'])

# Train the network.
NPARR_TRAINING_DATA = np.array(train_data)
NPARR_LABEL_DATA = np.array(label_data)

model.fit(NPARR_TRAINING_DATA, NPARR_LABEL_DATA,
          epochs=1000, batch_size=10)

# Evaluate the result. `X` and `Y` were never defined in the visible cells;
# evaluate on the arrays the model was fitted on.
# NOTE(review): this reports accuracy on the training data, not on a held-out
# set.
scores = model.evaluate(NPARR_TRAINING_DATA, NPARR_LABEL_DATA)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

min_length 100
COMPILE
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 