In [11]:
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from time import time
import pymorphy2
import re
import pickle
import random
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [None]:
class GuruBot:
    """Retrieval chatbot that replies with the stored phrase closest to the input.

    Every line of the data files is turned into a "bag" of ``lemma_POS`` keys
    that exist in the word-vector model.  An incoming message is converted the
    same way, compared with every stored bag by mean pairwise word similarity
    (plus a little random noise), and the best-scoring phrase is returned,
    optionally prefixed with a random intro and followed by a random emoticon.
    """

    # Characters that `cleanse` replaces with a space: listed punctuation,
    # digits, Latin letters and newlines.
    _NOISE_RE = re.compile(r"""[`)(|©~^<>/'"«№#$&*.,;=+?!—_@:\]\[%{}0-9A-Za-z\n]""")
    _SPACES_RE = re.compile(' +')

    # NOTE: the list/dict defaults below are never mutated by this class
    # (``pos_map`` is copied in ``__init__``), so sharing them is safe.
    def __init__(self,
                 model_path,
                 data_paths,
                 morph,
                 botfile='gurubot',
                 pos_map={
                     'NOUN': '_NOUN',
                     'VERB': '_VERB',
                     'INFN': '_VERB',
                     'GRND': '_VERB',
                     'PRTF': '_VERB',
                     'PRTS': '_VERB',
                     'ADJF': '_ADJ',
                     'ADJS': '_ADJ',
                     'ADVB': '_ADV',
                     'PRED': '_ADP',
                     'NUMR': '_NUM'
                 }, start=[
                     'Учитель',
                     'Мастер',
                     'Мудрец',
                     'Философ'
                 ], middle=[
                     # ' ' is listed three times, which makes a plain intro
                     # three times as likely as any single modifier.
                     ' ',
                     ' с улыбкой ',
                     ' ',
                     ', подумав, ',
                     ', помолчав, ',
                     ' тихо ',
                     # NOTE(review): ' уверено ' looks like a misspelling of
                     # ' уверенно '; left unchanged because it is user-visible output.
                     ' уверено ',
                     ' ',
                     ' поучительно '
                 ], end=[
                     'отвечает',
                     'произносит',
                     'говорит',
                     'объясняет'
                 ], smiles=[
                     '☺', '☻', '✌', '☹', '♡', '♥',
                     '❤', '⚘', '❀', '❃', '❁', '✼',
                     '☀', '✌', '♫', '♪', '☃', '❄',
                     '❅', '❆', '☕', '☂', '★', '💋',
                     '◕‿◕', '｡◕‿◕｡', '｡◕‿‿◕｡', '^̮^',
                     '(◕‿◕)', '(｡◕‿◕｡)', '(｡◕‿‿◕｡)',
                     '(^̮^)', 'ʘ‿ʘ', 'ಠ_ಠ', 'ಠ‿ಠ', '(ʘ‿ʘ)',
                     '(ಠ_ಠ)', '(ಠ‿ಠ)', '♥‿♥', '⊙﹏⊙',
                     '(¬_¬)', '◕‿↼', '(¬‿¬)', '◔ ⌣ ◔',
                     # Backslashes are written as '\\' so the literals contain no
                     # invalid escape sequences; the runtime strings are unchanged.
                     '(｡◕‿‿◕｡)', '¯\\_(ツ)_/¯', '(° ͜ʖ °)',
                     '¯\\(°_o)/¯', '(︺︹︺)'
                 ]):
        """Load the vectors, build the phrase library and pickle the bot.

        Args:
            model_path: word2vec-format vector file, passed to
                ``KeyedVectors.load_word2vec_format`` with ``encoding='utf-8'``.
            data_paths: iterable of UTF-8 text files, one candidate reply per line.
            morph: analyzer whose ``parse(word)[0]`` exposes ``normal_form`` and
                ``tag.POS`` (the notebook passes ``pymorphy2.MorphAnalyzer()``).
            botfile: prefix of the pickle file name; the part of ``model_path``
                before its first dot and the current Unix time are appended.
            pos_map: mapping from analyzer POS tags to the suffixes used in the
                model's keys; unmapped tags become ``'_X'``.
            start, middle, end: fragments combined as ``start + middle + end + ':'``
                into every possible intro.  An empty list yields no intros.
            smiles: emoticons appended to replies.
        """
        self.model = KeyedVectors.load_word2vec_format(model_path, encoding='utf-8')
        self.morph = morph
        self.botfile = '_'.join([botfile, model_path.split('.')[0], str(int(time()))])
        # Copy so later edits to ``bot.pos_map`` cannot leak into the shared default.
        self.pos_map = dict(pos_map)
        self.library = self.build_library(data_paths)
        self.intros = [s + m + e + ':' for s in start for m in middle for e in end]
        # Six empty strings per emoticon: most replies end without an emoticon.
        self.smiles = list(smiles) + [''] * (len(smiles) * 6)
        self.export()

    def load_data(self, data_paths):
        """Return all non-blank lines of the given UTF-8 files, in order.

        Blank and whitespace-only lines are skipped: an empty phrase could
        otherwise be selected as a reply.
        """
        data = []
        for path in data_paths:
            with open(path, encoding='utf-8') as f:
                data.extend(line for line in f.read().split('\n') if line.strip())
        return data

    def export(self, path='botfiles/', fname=''):
        """Pickle the whole bot (model included) to ``path + fname + '.pkl'``.

        Args:
            path: prefix prepended verbatim to the file name.
            fname: file name without extension; defaults to ``self.botfile``.
        """
        import os  # local import keeps this method self-contained

        fname = fname or self.botfile
        target = path + fname + '.pkl'
        # Create the output directory on first use instead of failing.
        directory = os.path.dirname(target)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(target, 'wb') as f:
            pickle.dump(self, f)

    def cleanse(self, string):
        """Lower-case ``string``, blank out noise characters, collapse runs of spaces."""
        return self._SPACES_RE.sub(' ', self._NOISE_RE.sub(' ', string.lower()))

    def lemmatize(self, string, protected=()):
        """Return the normal form of every token of the cleansed ``string``.

        ``protected`` is accepted for backward compatibility and is not used.
        """
        return [self.morph.parse(word)[0].normal_form
                for word in word_tokenize(self.cleanse(string))]

    def map_pos(self, pos):
        """Translate an analyzer POS tag into a model suffix (``'_X'`` if unmapped)."""
        return self.pos_map.get(pos, "_X")

    def _vocabulary(self):
        """Return the model's key container across gensim versions.

        gensim 4 exposes ``key_to_index`` and removed ``vocab``; older
        releases only have ``vocab``.
        """
        index = getattr(self.model, 'key_to_index', None)
        return index if index is not None else self.model.vocab

    def make_bag(self, string):
        """Convert text into the list of ``lemma_POS`` keys known to the model."""
        vocab = self._vocabulary()
        bag = []
        for lemma in self.lemmatize(string):
            # The POS is taken from the analysis of the lemma itself; a missing
            # POS becomes the string 'None' and therefore maps to '_X'.
            tagged = lemma + self.map_pos(str(self.morph.parse(lemma)[0].tag.POS))
            if tagged in vocab:
                bag.append(tagged)
        return bag

    def build_library(self, data_paths):
        """Return ``(bag, original_line)`` pairs for every loaded line."""
        return [(self.make_bag(string), string)
                for string in self.load_data(data_paths)]

    def similarity(self, bag1, bag2, rand_coef=0.01):
        """Mean pairwise model similarity of two bags plus uniform noise.

        Args:
            bag1, bag2: lists of model keys.
            rand_coef: upper bound of the noise added to the score; it breaks
                ties and varies replies between equally good candidates.

        Returns:
            The noise alone when either bag is empty or a key is unknown to
            the model, otherwise the mean similarity plus the noise.
        """
        noise = random.uniform(0, rand_coef)
        if not bag1 or not bag2:
            return noise
        try:
            total = sum(self.model.similarity(i, j) for i in bag1 for j in bag2)
        except KeyError:
            # Out-of-vocabulary key: score as "no similarity" instead of failing.
            return noise
        return total / (len(bag1) * len(bag2)) + noise

    def find_reply(self, inp):
        """Return the library phrase whose bag scores highest against ``inp``.

        Returns an empty string when the library is empty.
        """
        if not self.library:
            return ''
        best = max(self.library, key=lambda entry: self.similarity(inp, entry[0]))
        return best[1]

    def make_surface(self, reply):
        """Decorate ``reply`` with a random intro (if any) and a random emoticon.

        With an intro the first character of the reply is lower-cased; without
        intros the reply is used verbatim.
        """
        if self.intros:
            # Slicing (not indexing) keeps an empty reply from raising IndexError.
            phrase = random.choice(self.intros) + " " + reply[:1].lower() + reply[1:]
        else:
            phrase = reply
        return phrase + ' ' + random.choice(self.smiles)

    def answer(self, inp):
        """Return the decorated reply to the user message ``inp``."""
        inp_bag = self.make_bag(inp)
        return self.make_surface(self.find_reply(inp_bag))

In [2]:
class SurfaceRealizer:
    """Placeholder class: neither method does anything yet."""

    def __init__(self):
        """Create an instance; no state is set up."""

    def make_surface(self):
        """Stub; currently returns None."""
    

In [106]:
%%time
# Build the main bot from the ruscorpora vector file and four phrase files.
# Construction loads the vectors, builds the phrase library and pickles the
# bot, so this cell is slow (about 1min 49s in the recorded run).
guru = GuruBot(
    morph=pymorphy2.MorphAnalyzer(),
    model_path='ruscorpora_upos_skipgram_300_5_2018.vec.gz',
    data_paths=['twen.txt', 'proverbs.txt', 'zavets.txt', 'confuc.txt'],
)

2018-06-19 22:43:53,212 : INFO : Loading dictionaries from c:\users\mytas\appdata\local\programs\python\python36\lib\site-packages\pymorphy2_dicts\data
2018-06-19 22:43:53,321 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2018-06-19 22:43:53,325 : INFO : loading projection weights from ruscorpora_upos_skipgram_300_5_2018.vec.gz
2018-06-19 22:45:32,087 : INFO : loaded (195071, 300) matrix from ruscorpora_upos_skipgram_300_5_2018.vec.gz


Wall time: 1min 49s


In [114]:
# Interactive session with `guru`: read a line, reply, repeat.
# Typing 'стоп' (any letter case) ends the loop.
while True:
    user_text = input("...: ")
    if user_text.lower() == 'стоп':
        break
    print(guru.answer(user_text))

...: Привет, друг
Учитель уверено говорит: друзья наших друзей - наши друзья ☻
...: Ты мне нравишься
Мастер объясняет: что нравится, то и прекрасно. 
...: Скажи мне, когда уже будет обед?
Мудрец с улыбкой говорит: и скажешь - плохо, и не скажешь - плохо. 
...: Да почему же ты так думаешь?
Мудрец объясняет: что думает, то и говорит. 
...: Ты дурак
Философ, помолчав, говорит: при деньгах-то и дурак умный.  ｡◕‿‿◕｡
...: А у тебя что, денег много?
Мудрец, помолчав, отвечает: с деньгами мил, без денег постыл.  
...: Вот именно. Сидим целый месяц и ждем зарплаты.
Мудрец объясняет: дома сидеть - ничего не высидеть. 
...: стоп


In [75]:
%%time
# Same phrase files as `guru`, but scored with the araneum vector file.
# Construction loads the vectors, builds the phrase library and pickles the
# bot (about 1min 49s in the recorded run).
guru_ar = GuruBot(
    morph=pymorphy2.MorphAnalyzer(),
    model_path='araneum_upos_skipgram_300_2_2018.vec.gz',
    data_paths=['twen.txt', 'proverbs.txt', 'zavets.txt', 'confuc.txt'],
)

2018-06-17 05:46:13,114 : INFO : Loading dictionaries from c:\users\mytas\appdata\local\programs\python\python36\lib\site-packages\pymorphy2_dicts\data
2018-06-17 05:46:13,203 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2018-06-17 05:46:13,208 : INFO : loading projection weights from araneum_upos_skipgram_300_2_2018.vec.gz
2018-06-17 05:47:51,831 : INFO : loaded (196620, 300) matrix from araneum_upos_skipgram_300_2_2018.vec.gz


Wall time: 1min 49s


In [77]:
# Interactive session with the araneum-based bot built in the previous cell.
# The original cell called `guru.answer`, which talked to the ruscorpora bot
# instead and left `guru_ar` unused.
# Typing 'стоп' (any letter case) ends the loop.
while True:
    user_text = input("...: ")
    if user_text.lower() == 'стоп':
        break
    print(guru_ar.answer(user_text))

...: стоп


In [108]:
# Reload a bot that GuruBot.export() pickled during an earlier run.
# The forward slash matches the 'botfiles/' prefix used by export(), works on
# both Windows and POSIX, and removes the invalid '\g' escape sequence that
# the backslash-separated literal contained.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('botfiles/gurubot_ruscorpora_upos_skipgram_300_5_2018_1529437532.pkl', 'rb') as f:
    bott = pickle.load(f)

2018-06-19 22:46:58,859 : INFO : Loading dictionaries from c:\users\mytas\appdata\local\programs\python\python36\lib\site-packages\pymorphy2_dicts\data
2018-06-19 22:46:59,824 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [16]:
%%time
# Greeting bot: a single phrase file and an empty `start` list, so no intro
# is generated and replies are returned verbatim (plus an optional emoticon).
# Its pickle file name is prefixed with 'hellobot'.
hello = GuruBot(
    botfile='hellobot',
    start=[],
    morph=pymorphy2.MorphAnalyzer(),
    model_path='ruscorpora_upos_skipgram_300_5_2018.vec.gz',
    data_paths=['hellos.txt'],
)

2018-06-22 02:42:50,843 : INFO : Loading dictionaries from c:\users\mytas\appdata\local\programs\python\python36\lib\site-packages\pymorphy2_dicts\data
2018-06-22 02:42:50,996 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2018-06-22 02:42:51,002 : INFO : loading projection weights from ruscorpora_upos_skipgram_300_5_2018.vec.gz
2018-06-22 02:46:57,064 : INFO : loaded (195071, 300) matrix from ruscorpora_upos_skipgram_300_5_2018.vec.gz


Wall time: 4min 12s


In [18]:
# Interactive session with `hello`: read a line, reply, repeat.
# Typing 'стоп' (any letter case) ends the loop.
while True:
    user_text = input("...: ")
    if user_text.lower() == 'стоп':
        break
    print(hello.answer(user_text))

...: Привет!
Привет! ☕
...: Как тебя зовут:
Меня не зовут, я сам прихожу... Простите. 
...: Ты мне нравишься
Рад вас видеть 
...: Давай обнимемся
Хорошего вам дня! 
...: стоп
