In [20]:
# -*- coding: utf-8 -*-
from nltk.tokenize import wordpunct_tokenize 
from itertools import chain
import nltk
import sklearn_crfsuite
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import pickle
from nltk import stem
from nltk import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import json
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def word2features_per(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    word_ = stemmer.stem(word)
    features = {
        'bias': 1.0,
        'word.ismon():': is_month(word_),
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word_l1 = word1.lower()
        word1_ = stemmer.stem(word1)
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.ismon():': is_month(word1_),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word2 = sent[i+1][0]
        postag2 = sent[i+1][1]
        word_l1 = word2.lower()
        word2_ = stemmer.stem(word2)
        features.update({
            '+1:word.lower()': word2.lower(),
            '+1:word.ismon():': is_month(word2_),
            '+1:word.istitle()': word2.istitle(),
            '+1:word.isupper()': word2.isupper(),
            '+1:word.isdigit()': word2.isdigit(),
            '+1:postag': postag2,
            '+1:postag[:2]': postag2[:2],
        })
    else:
        features['EOS'] = True

    return features

In [3]:
model = ""
stemmer = nltk.PorterStemmer()
def is_month(mon):
    mons = np.array(['january', 'february', 'march', 'april', 'may', 'june', 'jule', 'august', 'september', 'october', 'november', 'december', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])
    return mon in mons

In [4]:
def sent2features_per(sent):
    return [word2features_per(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [5]:
file = open("files/crf_model",'rb')
crf_per = pickle.load(file)
file.close()

In [6]:
def mod_txt(txt):
    text_n = wordpunct_tokenize(txt)
    frames = []
    for w, ch in zip(text_n, nltk.pos_tag(text_n)):
        frames.append([w, ch[1].replace(",", "ZPT").replace(".", "TCHK").replace("?", "VOPR").replace(":", "DVTC").replace("!", "VOSKL").replace(";", "TZPT").replace("(", "SKBL").replace(")", "SKBR")])
    return np.array(frames), text_n

In [7]:
# функция удаляет найденные даты из тестового описания и формирует из них отдельный массив
def del_time(txt):
    dates = []
    mod_t, text_ = mod_txt(txt)
    X_test = [sent2features_per(mod_t)]
    res = crf_per.predict(X_test)
    indices_b = [i for i, x in enumerate(res[0]) if x == "B-TIME"]
    indices_i = [i for i, x in enumerate(res[0]) if x == "I-TIME"]
    for i in indices_b:
        if (i+1) in indices_i:
            dates.append(text_[i] + " " + text_[i+1])
            text_[i+1] = ""
        else:
            dates.append(text_[i])
        text_[i] = ""
    return ' '.join(text_), dates

In [9]:
def clean_str(string):
    string = re.sub(ur"[^A-Za-z]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [10]:
file = open("files/tf_model",'rb')
vect_ = pickle.load(file)
file.close()

In [14]:
# Используется не весь файл, а только 1500 записей, ноутбук слабенький, тормозит сильно
dt = json.loads(open('test.json').read())

In [15]:
result = []
# Проход по всем записям и формирование нового массива словарей
for dt_ in dt:
    row = " ".join(dt_["attributes"][0]["row"]["titles"])
    column = " ".join(dt_["attributes"][0]["column"]["titles"])
    table = dt_["attributes"][0]["table"]
    t1, d1 = del_time(row)
    t2, d2 = del_time(column)
    t3, d3 = del_time(table)
    t = {}
    t["id"] = dt_["_id"]["$oid"]
    t["semanticId"] = dt_['semanticId']["$oid"]
    t["placeId"] = dt_['placeId']["$oid"]
    t["txt"] = vect_.transform([t1+" "+t2+" "+t3])
    t["dt"] = d1+d2+d3
    result.append(t)

In [28]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [41]:
X = []
y = []
# Формирование обучающей выборки, где используется два признака, близость векторных представлений текстовых описаний 
# и нормированное пересечение множеств дат
for i in log_progress(range(train)):
    for j in range(train):
        if i!=j:
            temp_arr = []
            temp_arr.append(cosine_similarity(result[i]["txt"].todense(), result[j]["txt"].todense())[0][0])
            max_len = max(len(result[i]["dt"]), len(result[j]["dt"]))
            if max_len == 0:
                temp_arr.append(0)
            else:
                temp_arr.append(len(set.intersection(set(result[i]["dt"]), set(result[j]["dt"])))/float(max_len))
            X.append(temp_arr)
            if result[i]["semanticId"] == result[j]["semanticId"] or result[i]["placeId"] == result[j]["placeId"]:
                y.append(1)
            else:
                y.append(0)

In [45]:
X = np.array(X)
y = np.array(y).reshape(-1,1)

In [46]:
X.shape

(2248500, 2)

In [47]:
y.shape

(2248500, 1)

In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
# Обучается классификатор и сохраняется на диск
lr = LogisticRegression()
lr.fit(X,y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
filehandler = open("files/logreg","wb")
pickle.dump(lr, filehandler)
filehandler.close()