# Paraphrase Detection

In [1]:
import io
import json
import os
import pickle
import re

import chardet
import numpy as np
import pandas as pd
import pymorphy2
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from lxml import etree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
# Helper for readable printing of Russian tokens.
def bprint(x):
    """Print every element of the iterable ``x`` on its own line."""
    for item in x:
        print(item)

## The creation of dataset

In [26]:
# Build a DataFrame of sentence pairs (the paraphrase information).
# Paraphrase classes: -1: non-paraphrases, 0: loose paraphrases, 1: strict paraphrases.
# For binary classification: 0 - non-paraphrases, 1 - paraphrases.
tree = etree.parse('paraphrases.xml')  
root = tree.getroot() 
# The pair records are read from the second child of the XML root
# (presumably the first child is a header - verify against the file).
corpus = root[1]
data = []

for paraphrase in corpus:
    new_pair_data = []
    for field in paraphrase:
        # NOTE: raises AttributeError if a field has no text (field.text is None).
        new_pair_data.append(field.text.encode('utf-8'))
    data.append(new_pair_data)

Paraphrases = pd.DataFrame(np.asarray(data), columns = ['pair_id', 'id_1', 'id_2', 'text_1', 'text_2', 'jaccard', 'class'])
# Every value was read as text; convert the numeric columns.
Paraphrases[['pair_id', 'id_1', 'id_2', 'jaccard', 'class']] = \
        Paraphrases[['pair_id', 'id_1', 'id_2', 'jaccard', 'class']].apply(pd.to_numeric)
# Binarise the label: loose (0) and strict (1) paraphrases become 1, non-paraphrases (-1) become 0.
Paraphrases['class'] = Paraphrases['class'].apply(lambda x: 1 if x >=0 else 0)
Paraphrases.to_csv("Paraphrases.csv", index=False, encoding='utf-8')

In [27]:
# Build a DataFrame of sentences (one row of metadata per sentence).
tree = etree.parse('corpus.xml')  
root = tree.getroot() 
# The sentence records are read from the second child of the XML root
# (presumably the first child is a header - verify against the file).
corpus = root[1]
data = []

for sentense in corpus:
    new_sentense = []
    for field in sentense:
        # Fields without text are kept as None instead of being encoded.
        if field.text is None:
            new_sentense.append(None)
        else:
            new_sentense.append(field.text.encode('utf-8'))
    data.append(new_sentense)

Sentences = pd.DataFrame(np.asarray(data), columns = ['id', 'text', 'agency', 'author', 'url', 'date'])
# Only the id column is converted to a numeric dtype.
Sentences[['id']] = Sentences[['id']].apply(pd.to_numeric) 
Sentences.to_csv("Sentences.csv",index = False,  encoding='utf-8')

In [28]:
print "Train data shape: ", Paraphrases.shape
Paraphrases.head()

Train data shape:  (7227, 7)


Unnamed: 0,pair_id,id_1,id_2,text_1,text_2,jaccard,class
0,1,201,8159,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0.65,1
1,2,202,8158,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0.5,1
2,3,273,8167,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0.611429,1
3,4,220,8160,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,0.324037,0
4,5,223,8160,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0.606218,1


In [29]:
print "Sentences number: ", Sentences.shape
Sentences.head()

Sentences number:  (12062, 6)


Unnamed: 0,id,text,agency,author,url,date
0,1,"Избежать ""фискального обрыва"": Сенат США подде...",РБК,,http://top.rbc.ru/economics/01/01/2013/839229....,2013-01-01
1,2,"""Фискальный обрыв"" в США временно предотвращен.",РБК,,http://top.rbc.ru/economics/01/01/2013/839223....,2013-01-01
2,3,Чечня попросила националистов составить кодекс...,РБК,,http://top.rbc.ru/society/01/01/2013/839242.shtml,2013-01-01
3,4,Северокорейский лидер впервые за 19 лет поздра...,РБК,,http://top.rbc.ru/society/01/01/2013/839227.shtml,2013-01-01
4,5,В Кот-Д`Ивуаре десятки человек погибли в давке...,РБК,,http://top.rbc.ru/incidents/01/01/2013/839240....,2013-01-01


In [30]:
Paraphrases = pd.read_csv("Paraphrases.csv")
print Paraphrases.head()
Sentences = pd.read_csv("Sentences.csv")
print Sentences.head()

   pair_id  id_1  id_2                                             text_1  \
0        1   201  8159  Полицейским разрешат стрелять на поражение по ...   
1        2   202  8158  Право полицейских на проникновение в жилище ре...   
2        3   273  8167  Президент Египта ввел чрезвычайное положение в...   
3        4   220  8160  Вернувшихся из Сирии россиян волнует вопрос тр...   
4        5   223  8160  В Москву из Сирии вернулись 2 самолета МЧС с р...   

                                              text_2   jaccard  class  
0  Полиции могут разрешить стрелять по хулиганам ...  0.650000      1  
1  Правила внесудебного проникновения полицейских...  0.500000      1  
2  Власти Египта угрожают ввести в стране чрезвыч...  0.611429      1  
3  Самолеты МЧС вывезут россиян из разрушенной Си...  0.324037      0  
4  Самолеты МЧС вывезут россиян из разрушенной Си...  0.606218      1  
   id                                               text agency author  \
0   1  Избежать "фискального об

## Lemmatization and constructing the inverted index

In [3]:
# Morphological analyzer used to lemmatise the tokens.
RusLem = pymorphy2.MorphAnalyzer()
# Inverted index: {token: list of ids of the sentences in which the token occurs}
inverted_index = {}
# Forward index: {sentence id: list of tokens}
forward_index = {}

In [18]:
# Fill the forward index: lower-case every sentence, split it into word tokens
# and replace each token with the normal form of its first pymorphy2 parse.
np_sentences = np.asarray(Sentences[['id', 'text']])

for sent_id, raw_text in np_sentences:
    words = re.findall('[\w]+', raw_text.decode("utf-8").strip().lower(), re.U)
    forward_index[sent_id] = [RusLem.parse(word)[0].normal_form for word in words]

In [10]:
def create_inverted_index(forward_index):
    """Build an inverted index from a forward index.

    Parameters
    ----------
    forward_index : dict
        Maps a sentence id to the list of tokens of that sentence.

    Returns
    -------
    dict
        Maps every token to the list of ids of the sentences that contain it.
        Each id is listed at most once per token, in the iteration order of
        ``forward_index``.
    """
    inverted_index = {}
    for sent_id, tokens in forward_index.items():
        for token in tokens:
            postings = inverted_index.setdefault(token, [])
            # All tokens of one sentence are processed back to back, so a
            # repeated token can only duplicate the most recent posting.
            # Checking that single element replaces a linear scan of the list.
            if not postings or postings[-1] != sent_id:
                postings.append(sent_id)
    return inverted_index

In [20]:
# bprint(np_sentences[:5, 1])
inverted_index = create_inverted_index(forward_index)

In [103]:
# Cache both indices on disk (they are reloaded with pickle.load further down).
with open("./forward_index", 'wb') as res_file:
    pickle.dump(forward_index, res_file)
with open("./inverted_index", 'wb') as res_file:
    pickle.dump(inverted_index, res_file)

# Baseline model

## String-based features

In [31]:
np_paraphrases = np.asarray(Paraphrases[['pair_id', 'id_1', 'id_2']])
print np_paraphrases[len(np_paraphrases) - 10:]

[[25495 34424 34611]
 [25496 34596 34611]
 [25498 34488 34613]
 [25499 34614 34615]
 [25500 34616 34617]
 [25514 34622 34633]
 [25524 34566 34654]
 [25548 34519 34681]
 [25549 34565 34681]
 [25577 34584 34722]]


![Image](img1.png)

In [13]:
def _ngram_set(sequence, size, separator):
    """Return the set of ``size``-grams of ``sequence`` joined with ``separator``."""
    return set(separator.join(sequence[start:start + size])
               for start in range(len(sequence) - size + 1))


# Returns the value of one string feature for a single pair of sentences.
# paraphrase = [id_1, id_2]
def get_string_feature(forward_index, paraphrase, n_gram = 1, gram_type = 'word', feat_num = 1):
    """Compute one n-gram overlap feature for a pair of sentences.

    Parameters
    ----------
    forward_index : dict
        Maps a sentence id to its list of tokens.
    paraphrase : sequence
        ``[id_1, id_2]`` - ids of the two sentences to compare.
    n_gram : int
        N-gram size; any positive integer is accepted (earlier versions only
        supported 1-3 for words and 2-3 for symbols).
    gram_type : str
        ``'word'`` for n-grams of tokens (joined with a space) or ``'symbol'``
        for character n-grams of the tokens joined with single spaces.
    feat_num : int
        Which ratio to return, with A / B the n-gram sets of sentence 1 / 2:
        1 - |A & B| / |A | B|, 2 - |A & B| / |A|, 3 - |A & B| / |B|.

    Returns
    -------
    float, int or None
        The requested ratio; 0 when its denominator set is empty; ``None``
        (after printing a message) when a parameter value is not supported.
    """
    if gram_type not in ('word', 'symbol'):
        print("Not correct gram_type parameter")
        return None

    # Accept anything that is numerically a positive whole number (2, 2.0, numpy ints).
    try:
        size = int(n_gram)
    except (TypeError, ValueError):
        size = 0
    if size != n_gram or size < 1:
        print("Not correct n_gram parameter")
        return None

    tokens_1 = forward_index[paraphrase[0]]
    tokens_2 = forward_index[paraphrase[1]]
    if gram_type == 'word':
        set_1 = _ngram_set(tokens_1, size, " ")
        set_2 = _ngram_set(tokens_2, size, " ")
    else:
        # Character n-grams run over the lemmatised text, spaces included.
        set_1 = _ngram_set(" ".join(tokens_1), size, "")
        set_2 = _ngram_set(" ".join(tokens_2), size, "")

    if feat_num == 1:
        denominator = len(set_1 | set_2)
    elif feat_num == 2:
        denominator = len(set_1)
    elif feat_num == 3:
        denominator = len(set_2)
    else:
        print("Not correct feat_num parameter")
        return None

    # An empty denominator set means there is nothing to compare: report 0.
    return len(set_1 & set_2) / float(denominator) if denominator != 0 else 0

# Computes the string-based features for the whole paraphrase dataset (builds a numpy array).
# 15 features in total: 9 for word n-grams (1, 2, 3 words), 6 for character n-grams (2 and 3 characters).
def get_string_feature_for_all(forward_index, paraphrases):
    """Return the matrix of string features, one row per sentence pair.

    ``paraphrases`` is iterated row by row; everything after the first element
    of a row is passed to ``get_string_feature`` as the pair of sentence ids.
    Column order: for words, then for symbols - feat_num 1, 2, 3, and within
    each feat_num the n-gram sizes in increasing order.
    """
    # (gram_type, n-gram sizes) in the column order of the resulting matrix.
    layout = (('word', (1, 2, 3)), ('symbol', (2, 3)))
    feature_rows = []
    for row in paraphrases:
        sentence_ids = row[1:]
        feature_rows.append([
            get_string_feature(forward_index, sentence_ids, n_gram=size, gram_type=kind, feat_num=variant)
            for kind, sizes in layout
            for variant in (1, 2, 3)
            for size in sizes
        ])
    return np.asarray(feature_rows)

In [23]:
bprint(forward_index[np_paraphrases[0, 1]])
print 
bprint(forward_index[np_paraphrases[0, 2]])
get_string_feature(forward_index, [np_paraphrases[0, 1], np_paraphrases[0, 2]], n_gram=3, gram_type='symbol', feat_num=1)

полицейский
разрешить
стрелять
на
поражение
по
гражданин
с
травматика

полиция
мочь
разрешить
стрелять
по
хулиган
с
травматика


0.4146341463414634

In [24]:
string_features = get_string_feature_for_all(forward_index, np_paraphrases)

In [25]:
print np.asarray(string_features).shape
with open("./string_features", 'wb') as res_file:
    pickle.dump(string_features, res_file)

(7227, 15)


## IR features

    N - количество предложений 
    N(w_i) - количество предложений, в которых встречается терм w_i
    avg - средняя длина предложения в датасете

![Image](./img2.png)

In [23]:
# Number of sentences in the corpus.
N = len(forward_index.keys())

# Average sentence length (in tokens) over the corpus.
avg = sum(len(forward_index[sent_id]) for sent_id in forward_index.keys()) / float(N)

In [32]:
def get_TF(token, sent_tokens):
    """Return how many elements of ``sent_tokens`` are equal to ``token``."""
    return sum(1 for candidate in sent_tokens if candidate == token)

def create_IDF(inverted_index, n_docs = None):
    """Compute the IDF of every token of an inverted index.

    IDF(token) = log((n_docs - df + 0.5) / (df + 0.5)), where ``df`` is the
    number of sentences listed for the token. The value is negative for
    tokens that occur in more than half of the ``n_docs`` sentences.

    Parameters
    ----------
    inverted_index : dict
        Maps a token to the list of ids of the sentences containing it.
    n_docs : int, optional
        Total number of sentences of the corpus the index was built from.
        When omitted, the notebook-level global ``N`` is used (the original
        behaviour); pass it explicitly when the index belongs to another
        corpus, e.g. the test set.

    Returns
    -------
    dict
        ``{token: IDF of the token}``.
    """
    if n_docs is None:
        # Backward-compatible default: the global sentence count of the notebook.
        n_docs = N
    IDF = {}
    for token, postings in inverted_index.items():
        doc_freq = len(postings)
        IDF[token] = np.log((n_docs - doc_freq + 0.5) / float(doc_freq + 0.5))
    return IDF
        
def BM25(id_1, id_2, forward_index, IDF, k = 1.2, b = 0.75, avg_len = None):
    """BM25 score of sentence ``id_2`` for the "query" sentence ``id_1``.

    Every token of the first sentence (repetitions included) contributes
    IDF[token] * tf * (k + 1) / (tf + k * (1 - b + b * len_2 / avg_len)),
    where ``tf`` is the number of occurrences of the token in the second
    sentence and ``len_2`` is the length of the second sentence in tokens.

    Parameters
    ----------
    id_1, id_2 : hashable
        Sentence ids, keys of ``forward_index``.
    forward_index : dict
        Maps a sentence id to its list of tokens.
    IDF : dict
        Maps a token to its IDF; must contain every token of sentence ``id_1``.
    k, b : float
        The usual BM25 saturation and length-normalisation parameters.
    avg_len : float, optional
        Average sentence length of the corpus. When omitted, the
        notebook-level global ``avg`` is used (the original behaviour).

    Returns
    -------
    float or int
        The BM25 score; 0 when the first sentence has no tokens.
    """
    if avg_len is None:
        # Backward-compatible default: the global average length of the notebook.
        avg_len = avg
    tokens_1 = forward_index[id_1]
    tokens_2 = forward_index[id_2]
    # The length normalisation only depends on the second sentence,
    # so it is computed once instead of once per token.
    length_norm = k * (1 - b + b * len(tokens_2) / avg_len)
    bm25 = 0
    for token in tokens_1:
        tf = sum(1 for other in tokens_2 if other == token)
        bm25 += IDF[token] * ((tf * (k + 1)) / float(tf + length_norm))
    return bm25

# Maximum IDF among the words in which the two sentences differ.
def maxIDF(id_1, id_2, forward_index, IDF):
    """Return the largest IDF of the tokens found in exactly one of the sentences.

    Returns 0 when both sentences have the same set of tokens.
    """
    only_in_one = set(forward_index[id_1]) ^ set(forward_index[id_2])
    if not only_in_one:
        return 0
    return max(IDF[token] for token in only_in_one)

# Sum of the IDF of the words in which the two sentences differ.
def sumIDF(id_1, id_2, forward_index, IDF):
    """Return the summed IDF of the tokens found in exactly one of the sentences.

    Returns 0 when both sentences have the same set of tokens.
    """
    only_in_one = set(forward_index[id_1]) ^ set(forward_index[id_2])
    return sum(IDF[token] for token in only_in_one)

# Builds the matrix of all IR features - BM25, maxIDF, sumIDF
def get_IR_features_all(forward_index, inverted_index, paraphrases):
    """Return an array with one row (BM25, maxIDF, sumIDF) per sentence pair.

    Each row of ``paraphrases`` is read as (pair_id, id_1, id_2); the IDF table
    is built once from ``inverted_index`` with ``create_IDF``.
    """
    IDF = create_IDF(inverted_index)
    ir_extractors = (BM25, maxIDF, sumIDF)
    feature_rows = [
        [extract(row[1], row[2], forward_index, IDF) for extract in ir_extractors]
        for row in paraphrases
    ]
    return np.asarray(feature_rows)

In [33]:
IR_features = get_IR_features_all(forward_index, inverted_index, np_paraphrases)

In [34]:
print IR_features[:5]

[[ 25.51432783   7.38237325  34.89266782]
 [ 22.21936364   8.99230873  42.4968678 ]
 [ 23.43991854   8.4814002   34.48990074]
 [ 12.27017085   8.99230873  60.84430134]
 [ 21.92113702   6.87113279  36.32048699]]


In [30]:
print avg

7.96857900846


In [4]:
# Reload the cached artefacts written earlier with pickle.dump.
# The files were written in binary mode ('wb'), so they must be read in binary mode as well.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open("./forward_index", 'rb') as res_file:
    forward_index = pickle.load(res_file)
with open("./inverted_index", 'rb') as res_file:
    inverted_index = pickle.load(res_file)
with open("./string_features", 'rb') as res_file:
    string_features = pickle.load(res_file)

## Load Test Data

In [5]:
def get_test_data(class_cnt = 2):
    """Load the gold test pairs from ``paraphrases_gold.xml``.

    The records are read from the first child of the XML root. The numeric
    columns are converted with ``pd.to_numeric``; when ``class_cnt == 2`` the
    label is binarised (class >= 0 becomes 1, everything else 0). The frame is
    also written to ``Paraphrases_test.csv`` before being returned.
    """
    corpus = etree.parse('paraphrases_gold.xml').getroot()[0]
    data = [[field.text.encode('utf-8') for field in paraphrase] for paraphrase in corpus]

    all_columns = ['pair_id', 'id_1', 'id_2', 'text_1', 'text_2', 'class']
    numeric_columns = ['pair_id', 'id_1', 'id_2', 'class']
    Paraphrases = pd.DataFrame(np.asarray(data), columns = all_columns)
    Paraphrases[numeric_columns] = Paraphrases[numeric_columns].apply(pd.to_numeric)
    if class_cnt == 2:
        Paraphrases['class'] = Paraphrases['class'].apply(lambda x: 1 if x >=0 else 0)
    Paraphrases.to_csv("Paraphrases_test.csv", index=False, encoding='utf-8')
    return Paraphrases

In [6]:
np_test_data = np.asarray(get_test_data())
# np_test_sentences = np_test_data[:,[1, 2]].copy()

In [7]:
# Inverted index for the test data: {token: list of ids of the sentences in which the token occurs}
test_inverted_index = {}
# Forward index for the test data: {sentence id: list of tokens}
test_forward_index = {}

In [8]:
# Lemmatise both sentences of every test pair. A row is laid out as
# (pair_id, id_1, id_2, text_1, text_2, class), so the text of the sentence
# whose id sits in column i is found in column i + 2.
for pair in np_test_data:
    for id_col in (1, 2):
        words = re.findall('[\w]+',pair[id_col + 2].decode("utf-8").strip().lower(), re.U)
        test_forward_index[pair[id_col]] = [RusLem.parse(word)[0].normal_form for word in words]

In [11]:
test_inverted_index = create_inverted_index(test_forward_index)

In [15]:
test_string_features = get_string_feature_for_all(test_forward_index, np_test_data[:, :3])

In [19]:
test_IR_features = get_IR_features_all(test_forward_index, test_inverted_index, np_test_data[:, :3])

In [20]:
test_all_features = np.concatenate((test_string_features, test_IR_features), axis = 1)

In [21]:
print test_all_features.shape
print test_string_features.shape
print test_IR_features.shape

(1924, 18)
(1924, 15)
(1924, 3)


## Train Baseline model

### Only String Features

In [38]:
target = np.asarray(Paraphrases[['class']])

In [39]:
print string_features.shape
print target.shape

(7227, 15)
(7227, 1)


In [45]:
string_model = GradientBoostingClassifier(loss = 'deviance')

In [47]:
parametrs_grid = {'learning_rate':[0.001,0.005, 0.01, 0.05, 0.1], 'n_estimators': [100, 300, 600, 800, 1000, 1200, 1400],\
                 'subsample':[0.6, 0.7, 0.8, 1.], 'max_depth':[2, 3, 4, 5]}

In [49]:
# Tune the string-feature model: grid search over `parametrs_grid`
# with 7-fold cross-validation, scored by F1, on 10 parallel jobs.
f1_metric = make_scorer(f1_score)
grid_model = GridSearchCV(string_model, parametrs_grid, cv = 7, verbose = 1, scoring=f1_metric, n_jobs=10)
# `target` is a single-column 2-D array; ravel() flattens it to 1-D for fit().
grid_model.fit(string_features, target.ravel())

Fitting 7 folds for each of 560 candidates, totalling 3920 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   19.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  8.4min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 23.8min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 68.3min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed: 93.2min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed: 144.6min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed: 201.9min
[Parallel(n_jobs=10)]: Done 3180 tasks      | elapsed: 266.1min
[Parallel(n_jobs=10)]: Done 3920 out of 3920 | elapsed: 329.8min finished


GridSearchCV(cv=7, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=10,
       param_grid={'n_estimators': [100, 300, 600, 800, 1000, 1200, 1400], 'subsample': [0.6, 0.7, 0.8, 1.0], 'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1], 'max_depth': [2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=1)

In [50]:
grid_model.best_params_

{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.6}

In [51]:
best_GB = GradientBoostingClassifier(loss = 'deviance', learning_rate=0.01, \
                                     max_depth=4, n_estimators=100, subsample=0.6)
best_GB.fit(string_features, target.ravel())

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=0.6, verbose=0, warm_start=False)

In [61]:
test_predict = best_GB.predict(test_string_features)
print "BEST Gradient Boosting Result on String Features"
print "F1-score", f1_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)
print "Accuracy:", accuracy_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)
# print (classification_report( np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict, target_names= ['non-paraphrases', 'paraphrases']))

BEST Gradient Boosting Result on String Features
F1-score 0.789415656009
Accuracy: 0.702182952183


In [58]:
f1_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)

0.78941565600882035

In [None]:
# Persist the fitted string-feature model so it can be reloaded without refitting.
with open("string_model", "wb") as res_file:
    pickle.dump(best_GB, res_file)

In [22]:
target = np.asarray(Paraphrases[['class']])
print IR_features.shape
print target.shape

(7227, 3)
(7227, 1)


### Only IR features

In [54]:
IR_model = GradientBoostingClassifier(loss = 'deviance')

In [62]:
parametrs_grid = {'learning_rate':[0.005, 0.01, 0.1], 'n_estimators': [100, 300, 500, 700, 1000, 1300],\
                 'subsample':[0.6, 0.8, 1.], 'max_depth':[2, 3, 4, 5]}

In [64]:
f1_metric = make_scorer(f1_score)
grid_model = GridSearchCV(IR_model, parametrs_grid, cv = 7, verbose = 1, scoring=f1_metric, n_jobs=5)
grid_model.fit(IR_features, target.ravel())

Fitting 7 folds for each of 216 candidates, totalling 1512 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   16.6s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:  3.6min
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed: 12.1min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed: 23.1min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed: 38.0min
[Parallel(n_jobs=5)]: Done 1512 out of 1512 | elapsed: 48.5min finished


GridSearchCV(cv=7, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'n_estimators': [100, 300, 500, 700, 1000, 1300], 'subsample': [0.6, 0.8, 1.0], 'learning_rate': [0.005, 0.01, 0.1], 'max_depth': [2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=1)

In [65]:
grid_model.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}

In [43]:
best_GB_IR = GradientBoostingClassifier(loss = 'deviance', learning_rate=0.01, \
                                     max_depth=3, n_estimators=100, subsample=1.0)
best_GB_IR.fit(IR_features, target.ravel())

test_predict = best_GB_IR.predict(test_IR_features)
print "BEST Gradient Boosting Result on IR Features"
print 
print "F1-score", f1_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)
print "Accuracy:", accuracy_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)
# print (classification_report( np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict, target_names= ['non-paraphrases', 'paraphrases']))

BEST Gradient Boosting Result on IR Features

F1-score 0.779085872576
Accuracy: 0.668399168399


### String + IR features

In [35]:
all_features = np.concatenate((string_features, IR_features), axis = 1)
print all_features.shape

(7227, 18)


In [36]:
string_IR_model = GradientBoostingClassifier(loss = 'deviance')
parametrs_grid = {'learning_rate':[0.005, 0.01, 0.05], 'n_estimators': [100, 300, 500, 700, 1000, 1300],\
                 'subsample':[0.6, 0.7, 0.8, 1.], 'max_depth':[2, 3, 4, 5]}

In [40]:
f1_metric = make_scorer(f1_score)
grid_model = GridSearchCV(string_IR_model, parametrs_grid, cv = 7, verbose = 1, n_jobs=5, scoring=f1_metric)
grid_model.fit(all_features, target.ravel())

Fitting 7 folds for each of 288 candidates, totalling 2016 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   33.1s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:  8.6min
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed: 27.0min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed: 64.1min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed: 97.8min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed: 140.9min
[Parallel(n_jobs=5)]: Done 2016 out of 2016 | elapsed: 170.5min finished


GridSearchCV(cv=7, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'n_estimators': [100, 300, 500, 700, 1000, 1300], 'subsample': [0.6, 0.7, 0.8, 1.0], 'learning_rate': [0.005, 0.01, 0.05], 'max_depth': [2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=1)

In [42]:
grid_model.best_params_

{'learning_rate': 0.005, 'max_depth': 2, 'n_estimators': 500, 'subsample': 0.7}

In [44]:
# Refit the String+IR model and evaluate it on the gold test pairs.
# NOTE(review): the grid search above reported learning_rate=0.005 as the best
# value, but this cell uses learning_rate=0.05 - confirm whether that is a typo
# or a deliberate choice (the scores printed below were obtained with 0.05).
best_GB_IR_string = GradientBoostingClassifier(loss = 'deviance', learning_rate=0.05, \
                                     max_depth=2, n_estimators=500, subsample=0.7)
best_GB_IR_string.fit(all_features, target.ravel())

test_predict = best_GB_IR_string.predict(test_all_features)
print "BEST Gradient Boosting Result on String+IR Features"
print 
# The last column of np_test_data holds the class label.
print "F1-score", f1_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)
print "Accuracy:", accuracy_score(np.asarray(np_test_data[:, -1], dtype= np.int64), test_predict)

BEST Gradient Boosting Result on String+IR Features

F1-score 0.80221402214
Accuracy: 0.721413721414


## Experiments with Word2Vec

In [47]:
! ls -1 ../news | wc -l

291584


In [3]:
RusLem = pymorphy2.MorphAnalyzer()

In [None]:
#формирование датасета для W2V из тренировочной выборки
sentences_text = np.asarray(Sentences[['text']])
print bprint(sentences_text[0])
print len(sentences_text)
sentences = []

for sent in sentences_text:
    tokens = re.findall('[\w]+',sent[0].decode("utf-8").strip().lower(), re.U)
    tokens = [RusLem.parse(token)[0].normal_form for token in tokens]
    sentences.append(tokens)
    
print bprint(sentences[0])
print len(sentences)

In [None]:
#предобработка датасета новостных статей
news_files = os.listdir("../news")
# news_text = ""
with io.open("news_text", 'w', encoding='utf-8') as news_text_file:
    for idx, f in enumerate(news_files):
        if idx % 10000 == 0:
            print "Prepare", idx, 'file'
        with open("../news/" + f, 'r') as news_file:
            js = json.load(news_file, encoding='utf-8')
            news_text_file.write(js['text'] +  \
                                 js['title'] + '\n')

print json.load(file("../news/" + 'news_0246791.json', 'r'))['text']
print 
print json.load(file("../news/" + 'news_0246791.json', 'r'))['title']
json.load(file("../news/" + 'news_0246791.json', 'r')).keys()

Prepare 0 file
Prepare 10000 file
Prepare 20000 file
Prepare 30000 file
Prepare 40000 file
Prepare 50000 file
Prepare 60000 file
Prepare 70000 file
Prepare 80000 file
Prepare 90000 file
Prepare 100000 file
Prepare 110000 file
Prepare 120000 file
Prepare 130000 file
Prepare 140000 file
Prepare 150000 file
Prepare 160000 file
Prepare 170000 file
Prepare 180000 file
Prepare 190000 file
Prepare 200000 file
Prepare 210000 file
Prepare 220000 file


In [None]:
!wc -l news_text

In [None]:
#формирование данных для обучения Word2Vec из датасета новостных статей на русском
new_sentences = []
with open("./news_text", "r") as news_text:
    for ind, line in enumerate(news_text):
        if ind % 10000 == 0:
            print "Prepare", ind, "line"
        for sent in line.strip().split('.'):
#             print sent
            tokens = re.findall('[\w]+',sent.decode("utf-8").strip().lower(), re.U)
            tokens = [RusLem.parse(token)[0].normal_form for token in tokens]
            new_sentences.append(tokens)

In [None]:
#объединение тренировочного датасета и датасета новостей

In [None]:
# Word2Vec training.
# NOTE(review): this cell only creates the model and builds its vocabulary from
# `sentences` (the training-set sentences); no training call is made here and
# `new_sentences` (the news corpus) is not used - presumably unfinished, confirm.
model = Word2Vec(size=200, window=7, min_count=0)
model.build_vocab(sentences)

In [None]:
#создание прямого индекса из embedding слов

## Word2Vec features

Возможные признаки:
    1. Усреднение векторов всех слов
    2. Максимальная близость между двумя словами в предложении - ?? 
    3. Максимальная близость между словами, которыми отличаются 2 предложения
    4. Взвешенная сумма векторов слов: умноженное на IDF слова
    5. Попарная близость всех слов 2-х предложений -> N * N фичей, где N - максимальное количество слов в предложении