In [1]:
import pandas as pd
from lxml import html
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)

  from numpy.core.umath_tests import inner1d


In [2]:
tfidf=TfidfVectorizer()
import adagram

In [12]:
def get_embedding_adagram(text, model, window, dim):
    text = text.split()
    
    
    word2context = []
    for i in range(len(text)-1):
        left = max(0, i-window)
        word = text[i]
        left_context = text[left:i]
        right_context = text[i+1:i+window]
        context = left_context + right_context
        word2context.append((word, context))
    
    
    
    vectors = np.zeros((len(word2context), dim))
    
    for i,word in enumerate(word2context):
        word, context = word
        try:
            sense = model.disambiguate(word, context).argmax()
            v = model.sense_vector(word, sense)
            vectors[i] = v # просто умножаем вектор на частоту
        
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

def get_embedding(text, model, dim):
    text = text.split()
    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [4]:
data_rt = pd.read_csv('news_texts.csv', encoding = 'utf-8')
data_rt.dropna(inplace=True)
corpus = ' '.join(data_rt.content_norm)
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []
f = open('corpus.txt', 'w', encoding = 'utf-8')
f.write(corpus)
f.close()
for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

<b>SVD</b>  
Берем все из тетрадки 4 семинара.

In [5]:
cv = CountVectorizer(max_features=7227)
X = cv.fit_transform(data['text_1_norm'])
Y = cv.fit_transform(data['text_2_norm'])
svd = TruncatedSVD(50)

In [6]:
svd_X = svd.fit(X)
svd_Y = svd.fit(Y)
id2vec_svd_X = svd_X.components_.T
id2vec_svd_Y = svd_Y.components_.T

In [7]:
cosim_svd = cosine_distances(id2vec_svd_X,id2vec_svd_Y)

<b>NMF</b>  
Берем оттуда же.

In [8]:
nmf = NMF(50)
nmf_X = nmf.fit(X)
nmf_Y = nmf.fit(Y)
id2vec_nmf_X = nmf_X.components_.T
id2vec_nmf_Y = nmf_Y.components_.T

In [9]:
cosim_nmf = cosine_distances(id2vec_nmf_X,id2vec_nmf_Y)

<b>Word2Vec</b>

In [10]:
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=50, sg=1)

In [13]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)



In [14]:
cosim_w2v = cosine_distances(X_text_1_w2v, X_text_2_w2v)

<b>Fasttext</b>

In [15]:
fast_text = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=50, min_n=4, max_n=8)

In [16]:
dim = 50
X_text_1_Ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_Ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_Ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_Ft[i] = get_embedding(text, fast_text, dim)



In [17]:
cosim_Ft = cosine_distances(X_text_1_Ft, X_text_2_Ft)

<b>Adagram</b>

In [18]:
vm = adagram.VectorModel.load("out.pkl")
dim = 50
X_text_1_vm = np.zeros((len(data['text_1_norm']), dim))
X_text_2_vm = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_vm[i] = get_embedding_adagram(text, vm, 5, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_vm[i] = get_embedding_adagram(text, vm, 5, dim)

  z = np.log(z)


In [19]:
cosim_vm = cosine_distances(X_text_1_vm,X_text_2_vm)

Посмотрим на пары.

In [20]:
X_test = np.concatenate((cosim_svd, cosim_nmf, cosim_Ft, cosim_w2v, cosim_vm), axis=-1)

In [21]:
y = data['label'].values

In [22]:
train_X, valid_X, train_y, valid_y = train_test_split(X_test, y, random_state=1)

In [25]:
clf = RandomForestClassifier(n_estimators=20)
print(np.mean(cross_val_score(clf, train_X, train_y,scoring="f1_micro")))

0.41955783699133214


Попробуем улучшить результат изменением параметров:

In [26]:
clf = RandomForestClassifier(n_estimators=50)
print(np.mean(cross_val_score(clf, train_X, train_y,scoring="f1_micro")))

0.44446466849009925


Это улучшило результат. Попробуем еще добавить.

In [28]:
clf = RandomForestClassifier(n_estimators=100, max_depth=15)
print(np.mean(cross_val_score(clf, train_X, train_y,scoring="f1_micro")))

0.4634682236321855


Еще немного улучшения результата, но уже незначительно.