In [1]:
import pandas as pd
from lxml import html
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)


Для обучения векторных представлений необходимо большое количество текста. Чем больше текста, тем лучше предтавления получатся.  
Возьмем ~7к новостных статей. Это все ещё маленький корпус, но для обучения он подходит (на нем можно достаточно быстро попробовать разные методы). 

In [2]:
data_rt = pd.read_csv('/Users/alinashaymardanova/Downloads/news_texts.csv')
data_rt.dropna(inplace=True)

In [3]:
data_rt['content_norm'] = data_rt['content_norm'].apply(str.split)
data_rt['tokenized'] = data_rt['content'].apply(tokenize)
data_rt.to_csv('news_texts.tsv', sep='\t', index=None)
data_rt.head()

Unnamed: 0,content,content_norm,tokenized
0,Канцлер Германии Ангела Меркель в ходе брифинг...,"[канцлер, германия, ангел, меркель, ход, брифи...",канцлер германии ангела меркель в ходе брифинг...
1,Российские и белорусские войска успешно заверш...,"[российский, белорусский, войско, успешно, зав...",российские и белорусские войска успешно заверш...
2,"Дзюба, Шатов и Анюков оказались не нужны «Зени...","[дзюба, шат, анюк, оказаться, нужный, зенит, р...",дзюба шатов и анюков оказались не нужны зениту...
3,"В Испанию без фанатов\nПожалуй, главной пятнич...","[испания, фанат, пожалуй, главный, пятничный, ...",в испанию без фанатов пожалуй главной пятнично...
4,"Постпред России при ООН Виталий Чуркин, говоря...","[постпред, россия, оон, виталий, чуркин, говор...",постпред россии при оон виталий чуркин говоря ...


In [4]:
corpus_xml = html.fromstring(open('/Users/alinashaymardanova/Downloads/paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)
data['text_1_tokenized'] = data['text_1'].apply(tokenize)
data['text_2_tokenized'] = data['text_2'].apply(tokenize)

data.to_csv('paraphrases.tsv', sep='\t', index=None)

In [5]:
data_rt = pd.read_csv('news_texts.tsv', sep='\t')
data = pd.read_csv('paraphrases.tsv', sep='\t')

In [6]:
def get_embedding(text, model, dim, n_documents=None, inv_idx=None):
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i, word in enumerate(words):
        try:
            v = model.wv[word]
            if inv_idx:
                vectors[i] = v * (words[word] / total) * log(n_documents / inv_idx[word])
            else:
                vectors[i] = v
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

### NMF

In [7]:
def similarity(v1, v2):
    v1_norm = gensim.matutils.unitvec(np.array(v1))
    v2_norm = gensim.matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

def transformer(data, model, dim, inv_idx=None):
    n_documents = len(data)
    X_text = np.zeros((n_documents, dim))
                
    for i, text in enumerate(data):
        X_text[i] = get_embedding(text, model, dim, n_documents * 2, inv_idx)
    
    return X_text

def get_similarity(model, data_1, data_2, dim, vect=None, embeddings_needed=False, weighted_tfidf=False, tokenized=False):

    if embeddings_needed:
        
        if weighted_tfidf:
            if tokenized:
                X_text_1 = transformer(data_1.values, model, dim, inv_idx_tokenized)
                X_text_2 = transformer(data_2.values, model, dim, inv_idx_tokenized)
            else:
                X_text_1 = transformer(data_1.values, model, dim, inv_idx)
                X_text_2 = transformer(data_2.values, model, dim, inv_idx)
        else:    
            X_text_1 = transformer(data_1.values, model, dim)
            X_text_2 = transformer(data_2.values, model, dim)
    else:
        
        X_text_1 = model.transform(vect.transform(data_1))
        X_text_2 = model.transform(vect.transform(data_2))
        
    sim = [similarity(v1, v2) for v1, v2 in zip(X_text_1, X_text_2)]
    return sim

In [8]:
count = CountVectorizer(min_df=3, max_df=0.4, max_features=1000, lowercase=False, tokenizer=lambda x: x)
X_count = count.fit_transform(data_rt['content_norm'])

In [9]:
dim = 50
nmf_ = NMF(dim)
nmf_.fit(X_count)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [31]:
r_nmf = get_similarity(nmf_, data['text_1_norm'], data['text_2_norm'], dim, vect=count)

In [11]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000, lowercase=False, tokenizer=lambda x: x)
X_tfidf = tfidf.fit_transform(data_rt['content_norm'])

In [12]:
nmf_tfidf = NMF(dim)
nmf_tfidf.fit(X_tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [13]:
r_nmf_tfidf = get_similarity(nmf_tfidf, data['text_1_norm'], data['text_2_norm'], dim, vect=tfidf)

### SVD

In [14]:
svd_ = TruncatedSVD(dim)
svd_.fit(X_count)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [34]:
r_svd_count = get_similarity(svd_, data['text_1_norm'], data['text_2_norm'], dim, vect=count)

In [16]:
svd_tfidf = TruncatedSVD(dim)
svd_tfidf.fit(X_tfidf)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [17]:
r_svd_tfidf = get_similarity(svd_tfidf, data['text_1_norm'], data['text_2_norm'], dim, vect=tfidf)

### Fastext

##### Ne norm

In [22]:
from math import log

def idx(data):
    inverted_index = defaultdict(list)
    for i, doc in enumerate(data):
        for word in doc: 
            inverted_index[word].append(i)
    
    inv_idx = {word:len(inverted_index[word]) for word in inverted_index}
    return inv_idx

In [20]:
inv_idx = idx(np.concatenate([data['text_1_norm'], data['text_2_norm']], axis=0))
inv_idx_tokenized = idx(np.concatenate([data['text_1_tokenized'], data['text_1_tokenized']], axis=0))

In [21]:
ft = gensim.models.FastText(data_rt['tokenized'], size=dim, min_n=4, max_n=8)

In [40]:
res_ft = get_similarity(ft, data['text_1_tokenized'], data['text_2_tokenized'], dim, embeddings_needed=True)
res_ft_tfidf = get_similarity(ft, data['text_1_tokenized'], data['text_2_tokenized'], dim, embeddings_needed=True, weighted_tfidf=True, tokenized=True)

##### Norm

In [23]:
ft_norm = gensim.models.FastText(data_rt['content_norm'], size=50, min_n=4, max_n=8)
r_ft_norm = get_similarity(ft_norm, data['text_1_norm'], data['text_2_norm'], dim, embeddings_needed=True)
res_ft_norm_tfidf = get_similarity(ft_norm, data['text_1_norm'], data['text_2_norm'], dim, embeddings_needed=True, weighted_tfidf=True)

### word2vec

In [24]:
w2v = gensim.models.Word2Vec(data_rt['content_norm'], size=dim, sg=1)

In [26]:
r_w2v = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, embeddings_needed=True)

In [29]:
r_w2v_tfidf = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, embeddings_needed=True, weighted_tfidf=True)

### Объединим

In [50]:
result = pd.DataFrame({'nmf': r_nmf, 'nmf_tfidf': r_nmf_tfidf,
                    'svd': r_svd_count, 'svd_tfidf': r_svd_tfidf,
                    'ft': res_ft, 'ft_tfidf': res_ft_tfidf,
                    'ft_norm': r_ft_norm, 'ft_norm_tfidf': res_ft_norm_tfidf,
                    'w2v': r_w2v, 'w2v_tfidf': res_w2v_tfidf})

### LogisticRegression

In [47]:
from sklearn.model_selection import cross_val_score

In [51]:
lg = LogisticRegression(class_weight='balanced')
cross_val_score(lg, result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.44498552718470491

### RandomForestClassifier

In [53]:
rf = RandomForestClassifier(n_estimators=500, class_weight='balanced')
cross_val_score(rf, result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.48870008930535996

### Попробуем "поиграть" со значениями w2v, SVD и NMF

##### w2v

In [82]:
w2v = gensim.models.Word2Vec(data_rt['content_norm'], size=100, sg=1)

In [83]:
r_w2v = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, embeddings_needed=True)
r_w2v_tfidf = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, embeddings_needed=True, weighted_tfidf=True)
result.update(pd.DataFrame({'w2v': r_w2v, 
                            'w2v_tfidf': r_w2v_tfidf}))

In [84]:
cross_val_score(LogisticRegression(class_weight='balanced'), result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.44540218826781314

In [85]:
cross_val_score(RandomForestClassifier(n_estimators=500, class_weight='balanced'), result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.46629170672228132

##### svd

In [86]:
svd_ = TruncatedSVD(100)
svd_.fit(X_count)
r_svd = get_similarity(svd_, data['text_1_norm'], data['text_2_norm'], 100, vect=count)

In [87]:
svd_tfidf = TruncatedSVD(100)
svd_tfidf.fit(X_tfidf)
r_svd_tfidf = get_similarity(svd_tfidf, data['text_1_norm'], data['text_2_norm'], 100, vect=tfidf)

In [88]:
result.update(pd.DataFrame({'svd_count': r_svd, 
                            'svd_tfidf': r_svd_tfidf_second}))

In [74]:
cross_val_score(LogisticRegression(class_weight='balanced'), result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.44540218826781314

In [90]:
cross_val_score(RandomForestClassifier(n_estimators=500, class_weight='balanced'), result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.4676734945190234

##### nmf

In [91]:
nmf_ = NMF(100)
nmf_.fit(X_count)
r_nmf = get_similarity(nmf_, data['text_1_norm'], data['text_2_norm'], 100, vect=count)

In [92]:
nmf_tfidf = NMF(100)
nmf_tfidf.fit(X_tfidf)
r_nmf_tfidf = get_similarity(nmf_tfidf, data['text_1_norm'], data['text_2_norm'], 100, vect=tfidf)

In [95]:
result.update(pd.DataFrame({'nmf_count': nmf_, 
                                'nmf_tfidf': r_nmf_tfidf}))

In [99]:
cross_val_score(LogisticRegression(class_weight='balanced'), result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.44540218826781314

In [97]:
cross_val_score(RandomForestClassifier(n_estimators=500, class_weight='balanced'), result, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.46988984775800591