In [1]:
import pandas as pd
import numpy as np
import os


# Load the 'Panorama' corpus: a tab-separated metadata table plus one text file per
# metadata row, where the file name is the row position (0.txt, 1.txt, ...).
fake_data = pd.read_csv('Panorama/metatable.csv', sep='\t')

data = []
for i in range(fake_data.shape[0]):
    # The context manager closes every handle; previously one descriptor leaked per article.
    # NOTE(review): the platform default encoding is used - confirm the files' encoding
    # and pass it explicitly if it is known.
    with open('Panorama/texts/' + str(i) + '.txt') as file:
        # Lines keep their trailing newline here; it is turned into a space below.
        data.append(' '.join(file.readlines()))
fake_data = pd.concat([fake_data, pd.DataFrame(data, columns=['text'])], axis=1)

# Normalise both text columns: lower-case, every whitespace character -> ' ', then drop
# everything except latin/cyrillic lower-case letters, digits and spaces.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string by
# default, which would turn both replacements into no-ops.
for column in ('text', 'title'):
    fake_data[column] = (
        fake_data[column].str.lower()
        .str.replace(r'\s', ' ', regex=True)
        .str.replace(r'[^a-zа-я0-9 ]', '', regex=True)
    )
fake_data = fake_data[['text', 'title', 'date', 'link']]

def get_data(news):
    """Load one news source into a cleaned DataFrame.

    Reads ``<news>/newmetadata.csv`` (tab-separated, exact duplicate rows dropped) and
    every file found in ``<news>/texts``. Metadata rows and texts are each sorted by
    ``textid`` and then paired by position. ``text`` and ``textname`` are lower-cased,
    whitespace characters become single spaces, and every character outside
    ``[a-zа-я0-9 ]`` is removed.

    Parameters
    ----------
    news : str
        Path of the source directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``title`` (renamed from ``textname``), ``date`` and ``source``,
        ordered by ``date`` with a fresh 0..n-1 index.
    """
    def _normalise(column):
        # regex=True is explicit: pandas >= 2.0 defaults to literal matching, which would
        # silently disable both replacements.
        # NOTE(review): 'ё' lies outside the а-я range and is therefore removed - confirm
        # that this is intended.
        return (column.str.lower()
                .str.replace(r'\s', ' ', regex=True)
                .str.replace(r'[^a-zа-я0-9 ]', '', regex=True))

    meta_data = pd.read_csv(news + '/newmetadata.csv', sep='\t').drop_duplicates()

    # List the directory once so names and texts are guaranteed to come from the same listing.
    names = os.listdir(news + '/texts')
    texts = []
    for name in names:
        # Close each handle deterministically instead of leaking it.
        with open(news + '/texts/' + name) as file:
            texts.append(' '.join(file.readlines()))
    data = pd.DataFrame({'text': texts, 'textid': names})

    # NOTE(review): pairing is positional after two independent sorts. It is only correct if
    # the metadata `textid` values sort in the same order as the file names and both sides
    # have the same number of rows - verify against the data (a keyed merge would be safer).
    meta_data = pd.concat([meta_data.sort_values(['textid']).reset_index(drop=True),
                           data.sort_values(['textid']).reset_index(drop=True)['text']], axis=1)
    meta_data['text'] = _normalise(meta_data['text'])
    meta_data['textname'] = _normalise(meta_data['textname'])

    selected = meta_data[['text', 'textname', 'date', 'source']]
    return selected.sort_values(['date']).reset_index(drop=True).rename({'textname': 'title'}, axis=1)

# Three news sources, each loaded and cleaned by get_data.
interfax_data = get_data('interfax')
kp_data = get_data('KP')
lenta_data = get_data('Lenta')

# English data set: drop the id column, then apply the same normalisation as get_data.
eng_data = pd.read_csv('fake-news/train.csv').drop(columns=['id'])
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string by
# default, which would turn both replacements into no-ops.
for column in ('text', 'title'):
    eng_data[column] = (
        eng_data[column].str.lower()
        .str.replace(r'\s', ' ', regex=True)
        .str.replace(r'[^a-zа-я0-9 ]', '', regex=True)
    )
# Remove rows without a body text. The index is intentionally not reset, so it keeps gaps.
# NOTE(review): rows with a missing title are kept - confirm that is intended.
eng_data = eng_data.dropna(subset=['text'])

In [2]:
# Targets for the Panorama corpus: every row is labelled 1.
y_fake = np.ones(len(fake_data))
np.save('ru-eng/fake/y.npy', y_fake)

# Targets for the three news sources: every row is labelled 0. The pieces are stacked in
# the order interfax, lenta, kp.
y_interfax = np.zeros(len(interfax_data))
y_lenta = np.zeros(len(lenta_data))
y_kp = np.zeros(len(kp_data))
y_true = np.concatenate([y_interfax, y_lenta, y_kp])
np.save('ru-eng/true/y.npy', y_true)

# The English data set ships its own `label` column; it is saved as-is.
y_eng = eng_data['label']
np.save('ru-eng/eng/y.npy', y_eng)

In [4]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


# Number of columns create_d2v allocates for each document vector.
n_d2v = 64
# Doc2Vec model read from disk. `model` is a module-level name that save_emb picks up.
model = Doc2Vec.load("model/doc2vec_eng_rus.bin")


def create_d2v(data, model, name):
    """Infer one Doc2Vec vector per row of ``data[name]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the documents.
    model : object
        Model exposing ``infer_vector(list_of_tokens)``.
    name : str
        Column whose values are converted with ``str()`` and split on whitespace.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), n_d2v)``; row ``i`` is the vector inferred for the
        ``i``-th row of ``data`` (positional order).
    """
    # Iterate the column once instead of materialising a full row Series per document
    # with `data.iloc[i][name]`.
    texts = [str(value).split() for value in data[name]]
    X_d2v = np.zeros((len(texts), n_d2v))
    # NOTE(review): infer_vector may not be deterministic between runs - confirm and seed
    # the model if reproducible features are required.
    for row, tokens in enumerate(texts):
        X_d2v[row] = model.infer_vector(tokens)
    return X_d2v

def save_emb(func, name):
    """Compute and save features for the 'text' and 'title' columns of every corpus.

    For each of the two columns, ``func(frame, model, column)`` is evaluated on the
    fake, kp, lenta, interfax and eng frames (in that order). The results are written to
    ``ru-eng/fake/<column>_<name>.npy``, ``ru-eng/true/<column>_<name>.npy`` (rows
    stacked as interfax, lenta, kp) and ``ru-eng/eng/<column>_<name>.npy``.

    Parameters
    ----------
    func : callable
        Called as ``func(data, model, column)``; must return a 2-D array with one row per
        row of ``data``.
    name : str
        Suffix identifying the feature type in the output file names.

    Notes
    -----
    Reads the module-level names ``model``, ``fake_data``, ``kp_data``, ``lenta_data``,
    ``interfax_data`` and ``eng_data`` at call time, so ``model`` must be bound to the
    model matching ``func`` when this is called.
    """
    # One pass per column replaces the previously duplicated text/title halves; the call
    # order and the output paths are unchanged.
    for column in ('text', 'title'):
        X_fake = func(fake_data, model, column)
        np.save('ru-eng/fake/' + column + '_' + name + '.npy', X_fake)

        X_kp = func(kp_data, model, column)
        X_lenta = func(lenta_data, model, column)
        X_interfax = func(interfax_data, model, column)
        # Stacking order is interfax, lenta, kp.
        X_true = np.vstack([X_interfax, X_lenta, X_kp])
        np.save('ru-eng/true/' + column + '_' + name + '.npy', X_true)

        X_eng = func(eng_data, model, column)
        np.save('ru-eng/eng/' + column + '_' + name + '.npy', X_eng)

# Writes text_d2v.npy / title_d2v.npy under ru-eng/fake, ru-eng/true and ru-eng/eng.
# save_emb reads the module-level `model`, which must still be the Doc2Vec model here.
save_emb(create_d2v, 'd2v')

In [5]:
from gensim.models import Word2Vec


# Number of columns create_w2v allocates for each averaged word vector.
n_w2v = 64
# Word2Vec model read from disk. This rebinds the module-level `model` that save_emb reads.
model = Word2Vec.load("model/word2vec_eng_rus.bin")

def create_w2v(data, model, name):
    """Average the word vectors of the known tokens in each row of ``data[name]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the documents.
    model : object
        Model whose ``wv`` attribute supports ``token in model.wv`` and
        ``model.wv[token]``.
    name : str
        Column whose values are converted with ``str()`` and split on whitespace.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), n_w2v)``. Row ``i`` is the mean vector of the
        in-vocabulary tokens of the ``i``-th document; documents without any known token
        yield an all-zero row.
    """
    X = np.zeros((len(data), n_w2v))
    # Go through `model.wv` for both the membership test and the lookup; indexing the model
    # itself and `wv.vocab` are deprecated/removed gensim APIs.
    vectors = model.wv
    for row, value in enumerate(data[name]):
        hits = 0
        for token in str(value).split():
            if token in vectors:
                X[row] += vectors[token]
                hits += 1
        # Guard the division: with no known token the old `X[i] /= 0` turned the whole row
        # into NaN (and emitted a RuntimeWarning).
        if hits:
            X[row] /= hits
    return X
        
# Writes text_w2v.npy / title_w2v.npy under ru-eng/fake, ru-eng/true and ru-eng/eng.
# save_emb reads the module-level `model`, which must still be the Word2Vec model here.
save_emb(create_w2v, 'w2v')

  
  app.launch_new_instance()


In [6]:
from gensim.models import LdaModel
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary


# Number of columns create_lda allocates for each topic vector.
n_tm = 64
# LDA model read from disk. This rebinds the module-level `model` that save_emb reads.
model = LdaModel.load("model/lda_eng_rus.bin")

def create_lda(data, model, name):
    """Build a topic-weight vector for each row of ``data[name]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the documents.
    model : object
        Topic model; ``model[bow]`` must return an iterable of ``(topic_id, weight)``
        pairs for a bag-of-words document.
    name : str
        Column whose values are converted with ``str()`` and split on whitespace.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), n_tm)``; entry ``[i, t]`` is the weight the model
        returned for topic ``t`` of document ``i`` and 0 for topics it did not return.
    """
    texts = [str(value).split() for value in data[name]]

    # Token ids must come from the vocabulary the model itself carries. A Dictionary built
    # from `texts` assigns unrelated ids, so the model would be scoring the wrong words.
    # The fresh Dictionary is kept only as a fallback for models whose `id2word` cannot
    # convert documents (no `doc2bow`).
    dictionary = getattr(model, 'id2word', None)
    if not hasattr(dictionary, 'doc2bow'):
        dictionary = Dictionary(texts)

    X_tm = np.zeros((len(texts), n_tm))
    for row, tokens in enumerate(texts):
        for topic_id, weight in model[dictionary.doc2bow(tokens)]:
            X_tm[row][topic_id] = weight

    return X_tm

# Writes text_lda.npy / title_lda.npy under ru-eng/fake, ru-eng/true and ru-eng/eng.
# save_emb reads the module-level `model`, which must still be the LDA model here.
save_emb(create_lda, 'lda')