In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split

# Embedding dictionary to use. The value is the directory prefix of every
# .npy file loaded by the cells below. Alternative value kept for reference:
#dict_emb = 'ru'
dict_emb = 'ru-eng'

# Passed to prepare() as the train_size of the X_true split, i.e. how many
# X_true rows are sampled for each run.
num_true = 1000
# Passed to prepare() as the train_size of the X_eng split, i.e. how many
# X_eng rows go into the training set (the rest are held out).
num_eng = 5000
# Fraction of the X_fake + sampled-X_true pool held out as the test set.
test_size = 0.2

def prepare(num_true, num_eng, test_size, random_state=None):
    """Build one random train/test partition from the module-level feature arrays.

    Reads the globals ``X_eng``/``y_eng``, ``X_true``/``y_true`` and
    ``X_fake``/``y_fake`` at call time, so it always uses whatever feature
    columns those arrays currently hold.

    Steps:
      1. Split ``X_eng`` so that ``num_eng`` rows go to training and the
         remainder is returned as a held-out set.
      2. Sample ``num_true`` rows from ``X_true`` (the rest are discarded).
      3. Stack all of ``X_fake`` with the sampled ``X_true`` rows and split
         that pool into train/test parts using ``test_size``.
      4. Stack the ``X_eng`` training rows on top of the pool's training rows.

    Parameters
    ----------
    num_true : int or float
        ``train_size`` for the ``X_true`` split.
    num_eng : int or float
        ``train_size`` for the ``X_eng`` split.
    test_size : int or float
        ``test_size`` for the split of the fake + sampled-true pool.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to every ``train_test_split`` call. ``None`` keeps the
        previous behaviour (a different partition on every call); an int
        makes the partition reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_eng_te, y_train, y_test, y_eng_te)``.

    Raises
    ------
    ValueError
        Propagated from ``train_test_split`` when a requested size does not
        leave both sides of a split non-empty.
    """
    X_eng_tr, X_eng_te, y_eng_tr, y_eng_te = train_test_split(
        X_eng, y_eng, train_size=num_eng, random_state=random_state)
    # Only the sampled part of X_true is used; the remainder is dropped.
    X_true_s, _, y_true_s, _ = train_test_split(
        X_true, y_true, train_size=num_true, random_state=random_state)

    X_rus, X_test, y_rus, y_test = train_test_split(
        np.vstack([X_fake, X_true_s]),
        np.concatenate([y_fake, y_true_s]),
        test_size=test_size,
        random_state=random_state)

    X_train = np.vstack([X_eng_tr, X_rus])
    y_train = np.concatenate([y_eng_tr, y_rus])
    return X_train, X_test, X_eng_te, y_train, y_test, y_eng_te


def sim(x, y):
    """Cosine similarity computed along axis 1 of ``x`` and ``y``.

    For 2-D inputs this yields one value per row. A constant ``1e-5`` is
    added to the product of the norms so that all-zero rows produce ``0``
    instead of a division by zero.
    """
    dot_products = np.sum(x * y, axis=1)
    x_norms = np.linalg.norm(x, axis=1)
    y_norms = np.linalg.norm(y, axis=1)
    denominator = x_norms * y_norms + 1e-5
    return dot_products / denominator

def dist(x, y):
    """Euclidean norm of ``x - y`` taken along axis 1.

    For 2-D inputs this is the distance between corresponding rows.
    """
    difference = x - y
    return np.linalg.norm(difference, axis=1)

In [8]:
# Load the precomputed arrays for each corpus from the `dict_emb` directory.
# text_w2v / title_w2v are presumably word2vec embeddings of the body and the
# title (judging by the file names); NaNs in them are replaced with zeros by
# np.nan_to_num. Labels are loaded as-is.

# --- fake ---
X_fake = np.nan_to_num(np.load(dict_emb + '/fake/text_w2v.npy'))
X_fake_title = np.nan_to_num(np.load(dict_emb + '/fake/title_w2v.npy'))
y_fake = np.load(dict_emb + '/fake/y.npy')

# --- true ---
X_true = np.nan_to_num(np.load(dict_emb + '/true/text_w2v.npy'))
X_true_title = np.nan_to_num(np.load(dict_emb + '/true/title_w2v.npy'))
y_true = np.load(dict_emb + '/true/y.npy')

# --- eng ---
X_eng = np.nan_to_num(np.load(dict_emb + '/eng/text_w2v.npy'))
X_eng_title = np.nan_to_num(np.load(dict_emb + '/eng/title_w2v.npy'))
y_eng = np.load(dict_emb + '/eng/y.npy')

In [9]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

def get_score(n_runs=5):
    """Fit a LinearSVC on repeated random splits and print the mean scores.

    Each run calls ``prepare(num_true, num_eng, test_size)`` (module-level
    settings) to draw a fresh partition, fits a default ``LinearSVC`` on the
    training part and scores it on two held-out sets:

    * ``X_test``   -- the held-out part of the fake + sampled-true pool,
      reported on the line starting with ``ru``;
    * ``X_eng_te`` -- the held-out rows of ``X_eng``, reported on the line
      starting with ``eng``.

    Parameters
    ----------
    n_runs : int, default 5
        Number of independent split/fit/evaluate repetitions to average over.

    Returns
    -------
    None
        The mean F1 and accuracy over the runs are printed, not returned.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the mean of zero runs is undefined).
    """
    if n_runs < 1:
        raise ValueError('n_runs must be a positive integer')

    # One slot per run for each (metric, evaluation set) pair.
    f1_ru, f1_eng = np.zeros(n_runs), np.zeros(n_runs)
    acc_ru, acc_eng = np.zeros(n_runs), np.zeros(n_runs)

    for run in range(n_runs):
        X_train, X_test, X_eng_te, y_train, y_test, y_eng_te = prepare(num_true, num_eng, test_size)

        clf = LinearSVC()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_eng_pr = clf.predict(X_eng_te)

        f1_ru[run] = f1_score(y_test, y_pred)
        f1_eng[run] = f1_score(y_eng_te, y_eng_pr)
        acc_ru[run] = accuracy_score(y_test, y_pred)
        acc_eng[run] = accuracy_score(y_eng_te, y_eng_pr)

    print('ru\t f1:', np.mean(f1_ru), 'acc', np.mean(acc_ru))
    print('eng\t f1:', np.mean(f1_eng), 'acc', np.mean(acc_eng))
    
# Baseline run: when the cells are executed in order, X_fake / X_true / X_eng
# hold only the text_w2v features loaded above at this point.
get_score()



ru	 f1: 0.7256993287840949 acc 0.7286103542234332
eng	 f1: 0.7921776654350701 acc 0.7938899496615172


In [10]:
# Two extra per-document features: cosine similarity and Euclidean distance
# between the text embedding and the title embedding, each shaped (n, 1) so
# they can be appended as columns.
#
# Only the leading columns of each text matrix (as many as the title embedding
# has) are compared. On a first top-to-bottom run this is the whole matrix;
# it also lets the cell be re-run after later cells have appended feature
# columns to X_fake / X_true / X_eng, which previously failed with a shape
# mismatch.
# NOTE(review): assumes text and title embeddings have the same width -- confirm.
_fake_w2v = X_fake[:, :X_fake_title.shape[1]]
_true_w2v = X_true[:, :X_true_title.shape[1]]
_eng_w2v = X_eng[:, :X_eng_title.shape[1]]

sim_fake, dist_fake = sim(_fake_w2v, X_fake_title)[:, None], dist(_fake_w2v, X_fake_title)[:, None]
sim_true, dist_true = sim(_true_w2v, X_true_title)[:, None], dist(_true_w2v, X_true_title)[:, None]
sim_eng, dist_eng = sim(_eng_w2v, X_eng_title)[:, None], dist(_eng_w2v, X_eng_title)[:, None]

In [11]:
# Append the similarity and distance columns to the text embeddings and
# re-evaluate. prepare() reads X_fake / X_true / X_eng by name, so the
# extended matrices must be bound to those same names.
#
# The base is taken as the leading embedding columns (as many as the title
# embedding has) instead of the whole current matrix. On a first run that is
# the whole matrix; on a re-run it stops the cell from stacking a second copy
# of the sim/dist columns onto its own previous output.
# NOTE(review): assumes text and title embeddings have the same width -- confirm.
X_fake = np.hstack([X_fake[:, :X_fake_title.shape[1]], sim_fake, dist_fake])
X_true = np.hstack([X_true[:, :X_true_title.shape[1]], sim_true, dist_true])
X_eng = np.hstack([X_eng[:, :X_eng_title.shape[1]], sim_eng, dist_eng])

get_score()



ru	 f1: 0.739057554062834 acc 0.7237057220708447
eng	 f1: 0.79554531721616 acc 0.795868772782503


In [12]:
# Load the text_lda arrays (presumably LDA topic features, judging by the
# file names) and append them as additional columns, then re-evaluate.
X_fake_lda = np.load(dict_emb + '/fake/text_lda.npy')
X_true_lda = np.load(dict_emb + '/true/text_lda.npy')
X_eng_lda = np.load(dict_emb + '/eng/text_lda.npy')

# Width of the feature block that precedes the LDA columns: the embedding
# columns plus the sim/dist columns added by the previous cell. Slicing to
# this width is a no-op on a first top-to-bottom run, and on a re-run it stops
# the LDA columns from being appended a second time.
# NOTE(review): assumes the previous cell (embedding + sim + dist) has been
# run and that text and title embeddings have the same width -- confirm.
_fake_base = X_fake[:, :X_fake_title.shape[1] + sim_fake.shape[1] + dist_fake.shape[1]]
_true_base = X_true[:, :X_true_title.shape[1] + sim_true.shape[1] + dist_true.shape[1]]
_eng_base = X_eng[:, :X_eng_title.shape[1] + sim_eng.shape[1] + dist_eng.shape[1]]

X_fake = np.hstack([_fake_base, X_fake_lda])
X_true = np.hstack([_true_base, X_true_lda])
X_eng = np.hstack([_eng_base, X_eng_lda])

get_score()



ru	 f1: 0.8319154022871336 acc 0.8435967302452315
eng	 f1: 0.8092996393311438 acc 0.8157958687727824
