In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('../data/fake_or_real_news.csv')
texts = df.text.values #pd.Series -> np.ndarray
#titles = df.title.values #pd.Series -> np.ndarray
labels = df.label.values #pd.Series -> np.ndarray
for i in range(len(labels)):
    labels[i] = 1 if labels[i] == 'REAL' else 0
labels = labels.astype(int)
# view the first 5 rows 
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,title_vectors
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,[ 1.1533764e-02 4.2144405e-03 1.9692603e-02 ...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,[ 0.11267698 0.02518966 -0.00212591 0.021095...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,[ 0.04253004 0.04300297 0.01848392 0.048672...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,[ 0.10801624 0.11583211 0.02874823 0.061732...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,[ 1.69016439e-02 7.13498285e-03 -7.81233795e-...


# Remove news with no text

In [2]:
text_list = texts.tolist()
label_list = labels.tolist()
idx = -1
count = 0
for text in texts:
    idx = idx + 1
    if len(text) == 1:
        count = count + 1
        text_list.remove(text)
        label_list.remove(label_list[idx])
        idx = idx - 1
labels = np.array(label_list)
texts = np.array(text_list)

print("There are {} texts been removed.".format(count))

There are 36 texts been removed.


In [3]:
text_list = texts.tolist()
label_list = labels.tolist()
idx = -1
count = 0
for text in texts:
    idx = idx + 1
    if len(text) == 1:
        count = count + 1

print(count)

0


# Preprocess Data

In [4]:
from preprocess_data import *

# “_token”is divided by text, “_token_corpus”combines all the texts,so that it only has one dimension
word_token = [nltk.word_tokenize(text) for text in texts]
word_token_corpus = []
for i in range(len(word_token)):
    word_token_corpus.extend(word_token[i])
print(len(word_token_corpus))

sentence_token = [nltk.sent_tokenize(text) for text in texts]
sentence_token_corpus = []
for i in range(len(sentence_token)):
    sentence_token_corpus.extend(sentence_token[i])
print(len(sentence_token_corpus))

#title_token = tokenize(titles)

n = 0 #arbitrary pick
#print('Example review:\n   Raw: {} \n\n   Tokenized: {}'.format(titles[n], [i for i in title_token[n]]))
print('Example review:\n   Raw: {} \n\n   Tokenized: {}'.format(texts[n], [i for i in word_token[n]]))


5708383
217376
Example review:
   Raw: Daniel Greenfield, a Shillman Journalism Fellow at the Freedom Center, is a New York writer focusing on radical Islam. 
In the final stretch of the election, Hillary Rodham Clinton has gone to war with the FBI. 
The word “unprecedented” has been thrown around so often this election that it ought to be retired. But it’s still unprecedented for the nominee of a major political party to go war with the FBI. 
But that’s exactly what Hillary and her people have done. Coma patients just waking up now and watching an hour of CNN from their hospital beds would assume that FBI Director James Comey is Hillary’s opponent in this election. 
The FBI is under attack by everyone from Obama to CNN. Hillary’s people have circulated a letter attacking Comey. There are currently more media hit pieces lambasting him than targeting Trump. It wouldn’t be too surprising if the Clintons or their allies were to start running attack ads against the FBI. 
The FBI’s leadersh

In [5]:
print(len(word_token))
print(len(sentence_token))
print(len(labels))
print(sentence_token[0])


6299
6299
6299


In [6]:
from collections import Counter

#Note that we convert all tokens to lower case, otherwise words like *The* and *the* are different tokens.
text_counter = Counter(token.lower() for sentence in word_token for token in sentence)
top10 = text_counter.most_common()[:10]
for i, t in enumerate(top10):
    print('{:>2}.{:>5}  freq: {:>7}'.format(i+1, t[0], t[1]))

 1.  the  freq:  290192
 2.    ,  freq:  255957
 3.    .  freq:  204416
 4.   to  freq:  139223
 5.   of  freq:  129553
 6.  and  freq:  119235
 7.    a  freq:  107808
 8.   in  freq:   97993
 9. that  freq:   72278
10.    ’  freq:   58461


## Remove punctuations and stopwords, lower-case text, implement stemming and lemmatization

In [7]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
from itertools import chain

english_stemmer = nltk.stem.SnowballStemmer('english')
english_lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
punct = punctuation

def lemmatize_tokens(tokens, lemmatizer):
    lemmatized = []
    for doc in tokens:
        lemmatized.append([lemmatizer.lemmatize(token) for token in doc])
    return lemmatized

def stem_tokens(tokens, stemmer):
    stemmed = []
    for doc in tokens:
        stemmed.append([stemmer.stem(token) for token in doc])
    return stemmed

def clean_text(tokenized_list,lemmatize=True,stem=True):
    tokens = []
    for doc in tokenized_list:
        tokens.append([token.lower() for token in doc if token.lower() not in chain(punct, stopwords)])
    tokens_cleaned = tokens
    if lemmatize:
        tokens_cleaned = lemmatize_tokens(tokens_cleaned, english_lemmatizer)
    if stem:
        tokens_cleaned = stem_tokens(tokens_cleaned, english_stemmer)
      
    return tokens_cleaned

text_cleaned = clean_text(word_token)

In [8]:
print(len(text_cleaned))
print(text_cleaned[0])

6299
['daniel', 'greenfield', 'shillman', 'journal', 'fellow', 'freedom', 'center', 'new', 'york', 'writer', 'focus', 'radic', 'islam', 'final', 'stretch', 'elect', 'hillari', 'rodham', 'clinton', 'gone', 'war', 'fbi', 'word', '“', 'unpreced', '”', 'thrown', 'around', 'often', 'elect', 'ought', 'retir', '’', 'still', 'unpreced', 'nomine', 'major', 'polit', 'parti', 'go', 'war', 'fbi', '’', 'exact', 'hillari', 'peopl', 'done', 'coma', 'patient', 'wake', 'watch', 'hour', 'cnn', 'hospit', 'bed', 'would', 'assum', 'fbi', 'director', 'jame', 'comey', 'hillari', '’', 'oppon', 'elect', 'fbi', 'attack', 'everyon', 'obama', 'cnn', 'hillari', '’', 'peopl', 'circul', 'letter', 'attack', 'comey', 'current', 'medium', 'hit', 'piec', 'lambast', 'target', 'trump', '’', 'surpris', 'clinton', 'alli', 'start', 'run', 'attack', 'ad', 'fbi', 'fbi', '’', 'leadership', 'warn', 'entir', 'left-w', 'establish', 'form', 'lynch', 'mob', 'continu', 'go', 'hillari', 'fbi', '’', 'credibl', 'attack', 'medium', 'demo

In [9]:
#Note that we convert all tokens to lower case, otherwise words like *The* and *the* are different tokens.
text_counter = Counter(token.lower() for sentence in text_cleaned for token in sentence)
top10 = text_counter.most_common()[:10]
for i, t in enumerate(top10):
    print('{:>2}.{:>5}  freq: {:>7}'.format(i+1, t[0], t[1]))

 1.    ’  freq:   58461
 2.    “  freq:   34978
 3.    ”  freq:   34368
 4.trump  freq:   21993
 5. said  freq:   21162
 6.   ''  freq:   19960
 7.   's  freq:   19813
 8.   ``  freq:   19684
 9.clinton  freq:   17788
10.state  freq:   15374


## Uni-grams, bi-grams and tri-grams are created out of the list of tokens

In [41]:
# "_corpus" combines all the articles together so this list only has one dimension 

from ngram import *

join_str = "_"
unigram = []
unigram_corpus = []
bigram = []
bigram_corpus = []
trigram = []
trigram_corpus = []
for doc in text_cleaned:
    unigram.append(getUnigram(doc))
    unigram_corpus.extend(getUnigram(doc))
    bigram.append(getBigram(doc, join_str))
    bigram_corpus.extend(getBigram(doc, join_str))
    trigram.append(getTrigram(doc, join_str))
    trigram_corpus.extend(getTrigram(doc, join_str))

print(unigram[0])
print(bigram[0])
print(trigram[0])

['daniel', 'greenfield', 'shillman', 'journal', 'fellow', 'freedom', 'center', 'new', 'york', 'writer', 'focus', 'radic', 'islam', 'final', 'stretch', 'elect', 'hillari', 'rodham', 'clinton', 'gone', 'war', 'fbi', 'word', '“', 'unpreced', '”', 'thrown', 'around', 'often', 'elect', 'ought', 'retir', '’', 'still', 'unpreced', 'nomine', 'major', 'polit', 'parti', 'go', 'war', 'fbi', '’', 'exact', 'hillari', 'peopl', 'done', 'coma', 'patient', 'wake', 'watch', 'hour', 'cnn', 'hospit', 'bed', 'would', 'assum', 'fbi', 'director', 'jame', 'comey', 'hillari', '’', 'oppon', 'elect', 'fbi', 'attack', 'everyon', 'obama', 'cnn', 'hillari', '’', 'peopl', 'circul', 'letter', 'attack', 'comey', 'current', 'media', 'hit', 'piec', 'lambast', 'target', 'trump', '’', 'surpris', 'clinton', 'alli', 'start', 'run', 'attack', 'ad', 'fbi', 'fbi', '’', 'leadership', 'warn', 'entir', 'left-w', 'establish', 'form', 'lynch', 'mob', 'continu', 'go', 'hillari', 'fbi', '’', 'credibl', 'attack', 'media', 'democrat', 

# Count Feature Generater

# TF-IDF Feature Generater

In [32]:
# Counnt words' frequency

countlist = []
for i in range(len(text_cleaned)):
    count = Counter(text_cleaned[i])
    countlist.append(count)
print(countlist[0])

Counter({'’': 32, 'fbi': 31, 'hillari': 18, 'clinton': 18, 'comey': 11, 'investig': 8, 'email': 8, 'time': 6, 'campaign': 6, 'fear': 6, 'elect': 5, 'war': 5, 'go': 5, 'peopl': 5, 'attack': 5, 'scandal': 5, 'gone': 4, '“': 4, 'unpreced': 4, '”': 4, 'around': 4, 'jame': 4, 'media': 4, 'realli': 4, 'act': 4, 'one': 4, 'new': 3, 'focus': 3, 'polit': 3, 'director': 3, 'obama': 3, 'warn': 3, 'democrat': 3, 'claim': 3, 'accus': 3, 'kgb': 3, 'violat': 3, 'hatch': 3, 'awkward': 3, 'way': 3, 'tri': 3, 'victori': 3, 'associ': 3, 'decid': 3, 'old': 3, 'assault': 3, 'fight': 3, 'panick': 3, 'desper': 3, 'afraid': 3, 'bigger': 3, 'setup': 3, 'whatev': 3, 'york': 2, 'final': 2, 'stretch': 2, 'still': 2, 'nomine': 2, 'cnn': 2, 'everyon': 2, 'letter': 2, 'trump': 2, 'alli': 2, 'leadership': 2, 'credibl': 2, 'preemptiv': 2, 'foundat': 2, 'agent': 2, 'doj': 2, 'public': 2, 'hoover': 2, 'bizarr': 2, 'appear': 2, 'hous': 2, 'republican': 2, 'right': 2, 'conspiraci': 2, 'countless': 2, 'procedur': 2, 'know'

In [33]:
# word可以通过count得到，count可以通过countlist得到
# count[word]可以得到每个单词的词频， sum(count.values())得到整个句子的单词总数
def tf(word, count):
    return count[word] / sum(count.values())

# 统计的是含有该单词的句子数
def n_containing(word, count_list):
    return sum(1 for count in count_list if word in count)

# len(count_list)是指句子的总数，n_containing(word, count_list)是指含有该单词的句子的总数，加1是为了防止分母为0
def idf(word, count_list):
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))

# 将tf和idf相乘
def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

In [37]:
import math

tf_idf = []

for i, count in enumerate(countlist[0:10]):
    scores = {word: tfidf(word, count, countlist) for word in count}
    tf_idf.append(scores)
    
print("Top words in document 1")
sorted_words = sorted(tf_idf[0].items(), key=lambda x: x[1], reverse=True)
for word, score in sorted_words[:]:
    print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
	Word: fbi, TF-IDF: 0.10067
	Word: comey, TF-IDF: 0.04956
	Word: kgb, TF-IDF: 0.02723
	Word: hillari, TF-IDF: 0.02633
	Word: clinton, TF-IDF: 0.02455
	Word: setup, TF-IDF: 0.0239
	Word: panick, TF-IDF: 0.02219
	Word: investig, TF-IDF: 0.02075
	Word: scandal, TF-IDF: 0.01998
	Word: clintonworld, TF-IDF: 0.01916
	Word: email, TF-IDF: 0.0188
	Word: ’, TF-IDF: 0.01866
	Word: awkward, TF-IDF: 0.0185
	Word: fear, TF-IDF: 0.01842
	Word: unpreced, TF-IDF: 0.01822
	Word: hatch, TF-IDF: 0.01809
	Word: hubri, TF-IDF: 0.01666
	Word: preemptiv, TF-IDF: 0.01594
	Word: afraid, TF-IDF: 0.01521
	Word: gone, TF-IDF: 0.01499
	Word: hoover, TF-IDF: 0.01479
	Word: jame, TF-IDF: 0.01456
	Word: desper, TF-IDF: 0.01328
	Word: bigger, TF-IDF: 0.01322
	Word: doj, TF-IDF: 0.0122
	Word: countless, TF-IDF: 0.01179
	Word: assault, TF-IDF: 0.01156
	Word: bizarr, TF-IDF: 0.01153
	Word: spinmeist, TF-IDF: 0.01135
	Word: violat, TF-IDF: 0.01115
	Word: whatev, TF-IDF: 0.01113
	Word: war, TF-IDF: 

In [22]:
"""
我想做的是把句子里所有的单词用word2vec模型训练得到词向量，然后把这些向量乘以我们之前得到的tfidf值，
再把它们加起来除以单词数，就可以得到句子向量。也就是结合tfidf给单词加上一个权重，评判一个单词的重要程度。
"""

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec= TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=2,stop_words='english')
tfidf_vec.fit_transform(texts)
vocabulary = tfidf_vec.get_feature_names()
print(len(vocabulary))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


476855


# Word2Vec Feature Generater

In [38]:
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
from gensim import models

# Training word2vec model on already cleaned text. This may take a few minutes.
word2vec = models.Word2Vec(text_cleaned,
                        size = 300,
                        window = 5,
                        min_count = 1,                      # set "min_count" = 1 to make sure every word corresponds to a vector, in case something go wrong in the Naive Doc2Vec process
                        sg = 0,
                        alpha = 0.025,                      # if I set alpha = 0.01, performance will be much worse
                        iter=10,
                        batch_words = 10000)





In [None]:
import numpy as np
# loads 300x1 word vectors from file.
def load_bin_vec(fname, vocab):
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split()) # 3000000 300
        binary_len = np.dtype('float32').itemsize * layer1_size # 1200
        for line in range(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs

# add random vectors of unknown words which are not in pre-trained vector file.
# if pre-trained vectors are not used, then initialize all words in vocab with random value.
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)

text_cleaned_corpus = []
for i in range(len(text_cleaned)):
    text_cleaned_corpus.extend(text_cleaned[i])
    
vectors_file =  './GoogleNews-vectors-negative300.bin'
vocab= text_cleaned_corpus

word2vec = load_bin_vec(vectors_file, vocab)  # pre-trained vectors
add_unknown_words(word2vec, vocab)


# Sentiment Feature Generator

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np

sid = SentimentIntensityAnalyzer()
def compute_sentiment(sentences):
    result = []
    for sentence in sentences:
        vs = sid.polarity_scores(sentence)
        score = vs[]
        result.append(vs)
    
    result_np = np.array(result).mean(axis=0)
    result = result_np.tolist()
    return result

df = pd.concat([df, df['headline_sents'].apply(lambda x: compute_sentiment(x))], axis=1)





# Doc2Vec Feature Generater

In [10]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import reuters

tokenized_docs = text_cleaned

print('tokenized_docs:\n',tokenized_docs[0])

# Convert tokenized documents to TaggedDocuments
tagged_docs = [TaggedDocument(doc, tags=[idx]) for idx, doc in enumerate(tokenized_docs)]

print('tagged_docs:\n',tagged_docs[0])

# Create and train the doc2vec model and fortify it with pre-trained Word2Vec models
pretrained_emb = 'GoogleNews-vectors-negative300.bin' 
doc2vec = Doc2Vec(vector_size=300, window=5, min_count=5, dm = 1, epochs=10, pretrained_emb=pretrained_emb)

# Build the word2vec model from the corpus and  
doc2vec.build_vocab(tagged_docs) 

# Train the models
doc2vec.train(tagged_docs, epochs=10, total_examples=doc2vec.corpus_count)



tokenized_docs:
 ['daniel', 'greenfield', 'shillman', 'journal', 'fellow', 'freedom', 'center', 'new', 'york', 'writer', 'focus', 'radic', 'islam', 'final', 'stretch', 'elect', 'hillari', 'rodham', 'clinton', 'gone', 'war', 'fbi', 'word', '“', 'unpreced', '”', 'thrown', 'around', 'often', 'elect', 'ought', 'retir', '’', 'still', 'unpreced', 'nomine', 'major', 'polit', 'parti', 'go', 'war', 'fbi', '’', 'exact', 'hillari', 'peopl', 'done', 'coma', 'patient', 'wake', 'watch', 'hour', 'cnn', 'hospit', 'bed', 'would', 'assum', 'fbi', 'director', 'jame', 'comey', 'hillari', '’', 'oppon', 'elect', 'fbi', 'attack', 'everyon', 'obama', 'cnn', 'hillari', '’', 'peopl', 'circul', 'letter', 'attack', 'comey', 'current', 'medium', 'hit', 'piec', 'lambast', 'target', 'trump', '’', 'surpris', 'clinton', 'alli', 'start', 'run', 'attack', 'ad', 'fbi', 'fbi', '’', 'leadership', 'warn', 'entir', 'left-w', 'establish', 'form', 'lynch', 'mob', 'continu', 'go', 'hillari', 'fbi', '’', 'credibl', 'attack', 'me

In [16]:
doc2vec.infer_vector(text_cleaned[1])

array([ 0.07077232, -0.31755018,  0.2779891 , -0.03377091, -0.36201048,
        0.00343469, -0.20409313,  0.10761351, -0.02697085, -0.3241465 ,
        0.5139417 , -0.02823218, -0.38517526, -0.315487  ,  0.01282835,
       -0.13524926, -0.04464984,  0.11075468,  0.16916813, -0.00628345,
        0.20759624,  0.0963446 ,  0.4673685 , -0.54110324, -0.1448787 ,
       -0.15828173,  0.31879193, -0.04130795, -0.31499118,  0.00587979,
        0.3361764 ,  0.34574395, -0.02981369,  0.5409379 , -0.3622694 ,
        0.58672017, -0.47682056,  0.04214418, -0.2911609 ,  0.675423  ,
        0.7465637 ,  0.12917386, -0.05853115,  0.58302987, -0.06304032,
       -0.346226  ,  0.3399946 ,  0.18615772, -0.7023198 , -0.25676477,
        0.38956323,  0.12842172,  0.13831459, -0.11237689,  0.02982863,
       -0.43031287,  0.16103123, -0.1236412 , -0.37977496,  0.17551635,
       -0.5227135 ,  0.28007472,  0.1081889 , -0.17923374, -0.0622146 ,
       -0.2669949 , -0.30767444, -0.06587666, -0.48056823,  0.60

# Classification

## Naive Doc2Vec

In [13]:
import numpy as np

naive_doc2vec = np.zeros((len(text_cleaned),300))
idx = -1
for text in text_cleaned:
    idx = idx + 1
    num_words = len(text)
    for word in text:
        word_vector = word2vec[word]
        naive_doc2vec[idx,:] = naive_doc2vec[idx,:] + word_vector
    naive_doc2vec[idx,:] = naive_doc2vec[idx,:] / num_words

  if __name__ == '__main__':


## Doc2Vec

In [12]:
doc2vec_data = np.zeros((len(text_cleaned),300))
for i in range (len(text_cleaned)):
    doc2vec_data[i] = doc2vec.infer_vector(text_cleaned[i])

print(doc2vec_data[0])

[ 0.51819539 -0.12092809 -0.20348667  0.76816881  0.0138858   0.4003658
  0.89415216 -0.30314958  0.25177854 -0.34885514  0.35436589 -0.48367032
  0.21342364 -0.90782326 -0.48038501 -0.53868455 -0.4379901   1.04964387
  0.41214576 -0.0806098  -0.12632394  0.20721331  0.95239371  0.17302765
  1.18847191 -0.35850468  0.91666836 -0.29147929  0.53213376 -0.36548105
 -0.08059768  0.27350482  0.40128538  1.11018956 -0.66491938  0.88129199
 -0.67434478 -0.01406762  1.11803782  0.70735598  0.11120999  0.11006031
 -0.43761513 -0.76334983 -1.0623982  -0.10690411  0.61261582  0.05220818
 -0.15658911 -0.97246623  0.24973607  0.52804804  0.90718174  0.71839482
  0.04299706 -0.57961375 -0.1190193  -0.29958636  0.10201645  1.33187795
 -0.31167826  0.31917936  0.45413998  0.44677228 -0.90603638  0.15214166
 -0.22412725 -0.89174181  0.4214083   0.64495748 -0.07403875  0.11315721
  0.41004667  0.24274172  0.13460805 -1.23426855 -1.20298696 -0.5195514
 -0.22273576 -0.79252225  0.29600745 -1.32290065 -0.1

## Merge Features

In [13]:
data = doc2vec_data 
print(len(data))

6299


In [14]:
from sklearn.model_selection import train_test_split

# split data into train and test sets
seed = 42
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=seed)
print(X_test.shape)

(2079, 300)


In [15]:
# instantiate a SVM regression model, and fit with X and y
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score


model = LogisticRegression()
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(y_pred[0:10])
# check the accuracy on the training set
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
score = model.score(X_test, y_test)

p = precision_score(y_test, y_pred, average='binary')
r = recall_score(y_test, y_pred, average='binary')
f1score = f1_score(y_test, y_pred, average='binary')
print('accuracy:',score)
print('precision:',p)
print('recall:',r)
print('f1score:',f1score)

[0 1 0 1 0 0 0 1 0 1]
[[500 517]
 [518 544]]
accuracy: 0.5021645021645021
precision: 0.5127238454288408
recall: 0.512241054613936
f1score: 0.5124823363165333
