In [None]:
!pip install googledrivedownloader;

import os
import re

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from google_drive_downloader import GoogleDriveDownloader as gdd

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    """Randomly partition the rows of ``df`` into train, validation and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are split. Any index is accepted; row labels are
        carried over unchanged into the returned frames.
    train_percent : float
        Fraction of the rows placed in the training frame (the row count is
        truncated with ``int``).
    validate_percent : float
        Fraction of the rows placed in the validation frame (also truncated).
        Whatever remains after train and validation goes to the test frame.
    seed : int or None
        Value handed to ``np.random.seed``. This reseeds NumPy's *global*
        generator, so random draws made after this call are affected too.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``, disjoint and together covering ``df``.
    """
    # The global reseed is kept deliberately: code that runs after this call
    # and draws from np.random without its own seed sees the seeded state.
    np.random.seed(seed)
    m = len(df.index)
    # Permute row *positions*, not index labels: the permutation is consumed by
    # .iloc below, which is purely positional. Permuting df.index only worked
    # when the labels happened to be exactly 0..m-1.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test



In [None]:
# Cell values that are treated as a blank article field in the raw CSV files.
BLANK_VALUES = [' ', '  ']
# Directory that receives the shuffled train / validation / test CSV files.
PREPROCESSED_DIR = './preprocessed_data'


def print_column_summary(df, column, total_suffix=''):
    """Print entry count, blank count and their difference for ``df[column]``.

    ``total_suffix`` is appended to the last printed line (used to emit the
    blank separator line after the text statistics).
    """
    # NOTE(review): despite the "unique" label this is the number of non-null
    # entries (which is what ``value_counts().sum()`` yields), not the number
    # of distinct values. The printed text is left unchanged on purpose.
    non_null = int(df[column].count())
    blank = int(df[column].isin(BLANK_VALUES).sum())
    print("# unique " + column + "s = " + str(non_null))
    print("# of empty " + column + "s = " + str(blank))
    # NOTE(review): the label reads "Total texts" for the title column as well.
    print("Total texts = " + str(non_null - blank) + total_suffix)


def load_labelled_news(csv_path, label, name):
    """Read one news CSV, attach ``label``, print its statistics.

    Returns the frame without the 'subject' and 'date' columns.
    """
    news = pd.read_csv(csv_path, delimiter=',')
    news['label'] = label
    print("# " + name + " news = " + str(len(news)) + '\n')
    print_column_summary(news, 'text', total_suffix='\n')
    print_column_summary(news, 'title')
    return news.drop(columns=['subject', 'date'])


fake = load_labelled_news('Fake.csv', label=0, name='fake')
fake_trn, fake_val, fake_tst = train_validate_test_split(fake, train_percent=0.7, validate_percent=0.2, seed=1)

true = load_labelled_news('True.csv', label=1, name='true')
true_trn, true_val, true_tst = train_validate_test_split(true, train_percent=0.7, validate_percent=0.2, seed=1)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The shuffles pass no random_state, so they draw from NumPy's global
# generator, which train_validate_test_split seeded just above.
df_trn = pd.concat([true_trn, fake_trn]).sample(frac=1).reset_index(drop=True)
df_val = pd.concat([true_val, fake_val]).sample(frac=1).reset_index(drop=True)
df_tst = pd.concat([true_tst, fake_tst]).sample(frac=1).reset_index(drop=True)

print("Training set: " + str(len(df_trn)))
print("Validation set: " + str(len(df_val)))
print("Test set: " + str(len(df_tst)))

# to_csv does not create missing parent directories.
os.makedirs(PREPROCESSED_DIR, exist_ok=True)

# Each split is exported twice: once with only (title, label) and once with
# only (text, label). Files carry no header row and no index column.
for split_name, split_df in (('trn', df_trn), ('val', df_val), ('tst', df_tst)):
    split_df.drop(columns=['text']).to_csv(
        PREPROCESSED_DIR + '/' + split_name + '_title.csv', header=False, index=False)
    split_df.drop(columns=['title']).to_csv(
        PREPROCESSED_DIR + '/' + split_name + '_text.csv', header=False, index=False)

# fake news = 23481

# unique texts = 23481
# of empty texts = 630
Total texts = 22851

# unique titles = 23481
# of empty titles = 0
Total texts = 23481
# true news = 21417

# unique texts = 21417
# of empty texts = 1
Total texts = 21416

# unique titles = 21417
# of empty titles = 0
Total texts = 21417
Training set: 31427
Validation set: 8979
Test set: 4492


In [None]:
# Compiled once instead of rebuilding a tokenizer for every sentence. A token
# is a maximal run of word characters, so punctuation and whitespace are dropped.
_WORD_PATTERN = re.compile(r"\w+")


def tokenize(sentences):
    """Split each sentence into word tokens and mask numeric tokens.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences (here: article titles).

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order. Every token for
        which ``str.isnumeric()`` is true is replaced by ``"<num>"``.
    """
    tokenized_sentences = []
    for sent in sentences:
        tokens = [
            "<num>" if token.isnumeric() else token
            for token in _WORD_PATTERN.findall(sent)
        ]
        tokenized_sentences.append(tokens)
    return tokenized_sentences


def token_filter(tokenized_sentences, thresh=5):
    """Mask out-of-vocabulary tokens with ``'<unk>'``.

    A ``Vocabulary`` is built from every token of every sentence with
    ``unk_cutoff=thresh``. Each token whose vocabulary lookup yields
    ``'<UNK>'`` is replaced by ``'<unk>'``; all other tokens pass through
    unchanged.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        Token lists; traversed twice (once to count, once to filter).
    thresh : int
        Cutoff handed to ``Vocabulary`` (presumably the minimum occurrence
        count for a token to be kept; see the nltk.lm documentation).

    Returns
    -------
    list of list of str
        New token lists with the same shape as the input.
    """
    every_token = [token for sentence in tokenized_sentences for token in sentence]
    vocabulary = Vocabulary(every_token, unk_cutoff=thresh)

    def mask_if_unknown(token):
        # The vocabulary reports unknown tokens with its upper-case label;
        # the output uses the lower-case marker instead.
        return '<unk>' if vocabulary.lookup(token) == '<UNK>' else token

    return [
        [mask_if_unknown(token) for token in sentence]
        for sentence in tokenized_sentences
    ]

In [None]:
!pip install fasttext
import fasttext
import pandas as pd

def Skipgram(filtered_sentences, ws=3, dim=50):
    """Train a fastText skip-gram model and return its output matrix and vocabulary.

    Parameters
    ----------
    filtered_sentences : str
        Forwarded as the input argument of ``fasttext.train_unsupervised``;
        the callers in this notebook pass the path of a text file.
    ws : int
        Forwarded as ``ws`` (context window size in fastText's documentation).
    dim : int
        Forwarded as ``dim`` (vector dimensionality in fastText's documentation).

    Returns
    -------
    tuple
        ``(model.get_output_matrix(), vocab_dict)`` where ``vocab_dict`` maps
        each word of ``model.get_words()`` to its position in that list.
    """
    sg_model = fasttext.train_unsupervised(
        filtered_sentences,
        model='skipgram',
        ws=ws,
        dim=dim,
        neg=5,
    )
    word_to_index = {}
    for position, word in enumerate(sg_model.get_words()):
        word_to_index[word] = position
    output_matrix = sg_model.get_output_matrix()
    return output_matrix, word_to_index


# The title CSVs were written without a header row, so column names are
# supplied here when reading them back.
TITLE_COLUMNS = ['title', 'label']

trn = pd.read_csv('./preprocessed_data/trn_title.csv', sep=',', names=TITLE_COLUMNS)
tst = pd.read_csv('./preprocessed_data/tst_title.csv', sep=',', names=TITLE_COLUMNS)






In [None]:
!pip install nltk==3.6

import math
import pickle
import urllib.request
from tqdm import tqdm
from os.path import isfile

import nltk
nltk.download("punkt")
from nltk import RegexpTokenizer
from nltk.lm import Vocabulary

# Tokenize the titles of each split, then mask rare tokens (cutoff 5).
# NOTE(review): the test split is filtered against a vocabulary built from the
# test titles themselves, not against the training vocabulary. Confirm that
# this is intended.
trn_sentences = tokenize(list(trn['title']))
trn_filtered_sentences = token_filter(trn_sentences, thresh=5)

tst_sentences = tokenize(list(tst['title']))
tst_filtered_sentences = token_filter(tst_sentences, thresh=5)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def write_sentences(path, sentences):
    """Write one sentence per line; every token is followed by a single space."""
    # fastText's documentation asks for UTF-8 input, so the encoding is
    # pinned instead of relying on the platform default.
    with open(path, 'w', encoding='utf-8') as out:
        for tokens in sentences:
            out.write("".join("%s " % w for w in tokens))
            out.write("\n")


def dump_pickle(path, obj):
    """Pickle ``obj`` to ``path``, closing the file even if pickling fails."""
    with open(path, "wb") as out:
        pickle.dump(obj, out)


# open() does not create missing parent directories.
os.makedirs('./preprocessed_data', exist_ok=True)
os.makedirs('./preprocessed_embeddings', exist_ok=True)

write_sentences('./preprocessed_data/sg_train.txt', trn_filtered_sentences)
write_sentences('./preprocessed_data/sg_test.txt', tst_filtered_sentences)

# NOTE(review): only the matrices are persisted. The word -> row dictionaries
# (vocab_trn / vocab_tst) are needed to interpret the rows but are not saved.
trn_embedings, vocab_trn = Skipgram('./preprocessed_data/sg_train.txt')
dump_pickle("./preprocessed_embeddings/sg_trn.pkl", trn_embedings)

# NOTE(review): a second model is trained on the test titles alone, so its
# vectors are not in the same space as the training embeddings. Confirm that
# this is intended.
tst_embedings, vocab_tst = Skipgram('./preprocessed_data/sg_test.txt')
dump_pickle("./preprocessed_embeddings/sg_tst.pkl", tst_embedings)


