In [27]:
!pip install googledrivedownloader;



In [28]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from google_drive_downloader import GoogleDriveDownloader as gdd

# Common functions

In [2]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

# Procesing data

In [5]:
fake = pd.read_csv('Fake.csv', delimiter = ',')
fake['label']= 0

In [6]:
print("# fake news = " + str(len(fake)) + '\n')

unique_texts = fake['text'].value_counts().sum()
empty_texts = fake[fake['text'] == ' '].append(fake[fake['text'] == '  ']).value_counts().sum()
print("# unique texts = " + str(unique_texts))
print("# of empty texts = " + str(empty_texts))
print("Total texts = " + str(unique_texts - empty_texts) + '\n')

unique_titles = fake['title'].value_counts().sum()
empty_titles = fake[fake['title'] == ' '].append(fake[fake['title'] == '  ']).value_counts().sum()
print("# unique titles = " + str(unique_titles))
print("# of empty titles = " + str(empty_titles))
print("Total texts = " + str(unique_titles - empty_titles))

# fake news = 23481

# unique texts = 23481
# of empty texts = 630
Total texts = 22851

# unique titles = 23481
# of empty titles = 0
Total texts = 23481


In [7]:
fake = fake.drop(columns=['subject','date'])
fake_trn, fake_val, fake_tst = train_validate_test_split(fake, train_percent=0.7, validate_percent=0.2, seed=1)

In [11]:
true = pd.read_csv('True.csv', delimiter = ',')
true['label']= 1

In [12]:
print("# true news = " + str(len(true)) + '\n')

unique_texts = true['text'].value_counts().sum()
empty_texts = true[true['text'] == ' '].append(true[true['text'] == '  ']).value_counts().sum()
print("# unique texts = " + str(unique_texts))
print("# of empty texts = " + str(empty_texts))
print("Total texts = " + str(unique_texts - empty_texts) + '\n')

unique_titles = true['title'].value_counts().sum()
empty_titles = true[true['title'] == ' '].append(true[true['title'] == '  ']).value_counts().sum()
print("# unique titles = " + str(unique_titles))
print("# of empty titles = " + str(empty_titles))
print("Total texts = " + str(unique_titles - empty_titles))

# true news = 21417

# unique texts = 21417
# of empty texts = 1
Total texts = 21416

# unique titles = 21417
# of empty titles = 0
Total texts = 21417


In [13]:
true = true.drop(columns=['subject','date'])
true_trn, true_val, true_tst = train_validate_test_split(true, train_percent=0.7, validate_percent=0.2, seed=1)

In [14]:
df_trn, df_val, df_tst = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

df_trn = true_trn.append(fake_trn).sample(frac=1).reset_index(drop=True)
df_val = true_val.append(fake_val).sample(frac=1).reset_index(drop=True)
df_tst = true_tst.append(fake_tst).sample(frac=1).reset_index(drop=True)

print("Training set: " + str(len(df_trn)))
print("Validation set: " + str(len(df_val)))
print("Test set: " + str(len(df_tst)))

Training set: 31427
Validation set: 8979
Test set: 4492


## Generate datasets with 'title'

In [24]:
### Save titles
df_trn.drop(columns=['text']).to_csv('./preprocessed_data/trn_title.csv', header=False, index=False)
df_val.drop(columns=['text']).to_csv('./preprocessed_data/val_title.csv', header=False, index=False)
df_tst.drop(columns=['text']).to_csv('./preprocessed_data/tst_title.csv', header=False, index=False)

## Generate dataset with 'text'

In [25]:
### Save texts
df_trn.drop(columns=['title']).to_csv('./preprocessed_data/trn_text.csv', header=False, index=False)
df_val.drop(columns=['title']).to_csv('./preprocessed_data/val_text.csv', header=False, index=False)
df_tst.drop(columns=['title']).to_csv('./preprocessed_data/tst_text.csv', header=False, index=False)

# Download preprocessed data

In [30]:
## Download pre-processed data
gdd.download_file_from_google_drive(file_id='1PUiB33hgTsefasb3D2t920Gu2mINLd4C', dest_path='./preprocessed_data/preprocessed_data.zip', unzip=True)
!rm ./preprocessed_data/preprocessed_data.zip

Downloading 1PUiB33hgTsefasb3D2t920Gu2mINLd4C into ./preprocessed_data/preprocessed_data.zip... Done.
Unzipping...Done.
