In [1]:
import os
import json
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
# Notebook-wide configuration.

# Directory the assembled dataset CSV is written to.
PATH = "data/"
# Root directory that is scanned recursively for crawled article JSON files.
DATA_PATH = "crawler_data/data/"

# Maps every accepted source domain to its integer class label (0 or 1).
# Articles from domains that are not listed here are skipped.
# NOTE(review): the meaning of the two classes is not stated in this notebook - confirm.
# "cnn.com" (label 0) is currently disabled; add it to the first group to re-enable it.
DOMAINS = {
    **dict.fromkeys(
        ("nytimes.com", "washingtonpost.com", "huffpost.com", "vox.com"),
        0,
    ),
    **dict.fromkeys(
        ("wsj.com", "foxnews.com", "washingtontimes.com", "breitbart.com", "nypost.com"),
        1,
    ),
}

# Upper bound on the number of articles kept per source domain when sampling.
NUM_ARTICLES_PER_DOMAIN = 500

In [3]:
def get_label(source_domain):
    """Return the class label configured for ``source_domain``.

    Args:
        source_domain: Domain name to look up, e.g. ``"nytimes.com"``.

    Returns:
        The value stored in ``DOMAINS`` for exactly that domain, or ``None``
        when the domain is not a key of ``DOMAINS``.
    """
    # All keys of DOMAINS are strings, so anything else can never match.
    # Guarding here also keeps unhashable input (e.g. a list from malformed
    # JSON) returning None instead of raising TypeError in the dict lookup.
    if not isinstance(source_domain, str):
        return None
    # Direct dict lookup instead of scanning every key.
    return DOMAINS.get(source_domain)

In [4]:
# Accumulates one record (dict) per accepted article; filled by walk_through_dir.
articles = []

def walk_through_dir(dir):
    """Collect labelled articles from every ``.json`` file below ``dir``.

    Each JSON file is parsed and, when it has a non-empty ``maintext`` and a
    ``source_domain`` for which ``get_label`` returns a label, a record is
    appended to the module-level ``articles`` list. Files that are skipped
    are reported on stdout.

    Args:
        dir: Root directory to scan. (The name shadows the ``dir`` builtin;
            it is kept so existing callers are unaffected.)

    Returns:
        None. Results are appended to ``articles``.
    """
    # os.walk already descends into every sub-directory, so no manual
    # recursion is needed. The former extra recursive call passed bare
    # directory names, which were resolved against the current working
    # directory rather than ``root``: it either found nothing or read the
    # same files a second time and appended duplicates.
    for root, _subdirs, files in os.walk(dir):
        for file_name in files:
            if not file_name.endswith(".json"):
                continue
            # JSON text is UTF-8 by specification; do not depend on the locale.
            with open(os.path.join(root, file_name), encoding="utf-8") as file:
                data = json.load(file)
            # .get() so that an absent key is reported like an empty value
            # instead of aborting the whole scan with KeyError.
            if not (data.get("maintext") and data.get("source_domain")):
                print("Missing maintext or source_domain")
                continue
            label = get_label(data["source_domain"])
            if label is None:
                print("Could not get label for source_domain:", data["source_domain"])
                continue
            articles.append({"Article_Title": data["title"],
                             "Article_Text": data["maintext"],
                             "Publish_Date": data["date_publish"],
                             "Source": data["source_domain"],
                             "Language": data["language"],
                             "Label": label})

# Scan the crawl output, turn the collected records into a DataFrame and
# report how the English-language articles are distributed per source and
# per label.
walk_through_dir(DATA_PATH)
dataset = pd.DataFrame(articles)
for column in ("Source", "Label"):
    print(Counter(dataset.loc[dataset["Language"] == "en", column]))

Missing maintext or source_domain
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for so

Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get

Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get label for source_domain: cnn.com
Could not get

Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Missing maintext or source_domain
Counter({'washingtontimes.com': 2384, 'foxnews.com': 1051, 'nytimes.com': 596, 'huffpost.com': 588, 'breitbart.com': 588, 'washingtonpost.com': 552, 'vox.com': 542, 'nypost.com': 511})
Counter({1: 4534, 0: 2278})


In [5]:
# Seed for the per-source shuffle so the sample is identical on every run.
SAMPLE_SEED = 42

# Keep at most NUM_ARTICLES_PER_DOMAIN randomly chosen English-language
# articles per source. The per-source frames are collected in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0 and growing a
# frame inside a loop copies it on every iteration.
per_source_samples = []
for source in dataset["Source"].unique():
    # A dedicated name is used here so the module-level `articles` list is
    # not overwritten with a DataFrame.
    source_articles = dataset.loc[(dataset["Source"] == source) & (dataset["Language"] == "en")]
    shuffled = source_articles.sample(frac=1, random_state=SAMPLE_SEED)
    per_source_samples.append(shuffled.iloc[:NUM_ARTICLES_PER_DOMAIN])

filtered_dataset = pd.concat(per_source_samples)

print(Counter(filtered_dataset["Source"]))
print(Counter(filtered_dataset["Label"]))

Counter({'vox.com': 500, 'nypost.com': 500, 'huffpost.com': 500, 'washingtontimes.com': 500, 'washingtonpost.com': 500, 'foxnews.com': 500, 'nytimes.com': 500, 'breitbart.com': 500})
Counter({0: 2000, 1: 2000})


In [6]:
# Save dataset to disk
#
# NOTE(review): previously only the unfiltered `dataset` was written, so the
# per-source capped, English-only `filtered_dataset` was computed but never
# persisted. Both are saved now; dataset.csv keeps its original content so
# existing consumers are unaffected. Confirm which file downstream steps
# should read.

print("Saving dataset to disk")
output_dir = Path(PATH)
output_dir.mkdir(parents=True, exist_ok=True)
dataset.to_csv(output_dir / "dataset.csv", index=False)
filtered_dataset.to_csv(output_dir / "filtered_dataset.csv", index=False)
print("Done")

Saving dataset to disk
Done
