# Clickbait Resolving Challenge - Prepare Data

For legal reasons, we cannot provide the full texts for download. Please either use this notebook to download the full texts yourself or request the files via email.

In [None]:
!pip install newspaper3k

In [None]:
import json
from newspaper import Article, Config
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool

In [None]:
# Browser-like User-Agent string; assigned to the newspaper config below.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'

# newspaper configuration object intended for the article downloads.
# NOTE(review): a Config only affects an Article it is handed to
# (e.g. Article(url, config=config)) - confirm the download code passes it.
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10  # presumably seconds - confirm against the newspaper docs
config.memoize_articles = False
config.fetch_images = False
config.language = 'en'
config.thread_timeout_seconds=2

In [None]:
def getTxt(com):
    """Download and parse a single article, retrying on failure.

    Args:
        com: An ``(id, url)`` pair: the entry identifier and the page to fetch.

    Returns:
        A tuple ``(id, url, text)``. ``text`` is the parsed article text, or
        ``None`` if all three attempts failed.
    """
    entry_id, url = com
    print(f"{entry_id}: Starting Download - {url}")
    # Pass the module-level config explicitly; without it the configured
    # user agent and timeouts are never applied to this article.
    article = Article(url, config=config)

    # Try to download and parse the text (up to three times)
    for i in range(3):
        try:
            article.download()
            article.parse()
        except Exception as exc:
            # Best effort: report why the attempt failed and retry. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate.
            print(f"{entry_id}: Error while downloading (Attempt {i}): {exc}")
        else:
            print(f"{entry_id}: Downloaded & parsed")
            return entry_id, url, article.text
    return entry_id, url, None

In [None]:
# Dataset splits to process: each name is read from "<name>.json" and the
# enriched entries are written to "final_<name>.json".
SETS = ['train', 'dev']

In [None]:
# Number of worker threads in the pool that downloads the articles.
PARALLEL_THREADS = 5

In [None]:
# For every dataset split: load the entries, download the article texts in
# parallel, and write the entries that received a text to final_<split>.json.
for set_name in SETS:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(f"{set_name}.json", "r", encoding="utf-8") as url_file:
        entries_without_text = json.load(url_file)

    print(f"Loaded {len(entries_without_text)} raw {set_name} entries")

    urls = [(e["id"], e["url"]) for e in entries_without_text]
    entries_dict = {e["id"]: e for e in entries_without_text}

    # The context manager releases the worker threads even if map() raises.
    with ThreadPool(PARALLEL_THREADS) as pool:
        results = pool.map(getTxt, urls)

    # Keep only the entries whose download succeeded (text is not None).
    final_entries = []
    for entry_id, _url, text in results:
        if text is not None:
            entry = entries_dict[entry_id]
            entry['text'] = text
            final_entries.append(entry)

    print(f"Added fulltext to {len(final_entries)} {set_name} entries")

    # ensure_ascii=False writes raw non-ASCII characters, so the file must be
    # opened as UTF-8; the platform default encoding may not be able to
    # encode them.
    with open(f"final_{set_name}.json", "w", encoding="utf-8") as outfile:
        json.dump(final_entries, outfile, ensure_ascii=False, indent=4)

## Create silver data

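The gold data with full texts (`final_train.json`, written by the download step above) must be present before running the cells below.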

In [None]:
import json

In [None]:
# Silver data file to enrich with full texts; the result is written to
# "final_<SILVER_FILE>".
SILVER_FILE = "train.2.wordnet.json"

In [None]:
# Load the gold training entries (including full texts) and index them by id.
# JSON is UTF-8 by specification, so read it explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open("final_train.json", "r", encoding="utf-8") as goldfile:
    gold_entries = json.load(goldfile)
gold_entries_dict = {e["id"]: e for e in gold_entries}

In [None]:
# Load the silver entries. JSON is UTF-8 by specification, so read it
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open(SILVER_FILE, "r", encoding="utf-8") as silverfile:
    silver_entries = json.load(silverfile)
print(f"Loaded {len(silver_entries)} silver entries")

In [None]:
# Copy the full text of the matching gold entry (same id) onto each silver
# entry; silver entries without a gold counterpart are left out.
final_silver_entries = []

for silver_entry in silver_entries:
    gold_entry = gold_entries_dict.get(silver_entry["id"])
    if gold_entry is None:
        continue
    silver_entry["text"] = gold_entry["text"]
    final_silver_entries.append(silver_entry)

print(f"Added fulltexts to {len(final_silver_entries)} silver entries")

In [None]:
# Write the enriched silver entries. ensure_ascii=False emits raw non-ASCII
# characters, so the file is opened as UTF-8; the platform default encoding
# may not be able to encode them.
with open(f"final_{SILVER_FILE}", "w", encoding="utf-8") as outfile:
    json.dump(final_silver_entries, outfile, ensure_ascii=False, indent=4)