In [1]:
with open("sources.txt") as f:
    # Parse source file into {section key: [source record, ...]}.
    #
    # Layout expected by the indexing below:
    #   * a line holding a single token starts a new section and becomes its key;
    #   * any other line is "<name words...> <url> <country> <article> <article> <article>",
    #     i.e. the last five space-separated fields are positional and everything
    #     before them is the (possibly multi-word) source name.
    sources = {}

    current_key = None
    for line in f:
        stripped = line.strip()
        if not stripped:
            # "".split(" ") is [""], which would be mistaken for a section
            # header and silently reset the key to "" - skip blank lines instead.
            continue
        data = stripped.split(" ")
        if len(data) == 1:
            current_key = data[0]
            continue
        # Append in place rather than rebuilding the list for every entry.
        sources.setdefault(current_key, []).append({
            "name": " ".join(data[:-5]),
            "url": data[-5],
            "country": data[-4],
            "articles": data[-3:],
        })

In [3]:
from newspaper import Article
from newsplease import NewsPlease

def article_parse(url):
    """Download and parse the article at ``url`` using ``newspaper.Article``.

    The parsed article is discarded and nothing is returned; the function
    exists so the download + parse sequence can be timed as a single call.
    Any exception raised by the download or parse step propagates to the caller.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()

#Source, Country, URL, NP3K_time, n-p time
with open("article_results.csv", "w") as of:
    with open("errors.txt", "w") as err_f:
        of.write("Language,Source,Country,URL,Newspaper3k time,news-please time\n")
        for lang in ["KOR", "IND"]:
            for source in sources[lang][:2]:
                print(source["name"])
                #Initialise time for source
                np3k_time = 0.0
                newsplease_time = 0.0
                np3k_errors = 0
                newsplease_errors = 0
                
                for url in source["articles"]:
                    #Time newspaper3k and news-please article extraction
                    try:
                        current_time = %timeit -o -r3 article_parse(url)
                        np3k_time = np3k_time + current_time.average
                    except Exception as e:
                        err_f.write("Newspaper3k " + url + " " + str(e) + "\n")
                        np3k_errors = np3k_errors + 1
                    try:
                        current_time = %timeit -o -r3 article = NewsPlease.from_url(url)
                        newsplease_time = newsplease_time + current_time.average
                    except Exception as e:
                        err_f.write("news-please " + url + " " + str(e) + "\n")
                        newsplease_errors = newsplease_errors + 1

                #Average out time per article (put -1 if it failed)
                try:
                    np3k_time = np3k_time / (len(source["articles"]) - np3k_errors)
                except ZeroDivisionError:
                    np3k_time = -1.0
                try:
                    newsplease_time = (newsplease_time / len(source["articles"]) - newsplease_errors)
                except ZeroDivisionError:
                    newsplease_time = -1.0
                #Write line to CSV
                of.write(lang + "," + source["name"] + "," + source["country"] + "," + source["url"] + "," + "{:.2f}".format(np3k_time*1_000) + "," + "{:.2f}".format(newsplease_time*1_000) + "\n")
print("Wrote results into article_results.csv")

Dong-a Ilbo
1.35 s ± 8.97 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
1.41 s ± 9.2 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
1.36 s ± 18.6 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
1.62 s ± 111 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
1.32 s ± 15.8 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
1.39 s ± 4.37 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
Joongang Ilbo
2.24 s ± 14 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
2.41 s ± 18.7 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
2.24 s ± 8.15 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
2.49 s ± 143 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
2.53 s ± 27.3 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
2.48 s ± 126 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
Kompas
368 ms ± 21 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
581 ms ± 60 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
392 ms ± 10 ms per loop (

In [4]:
#Aggregate results into table based on language
#Reads the per-source rows of article_results.csv and writes one row per
#language: number of sources, how many each library handled successfully,
#and the mean of the per-source times over the successful sources.
aggregated = {}
with open("article_results.csv") as f, open("aggregated_results.csv", "w") as of:
    of.write("Language,N,NP3k Correct,np correct,NP3k time,np time\n")
    #Remove first line (the CSV header)
    next(f, None)
    for line in f:
        #Ignore blank lines such as a stray trailing newline
        if not line.strip():
            continue

        #Parse data
        data = line.strip().split(",")
        #Initialise dictionary values the first time a language is seen
        lang = aggregated.setdefault(data[0], {
            "NP3K_correct": 0,
            "np_correct": 0,
            "NP3K_time": 0.0,
            "np_time": 0.0,
            "N": 0,
        })
        lang["N"] += 1
        #Add to correct if correct (nonnegative time); failed sources carry a negative marker
        np3k_value = float(data[4])
        if np3k_value >= 0:
            lang["NP3K_correct"] += 1
            lang["NP3K_time"] += np3k_value
        np_value = float(data[5])
        if np_value >= 0:
            lang["np_correct"] += 1
            lang["np_time"] += np_value
    #Add data per language to new file
    for language, totals in aggregated.items():
        #If a library never succeeded for this language there is nothing to average;
        #write -1.0 as the failure marker instead of raising ZeroDivisionError.
        if totals["NP3K_correct"]:
            average_np3k = totals["NP3K_time"] / totals["NP3K_correct"]
        else:
            average_np3k = -1.0
        if totals["np_correct"]:
            average_np = totals["np_time"] / totals["np_correct"]
        else:
            average_np = -1.0
        of.write(language+","+str(totals["N"])+","+str(totals["NP3K_correct"])+","+str(totals["np_correct"])+","+str(average_np3k)+","+str(average_np)+"\n")