In [10]:
# Parse the source list into `sources`: {language key: [source dict, ...]}.
# A line holding a single token starts a new language section; every other
# non-blank line describes one news source belonging to the current section.
sources = {}

# Explicit UTF-8 so that non-ASCII source names decode the same way on every
# platform instead of depending on the locale's default encoding.
with open("sources.txt", encoding="utf-8") as f:
    current_key = None
    for line in f:
        # split() without an argument collapses runs of whitespace, so stray
        # double spaces cannot shift the positional fields read below.
        data = line.split()
        if not data:
            # Blank line: ignore it rather than treating "" as a language key.
            continue
        if len(data) == 1:
            current_key = data[0]
            continue
        # Fields are taken from the right-hand end so the name may contain
        # spaces: <name ...> <url> <country> <article> <article> <article>
        sources.setdefault(current_key, []).append({
            "name": " ".join(data[:-5]),
            "url": data[-5],
            "country": data[-4],
            "articles": data[-3:],
        })
        
from newspaper import Article
from newsplease import NewsPlease

def article_parse(url):
    """Fetch and parse a single article with newspaper's ``Article``.

    Args:
        url: Address of the article; handed unchanged to ``Article``.

    Returns:
        The ``Article`` object after ``download()`` and ``parse()`` have both
        been called on it, in that order.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    return fetched

In [2]:
#Source, Country, URL, NP3K_time, n-p time
def run_experiment():
    with open("article_results.csv", "w") as of:
        with open("errors.txt", "w") as err_f:
            of.write("Language,Source,Country,URL,Newspaper3k time,news-please time\n")
            for lang in sources.keys():
                for source in sources[lang]:
                    print(source["name"])
                    #Initialise time for source
                    np3k_time = 0.0
                    newsplease_time = 0.0
                    np3k_errors = 0
                    newsplease_errors = 0

                    for url in source["articles"]:
                        #Time newspaper3k and news-please article extraction
                        try:
                            current_time = %timeit -o -r3 article_parse(url)
                            np3k_time = np3k_time + current_time.average
                        except Exception as e:
                            err_f.write("Newspaper3k " + url + " " + str(e) + "\n")
                            np3k_errors = np3k_errors + 1
                        try:
                            current_time = %timeit -o -r3 article = NewsPlease.from_url(url)
                            newsplease_time = newsplease_time + current_time.average
                        except Exception as e:
                            err_f.write("news-please " + url + " " + str(e) + "\n")
                            newsplease_errors = newsplease_errors + 1

                    #Average out time per article (put -1 if it failed)
                    try:
                        np3k_time = np3k_time / (len(source["articles"]) - np3k_errors)
                    except ZeroDivisionError:
                        np3k_time = -1.0
                    try:
                        newsplease_time = (newsplease_time / len(source["articles"]) - newsplease_errors)
                    except ZeroDivisionError:
                        newsplease_time = -1.0
                    #Write line to CSV
                    of.write(lang + "," + source["name"] + "," + source["country"] + "," + source["url"] + "," + "{:.2f}".format(np3k_time*1_000) + "," + "{:.2f}".format(newsplease_time*1_000) + "\n")
    print("Wrote results into article_results.csv")

In [3]:
#Aggregate results into table based on language
def aggregate_results():
    """Summarise ``article_results.csv`` per language into ``aggregated_results.csv``.

    Each input row is one source. The first column is the language and the
    last two columns are the newspaper3k and news-please times; a negative
    time marks a source for which that extractor failed.

    The output has one row per language (in order of first appearance) with:
    number of sources, number of sources each extractor handled, and the mean
    time of each extractor over the sources it handled. A final ``Total`` row
    gives the same figures over all languages. Where an extractor handled no
    source at all, the mean is written as -1.0 instead of dividing by zero.
    """
    def _average(total_time, count):
        # -1.0 follows the file's "negative means failed" convention.
        return total_time / count if count else -1.0

    aggregated = {}
    with open("article_results.csv") as f:
        with open("aggregated_results.csv","w") as of:
            of.write("Language,N,NP3k Correct,np correct,NP3k time,np time\n")
            #Skip the header row (no-op on an empty file)
            next(f, None)
            for line in f:
                data = line.strip().split(",")
                if data == [""]:
                    #Blank line, e.g. a trailing newline at end of file
                    continue

                lang = aggregated.setdefault(data[0], {
                    "N": 0,
                    "NP3K_correct": 0,
                    "np_correct": 0,
                    "NP3K_time": 0.0,
                    "np_time": 0.0,
                })
                lang["N"] += 1

                # Times are read from the right-hand end so that a comma
                # inside the source name or URL cannot shift them.
                np3k_value = float(data[-2])
                np_value = float(data[-1])
                #Add to correct if correct (nonnegative time)
                if np3k_value >= 0:
                    lang["NP3K_correct"] += 1
                    lang["NP3K_time"] += np3k_value
                if np_value >= 0:
                    lang["np_correct"] += 1
                    lang["np_time"] += np_value

            total_n = 0
            total_np3k = 0
            total_np = 0
            total_np3k_time = 0.0
            total_np_time = 0.0

            for language, stats in aggregated.items():
                average_np3k = _average(stats["NP3K_time"], stats["NP3K_correct"])
                average_np = _average(stats["np_time"], stats["np_correct"])
                #Write language row in file
                of.write(language+","+str(stats["N"])+","+str(stats["NP3K_correct"])+","+str(stats["np_correct"])+","+str(average_np3k)+","+str(average_np)+"\n")
                #Accumulate (not overwrite) the totals across languages
                total_n += stats["N"]
                total_np3k += stats["NP3K_correct"]
                total_np += stats["np_correct"]
                total_np3k_time += stats["NP3K_time"]
                total_np_time += stats["np_time"]

            #Do final row of table for total
            total_average_np3k = _average(total_np3k_time, total_np3k)
            total_average_np = _average(total_np_time, total_np)
            of.write("Total,"+str(total_n)+","+str(total_np3k)+","+str(total_np)+","+str(total_average_np3k)+","+str(total_average_np)+"\n")


In [4]:
from IPython.display import clear_output\

def measure_country(lang):
    with open("article_results_" + lang + ".csv", "w") as of:
        with open("errors_"+lang+".txt", "w") as err_f:
            for source in sources[lang]:
                print(source["name"])
                #Initialise time for source
                np3k_time = 0.0
                newsplease_time = 0.0
                np3k_errors = 0
                newsplease_errors = 0

                for url in source["articles"]:
                    #Time newspaper3k and news-please article extraction
                    try:
                        current_time = %timeit -o -r3 article_parse(url)
                        np3k_time = np3k_time + current_time.average
                    except Exception as e:
                        err_f.write("Newspaper3k " + url + " " + str(e) + "\n")
                        np3k_errors = np3k_errors + 1
                    try:
                        current_time = %timeit -o -r3 article = NewsPlease.from_url(url)
                        newsplease_time = newsplease_time + current_time.average
                    except Exception as e:
                        err_f.write("news-please " + url + " " + str(e) + "\n")
                        newsplease_errors = newsplease_errors + 1

                #Average out time per article (put -1 if it failed)
                try:
                    np3k_time = np3k_time / (len(source["articles"]) - np3k_errors)
                except ZeroDivisionError:
                    np3k_time = -1.0
                try:
                    newsplease_time = (newsplease_time / len(source["articles"]) - newsplease_errors)
                except ZeroDivisionError:
                    newsplease_time = -1.0
                #Write line to CSV
                of.write(lang + "," + source["name"] + "," + source["country"] + "," + source["url"] + "," + "{:.2f}".format(np3k_time*1_000) + "," + "{:.2f}".format(newsplease_time*1_000) + "\n")
                #Clear output
                clear_output(wait=True)
    print("Wrote results into article_results.csv")

In [None]:
# Run the per-language measurement for the "ENG" section of `sources`;
# rows go to article_results_ENG.csv and failures to errors_ENG.txt.
measure_country("ENG")

ITV News
370 ms ± 9.29 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [5]:
def combine_countries():
    """Concatenate the per-language result files into ``article_results.csv``.

    Writes the CSV header once, then copies every line of
    ``article_results_<key>.csv`` for each key of the global ``sources`` dict,
    in the dict's iteration order. The per-language files are opened for
    reading only and are left unchanged.

    Raises:
        FileNotFoundError: If the result file of any language in ``sources``
            does not exist.
    """
    with open("article_results.csv", "w") as of:
        of.write("Language,Source,Country,URL,Newspaper3k time,news-please time\n")
        for key in sources.keys():
            # Default read mode: opening with "w" here would truncate the
            # very results this function is meant to collect.
            with open("article_results_" + key + ".csv") as f:
                for line in f:
                    of.write(line)

In [19]:
# Spot-check news-please on one URL: the third article of the sixth "ENG" source.
probe_url = sources["ENG"][5]["articles"][2]
test = NewsPlease.from_url(probe_url)
test.title

not a 200 response: 403


ArticleException: You must `download()` an article first!

In [12]:
# Spot-check newspaper3k on one URL: the first article of the seventh "ENG" source.
probe_url = sources["ENG"][6]["articles"][0]
test = article_parse(probe_url)
print(probe_url)
test.title

https://www.itv.com/news/2022-10-23/tory-mps-to-make-their-choice-for-leader-after-johnson-withdraws


'Sunak could be declared next PM within hours after Johnson abandons leadership bid'

In [14]:
# Dump the title newspaper3k extracts for every article of every source to
# test_output.txt (one "<source name> <title>" line per article), printing the
# language and source names as progress.
with open("test_output.txt", "w", encoding="utf-8") as of:
    for lang in sources.keys():
        print(lang)
        for source in sources[lang]:
            print(source["name"])
            for article in source["articles"]:
                try:
                    test = article_parse(article)
                    if test:
                        title = test.title
                    else:
                        title = "Didn't parse"
                # Catch Exception rather than everything: a bare `except:`
                # would also swallow KeyboardInterrupt, making this long
                # network-bound loop impossible to interrupt.
                except Exception:
                    title = "Error"
                of.write(source["name"] + " " + title + "\n")
print("Done")

ENG
AP News
Reuters
BBC News
Stat News
CNN
NY Times
ITV News
The Guardian
CBC News
Global News
Telegraph
ABC News
9News
Scoop
Stuff.co
RTE News
Jamaica Observer
Trinidad express
Loop Trinidad
The East African
Monitor
WHO
CDC
The BMJ
Reliefweb
New Scientist
Medical Independent
CMAJ News
FRA
Le Figaro
Le Monde
Liberation
L'Express
La Presse
Le Devoir
Le Nouvelliste
La presse.tn
Mediacongo
Congo Independent
Le Matin du Sahara er du Maghreb
ESP
El Universal
El Mundo
ABC
El Pais
La Nacion
El nacional
Pulso
La Jornada
El Tiempo
El Espectador
ZHO
Sina
Huanqiu
Beijing News
China Daily
Ming Pao
Liberty
China Times
Lianhe Zaobao
Health news network
RUS
TASS
RIA Novosti
Interfax
REGNUM
Komsomolskaya Pravda
New times
Nur.kz
DSNews
BELTA
POR
Publico
Diário de Notícias
Jornal de Notícias
Folha De S.Paulo
Rede Globo Noticias
Terra
O Democrata
Radio Jovem
Expresso das Ilhas
Jornal Noticias
IND
Kompas
Republika
Jawa Pos
Pikiran Rakyat
Bali post
SWA
Habari Leo
MTanzania
IPP Media
Global Publishers
Tu