In [1]:
import pandas as pd
import requests

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Disabled: would likewise remove the per-cell width limit so long text is not truncated.
# pd.set_option('display.max_colwidth', None)

In [2]:
# Load a pickled object (presumably a DataFrame of the first 1000 news rows — confirm) from data/.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
# NOTE(review): `df` is not referenced by any other cell visible here — confirm it is still needed.
df = pd.read_pickle("data/20240813.gkg.first-1000-news.pkl")

In [41]:
# Load the pickled table that supplies the `first_url` column used to build the URL list.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
gkg = pd.read_pickle("data/20240813.gkg.pkl")

In [42]:
# Take the `first_url` value of the first 10,000 rows of `gkg` as a plain Python list.
list_urls = gkg["first_url"].head(10000).to_list()

In [43]:
# Sanity check: how many URLs were selected for scraping.
len(list_urls)

10000

In [7]:
from newspaper import Article
from newspaper import Config

# Desktop-browser User-Agent string assigned to the newspaper Config below.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# Module-level Config shared by every Article that get_article constructs.
config = Config()
config.browser_user_agent = user_agent
def get_article(url):
    """Fetch one article and return its title, text and summary.

    Builds an ``Article`` for ``url`` with the module-level ``config`` and runs
    its ``download``, ``parse`` and ``nlp`` steps in that order.

    Returns a dict with the keys ``title``, ``text`` and ``summary``.  If any
    step raises an ``Exception``, the dict instead carries ``"ERROR"`` for
    ``title`` and ``summary`` and the exception message in ``text``.

    Note: a later cell defines ``get_article`` again (with timing fields);
    whichever definition ran last is the one in effect.
    """
    try:
        page = Article(url, config=config)
        # Stages must run in this order; each is looked up just before it is called.
        for stage in ("download", "parse", "nlp"):
            getattr(page, stage)()
        return dict(title=page.title, text=page.text, summary=page.summary)
    except Exception as exc:
        # Best-effort scraping: report the failure as a row instead of raising.
        return dict(title="ERROR", text=str(exc), summary="ERROR")

In [19]:
import time
def get_article(url):
    """Download, parse and summarise one article, timing each stage.

    Builds an ``Article`` for ``url`` with the module-level ``config`` and runs
    ``download()``, ``parse()`` and ``nlp()`` in that order.

    Parameters
    ----------
    url
        Address handed to ``Article`` (presumably a URL string).

    Returns
    -------
    dict
        Keys, in order: ``title``, ``text``, ``summary``, ``download``,
        ``parse``, ``nlp``, ``total``.  The last four are elapsed seconds;
        a stage that did not complete is reported as ``None``.  When any
        stage raises an ``Exception``, ``title`` and ``summary`` are
        ``"ERROR"`` and ``text`` holds the exception message.

    Raises
    ------
    BaseException
        Non-``Exception`` signals such as ``KeyboardInterrupt`` propagate
        unchanged.
    """
    # perf_counter is monotonic, so durations cannot go negative if the
    # wall clock is adjusted mid-run.
    start_time = time.perf_counter()
    download_time = parse_time = nlp_time = None

    try:
        article = Article(url, config=config)

        start_download = time.perf_counter()
        article.download()
        download_time = time.perf_counter() - start_download

        start_parse = time.perf_counter()
        article.parse()
        parse_time = time.perf_counter() - start_parse

        start_nlp = time.perf_counter()
        article.nlp()
        nlp_time = time.perf_counter() - start_nlp

        data = {
            "title": article.title,
            "text": article.text,
            "summary": article.summary,
        }
    except Exception as e:
        # Best-effort scraping: a failed URL becomes an ERROR row, not a crash.
        data = {
            "title": "ERROR",
            "text": str(e),
            "summary": "ERROR",
        }

    # Timings are attached here rather than in a `finally: return ...` block.
    # If a BaseException escaped the try, `data` would be unbound there and the
    # resulting UnboundLocalError would mask the real signal.
    data["download"] = download_time
    data["parse"] = parse_time
    data["nlp"] = nlp_time
    data["total"] = time.perf_counter() - start_time
    return data

In [8]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def scrape_articles_parallel(urls, max_workers=10, fetch=None):
    """Scrape many URLs concurrently on a thread pool.

    Parameters
    ----------
    urls : iterable
        URLs to process; each one is passed to ``fetch``.
    max_workers : int, default 10
        Number of worker threads in the pool.
    fetch : callable, optional
        One-argument function applied to each URL.  Defaults to the
        module-level ``get_article``, looked up when this function is called.

    Returns
    -------
    list
        One result per URL, in the same order as ``urls``, so the list can be
        lined up positionally with the input (for example to attach results to
        the rows the URLs came from).

    Raises
    ------
    Exception
        Anything raised by ``fetch`` is re-raised when its result is collected.
    """
    if fetch is None:
        fetch = get_article
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front so the work still runs in parallel, then
        # collect in submission order instead of completion order.
        futures = [executor.submit(fetch, url) for url in urls]
        return [future.result() for future in futures]

In [20]:
# Scrape every URL in `list_urls` with 10 worker threads and bind the results to `articles`.
articles = scrape_articles_parallel(list_urls,10)

In [29]:
# Same scrape with 20 worker threads; rebinds `articles`, discarding any earlier run's results.
articles = scrape_articles_parallel(list_urls,20)



In [44]:

# Same scrape with 40 worker threads; rebinds `articles`, discarding any earlier run's results.
articles = scrape_articles_parallel(list_urls,40)



In [31]:
# Sanity check: number of scraped records currently held in `articles`.
len(articles)

1000

In [18]:
# Inspect the first scraped record (a stray trailing character made this cell a syntax error).
articles[0]

{'title': 'Underground reservoir on Mars could fill oceans on the planet’s surface, study finds',
 'text': '(CNN) — Data from a retired NASA mission has revealed evidence of an underground reservoir of water deep beneath the surface of Mars, according to new research.\n\nA team of scientists estimates that there may be enough water, trapped in tiny cracks and pores of rock in the middle of the Martian crust, to fill oceans on the planet’s surface. The groundwater would likely cover the entirety of Mars to a depth of 1 mile (1.6 kilometers), the study found.\n\nThe data came from NASA’s InSight lander, which used a seismometer to study the interior of Mars from 2018 to 2022.\n\nFuture astronauts exploring Mars would encounter a whole host of challenges if they tried to access the water, because it’s located between 7 and 12 miles (11.5 and 20 kilometers) beneath the surface, according to the study published Monday in the journal Proceedings of the National Academy of Sciences.\n\nBut th

In [15]:
import multiprocessing

def get_core_count():
    """Return the CPU count as reported by ``multiprocessing.cpu_count()``."""
    core_count = multiprocessing.cpu_count()
    return core_count

print(f"Number of CPU cores: {get_core_count()}")

Number of CPU cores: 10


In [33]:
# Write the scraped records to CSV, one row per article, without the index column.
# NOTE(review): the filename says "first-1000" but `list_urls` is built with head(10000) —
# confirm `articles` holds the intended run before overwriting this file.
pd.DataFrame(articles).to_csv("data/20240813.gkg.first-1000-news-articles.csv", index=False)

In [30]:
# Display the last rows of the scraped records as a DataFrame.
pd.DataFrame(articles).tail()

Unnamed: 0,title,text,summary,download,parse,nlp,total
995,Soldier charged for Kiss driver's road death o...,"Laurel V Williams\n\nJALANI Mapp, the man char...","Laurel V Williams JALANI Mapp, the man charged...",8.456538,0.210626,0.002457,8.66967
996,Bangladesh Sees 12-Year High Inflation Amid Pr...,Bangladesh's inflation in July reached a 12-ye...,Bangladesh's inflation in July reached a 12-ye...,4.836724,1.49997,0.001541,6.338301
997,ERROR,Article `download()` failed with HTTPConnectio...,ERROR,10.675692,,,10.675741
998,Your superb shots of the Northern Lights visib...,The Aurora Borealis was visible in parts of th...,The Aurora Borealis was visible in parts of th...,9.594916,0.327279,0.001642,9.923888
999,LQR House Inc. Announces 382% Year-Over-Year R...,"MIAMI BEACH, FL / ACCESSWIRE / August 13, 2024...","In July 2023, LQR House reported revenues of $...",3.938475,25.957049,0.003842,29.899412


In [46]:
# Tabulate the scraped records so they can be sorted by timing columns.
ar = pd.DataFrame(articles)

# Ten slowest articles by `total` seconds, slowest first.
ar.sort_values("total", ascending=False).head(10)

Unnamed: 0,title,text,summary,download,parse,nlp,total
9998,"NITDA, Japan partner to empower startups with ...",The National Information Technology Developmen...,The National Information Technology Developmen...,4.829097,206.816207,0.002367,211.647736
7976,SVB&T Corp.: SVB&T Corporation Reports 2024 Se...,"SVB&T Corporation (OTCQX:SVBT), parent company...","Interest income increased $890,000 compared to...",5.704282,111.365592,0.01344,117.083366
8071,Franco-Nevada Reports Q2 2024 Results; New Min...,New Mine Start-ups and Acquisitions\n\n(in U.S...,Q2 2024 Portfolio Updates Precious Metal asset...,6.259472,96.307791,0.036811,102.60413
9428,Ponsse Oyj : Half year Report for 1 January 30...,Ponsse's Half-year Report for 1 January - 30 J...,DISTRIBUTION NETWORK In the new operating mode...,7.137853,88.686141,0.036498,95.860548
5760,1843 Seated Dollar,"Author Replies: 12 / Views: 1,690\n\nValued Me...",Pillar of the Community United States 4049 Pos...,5.871923,75.527072,0.003525,81.402565
6408,More than 20 injured in German train crash,BERLIN - An east German regional train struck ...,BERLIN - An east German regional train struck ...,8.011999,68.95506,0.001326,76.968432
7918,Syndicated Article – Your Fort Dodge,,,4.41264,72.175424,0.000181,76.588318
2269,Defunct N.S. regional development agency to un...,HALIFAX - A forensic audit will determine what...,"Percy Paris, the economic and rural developmen...",8.450237,67.218402,0.002903,75.67161
6542,CTV Montreal: Stephane Giroux at the courthouse,You are now being redirected to the BCE.ca web...,You are now being redirected to the BCE.ca web...,9.847173,65.702806,0.005332,75.555381
3720,Young people see online slurs as jokes: AP-MTV...,WASHINGTON - Is it ever OK to tweet that a gir...,"Jaded by the Internet free-for-all, teens and ...",8.151193,66.477123,0.006284,74.634714
