In [1]:
%load_ext autoreload
%autoreload 2

import uuid
import newspaper
from datetime import datetime
import pandas as pd
import traceback

In [2]:
def scrape_urls(list_articles):
    """
    Utiliza los webpages para luego scraper y asignar una categoria a la noticia
    Args:
        webpage_urls (list): Urls a las cuales se le quieren hacer los scraping
        category (str): nombre de la categoria
    Returns:
        df: DataFrame con las noticas no duplicadadas
    """    
    scraped_info = {}

    for i,article in enumerate(list_articles):
        try:
            print(i)
            print('-'*50)
            print(article.url)
            article.download()
            article.parse()
            
            news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")

            print(news_date)
            scraped_info[i] = {'news_id': uuid.uuid4(),
                            'news_url_absolute': article.url,
                            'news_init_date' : news_date,
                            'news_final_date' : news_date,
                            'news_title':article.title,
                            'news_text_content' : article.text,
                                }
        except:
            print(traceback.print_exc())
    df = pd.DataFrame.from_records(scraped_info).T
    scraped_deduped = df.drop_duplicates(subset=['news_url_absolute'])
    print(f'Amount of news scraped: {len(scraped_deduped)}')
    return scraped_deduped

In [3]:
unionromani = newspaper.build('https://unionromani.org', memoize_articles=False)
df = scrape_urls(unionromani.articles)

0
--------------------------------------------------
https://unionromani.org/que-es-la-union-del-pueblo-romani/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
1
--------------------------------------------------
https://unionromani.org/2023/03/22/instituto-romano-estrena-la-v-edicion-de-sus-cursos-sobre-poblacion-gitana-en-exclusion-en-zaragoza-salamanca-y-valladolid/
2023-03-22
2
--------------------------------------------------
https://unionromani.org/2023/03/20/en-busca-del-mejor-relato-sobre-la-mujer-gitana/
2023-03-20
3
--------------------------------------------------
https://unionromani.org/2023/03/20/lucia-y-daniel-protagonistas-de-una-boda-gitana-con-casi-1-000-invitados/
2023-03-20
4
--------------------------------------------------
https://unionromani.org/2023/03/20/los-xv-premios-cofrade-ciudad-de-marbella-2023-reconocen-al-pianista-y-compositor-gitano-dorantes-como-musico-del-ano/
2023-03-20
5
--------------------------------------------------
https://unionromani.org/2023/03/20/el-muro-del-racismo-contra-la-discriminacion-racial-y-etnica/
2023-03-20
6
--------------------------------------------------
https://unionromani

Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
46
--------------------------------------------------
https://unionromani.org/2023/02/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
47
--------------------------------------------------
https://unionromani.org/2023/01/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
48
--------------------------------------------------
https://unionromani.org/2022/12/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
49
--------------------------------------------------
https://unionromani.org/2022/11/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
50
--------------------------------------------------
https://unionromani.org/2022/10/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
51
--------------------------------------------------
https://unionromani.org/2022/09/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
52
--------------------------------------------------
https://unionromani.org/2022/08/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
53
--------------------------------------------------
https://unionromani.org/2022/07/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
54
--------------------------------------------------
https://unionromani.org/2022/06/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
55
--------------------------------------------------
https://unionromani.org/2022/05/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
56
--------------------------------------------------
https://unionromani.org/2022/04/
None
Amount of news scraped: 44


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


In [4]:
df.to_csv('unionromani.csv', index=False)