In [1]:
%load_ext autoreload
%autoreload 2

import uuid
import newspaper
from datetime import datetime
import pandas as pd
import traceback

In [13]:
def scrape_urls(list_articles):
    """
    Download and parse each article and collect its metadata in a DataFrame.

    Articles that fail to download or parse are reported with a traceback and
    skipped, so a single bad URL does not abort the whole run.

    Args:
        list_articles (iterable): Article-like objects exposing ``url``,
            ``download()``, ``parse()``, ``publish_date``, ``title`` and
            ``text`` (for example the ``articles`` of a ``newspaper.build``
            source).
    Returns:
        pandas.DataFrame: One row per successfully scraped article,
        de-duplicated on ``news_url_absolute`` (first occurrence wins). The
        index is the position of the article in ``list_articles``. When
        nothing could be scraped the frame is empty but keeps its columns.
    """
    columns = [
        'news_id',
        'news_url_absolute',
        'news_init_date',
        'news_final_date',
        'news_title',
        'news_text_content',
    ]
    date_format = "%Y-%m-%d"
    scraped_info = {}

    for i, article in enumerate(list_articles):
        try:
            print(i)
            print('-' * 50)
            print(article.url)
            article.download()
            article.parse()
            # Pages without a detected publication date fall back to the
            # date on which they were scraped.
            if article.publish_date is None:
                news_date = datetime.now().strftime(date_format)
            else:
                news_date = article.publish_date.strftime(date_format)

            print(news_date)
            scraped_info[i] = {
                'news_id': uuid.uuid4(),
                'news_url_absolute': article.url,
                'news_init_date': news_date,
                'news_final_date': news_date,
                'news_title': article.title,
                'news_text_content': article.text,
            }
        except Exception:
            # Best effort: report the failure and move on to the next article.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during long scrapes.
            traceback.print_exc()

    # Passing the column list explicitly keeps the schema stable even when no
    # article was scraped; otherwise drop_duplicates would raise KeyError on
    # the missing 'news_url_absolute' column.
    df = pd.DataFrame.from_dict(scraped_info, orient='index', columns=columns)
    scraped_deduped = df.drop_duplicates(subset=['news_url_absolute'])
    print(f'Amount of news scraped: {len(scraped_deduped)}')
    return scraped_deduped

In [2]:
# Build a newspaper source from the gitanos.org "comunicados" page and list the
# article URLs that were discovered on it.
# memoize_articles=False: per the newspaper docs this disables the cache of
# already-seen articles, so every run returns the full list.
# NOTE(review): the saved output lists only five URLs, two of them browser
# cookie-help pages, so discovery on this page looks unreliable — confirm
# before relying on it.
test = newspaper.build("https://www.gitanos.org/actualidad/prensa/comunicados/", memoize_articles=False)

for article in test.articles:
    print(article.url)

https://www.gitanos.org/campannas/tus_prejuicios_son_las_voces_de_otros.html
http://windows.microsoft.com/es-ES/windows7/How-to-manage-cookies-in-Internet-Explorer-9
http://support.mozilla.org/es/kb/cookies-informacion-que-los-sitios-web-guardan-en-?redirectlocale=en-US&redirectslug=Cookies
http://www.gitanos.org/campannas/tus_prejuicios_son_las_voces_de_otros.html
https://www.gitanos.org/_estatal/_madrid__sede_central_


In [3]:
# Discover the articles on unionromani.org and scrape them into a DataFrame
# via scrape_urls.
# NOTE(review): the saved output of this cell shows TypeErrors from calling
# strftime on a None publish_date, yet the scrape_urls definition in this
# notebook (execution count 13, i.e. run after this cell) handles None.
# The output appears to come from an earlier version of the function —
# restart the kernel and run all cells to refresh it.
unionromani = newspaper.build('https://unionromani.org', memoize_articles=False)
df = scrape_urls(unionromani.articles)

0
--------------------------------------------------
https://unionromani.org/que-es-la-union-del-pueblo-romani/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
1
--------------------------------------------------
https://unionromani.org/2023/03/22/instituto-romano-estrena-la-v-edicion-de-sus-cursos-sobre-poblacion-gitana-en-exclusion-en-zaragoza-salamanca-y-valladolid/
2023-03-22
2
--------------------------------------------------
https://unionromani.org/2023/03/20/en-busca-del-mejor-relato-sobre-la-mujer-gitana/
2023-03-20
3
--------------------------------------------------
https://unionromani.org/2023/03/20/lucia-y-daniel-protagonistas-de-una-boda-gitana-con-casi-1-000-invitados/
2023-03-20
4
--------------------------------------------------
https://unionromani.org/2023/03/20/los-xv-premios-cofrade-ciudad-de-marbella-2023-reconocen-al-pianista-y-compositor-gitano-dorantes-como-musico-del-ano/
2023-03-20
5
--------------------------------------------------
https://unionromani.org/2023/03/20/el-muro-del-racismo-contra-la-discriminacion-racial-y-etnica/
2023-03-20
6
--------------------------------------------------
https://unionromani

Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
46
--------------------------------------------------
https://unionromani.org/2023/02/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
47
--------------------------------------------------
https://unionromani.org/2023/01/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
48
--------------------------------------------------
https://unionromani.org/2022/12/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
49
--------------------------------------------------
https://unionromani.org/2022/11/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
50
--------------------------------------------------
https://unionromani.org/2022/10/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
51
--------------------------------------------------
https://unionromani.org/2022/09/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
52
--------------------------------------------------
https://unionromani.org/2022/08/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
53
--------------------------------------------------
https://unionromani.org/2022/07/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
54
--------------------------------------------------
https://unionromani.org/2022/06/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
55
--------------------------------------------------
https://unionromani.org/2022/05/


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


None
56
--------------------------------------------------
https://unionromani.org/2022/04/
None
Amount of news scraped: 44


Traceback (most recent call last):
  File "C:\Users\froro\AppData\Local\Temp\ipykernel_20048\2458422773.py", line 20, in scrape_urls
    news_date = datetime.strftime(article.publish_date, "%Y-%m-%d")
TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'NoneType' object


In [4]:
# Write the scraped articles to unionromani.csv without the index column.
# NOTE(review): `df` is reassigned by a later scraping cell, so this only
# writes the unionromani data when cells are run in order.
df.to_csv('unionromani.csv', index=False)

In [11]:
# Candidate gitanos.org report pages.
# NOTE(review): `scrapie` is not referenced anywhere in this cell; only the
# "difusion" URL below is actually scraped — confirm whether these pages were
# meant to be looped over.
scrapie = [ 
           "https://www.gitanos.org/informeanual/educacion/", 
           "https://www.gitanos.org/informeanual/quienes-somos/",
           "https://www.gitanos.org/informeanual/empleo/",
           "https://www.gitanos.org/informeanual/pobreza-y-exclusion/",
           "https://www.gitanos.org/informeanual/incidencia/",
           "https://www.gitanos.org/informeanual/internacional/",
           "https://www.gitanos.org/informeanual/sensibilizacion/",
           "https://www.gitanos.org/informeanual/igualdad/", 
           "https://informesdiscriminacion.gitanos.org/"]
# Discover and scrape the articles under the "difusion" section, then save
# them to gitanos_2.csv without the index column.
# NOTE(review): this overwrites the `df` produced by the unionromani cell.
# NOTE(review): every date in the saved output is 2023-03-23, which suggests
# scrape_urls fell back to the scrape date because no publish date was
# detected — confirm before using these dates.
gitanos_2 = newspaper.build('https://informesdiscriminacion.gitanos.org/difusion', memoize_articles=False)
df = scrape_urls(gitanos_2.articles)
df.to_csv('gitanos_2.csv', index=False)

0
--------------------------------------------------
https://informesdiscriminacion.gitanos.org/difusion/el-sesgo-discriminatorio-en-el-uso-de-la-inteligencia-artificial-tema-central-del-informe
2023-03-23
1
--------------------------------------------------
https://informesdiscriminacion.gitanos.org/difusion/secretariado-gitano-denuncia-el-uso-de-la-pandemia-como-pretexto-para-el-aumento-de-los
2023-03-23
2
--------------------------------------------------
https://informesdiscriminacion.gitanos.org/difusion/el-impacto-de-la-crisis-de-la-covid-19-en-el-racismo-y-el-antigitanismo-protagoniza-el-
2023-03-23
3
--------------------------------------------------
https://informesdiscriminacion.gitanos.org/difusion/la-fundaci%C3%B3n-secretariado-gitano-presenta-su-informe-%E2%80%9Cdiscriminaci%C3%B3n-y-comunidad-gitana
2023-03-23
4
--------------------------------------------------
https://informesdiscriminacion.gitanos.org/difusion/fundaci%C3%B3n-secretariado-gitano-atendi%C3%B3-en-2019-m%C

In [2]:
# Create a single Article for one "casos" page so its extraction can be
# inspected by hand in the following cells.
a = newspaper.Article("https://informesdiscriminacion.gitanos.org/casos/5-ucrania-grupos-de-extrema-derecha-atacan-comunidades-gitanas-tras-un-conflicto-entre-dos")

In [3]:
# Fetch the page content for the article (network call; must run before parse()).
a.download()

In [7]:
# Parse the downloaded page so that fields such as `a.text` are populated.
a.parse()

In [8]:
 # Display the body text extracted from the article.
 a.text

'Alrededor de 50 miembros de grupos de extrema derecha ucranianos (incluidos C-14, Right Sector, Svoboda y el Cuerpo Nacional) intentaron atacar casas pertenecientes a personas romaníes el domingo 17 de octubre de 2021. Los manifestantes lanzaron bengalas y bombas de humo y corearon “muerte a los traficantes de drogas” mientras los policías formaban una línea de protección para evitar que entraran al barrio de mayoría romaní en Irpin, una ciudad a unos 20 kilómetros de Kiev.\n\nLa acción se organizó en respuesta a un conflicto entre dos adolescentes romaníes, de 16 y 17 años, que presuntamente atacaron a un soldado ucraniano de 22 años la noche del 15 de octubre 2021. Según Irina Pyanishnikova, portavoz de la Policía Nacional en la región de Kiev: “hubo un caso en el que dos jóvenes gitanos causaron daños corporales leves a uno de los habitantes de Irpin. Hemos abierto una investigación penal”.\n\nLa protesta se prolongó hasta la noche. Los intentos de asaltar hogares romaníes fueron e