In [1]:
import os
import unicodedata
import pandas as pd

# import WikiNewsNetwork as wnn

In [2]:
import calendar
import datetime
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

# %% Basic query functions


def chunks(li, n):
    """
    Lazily cut a list into consecutive slices of at most n items.
    Parameters
    ----------
    li : list
        Sequence to be split.
    n : int
        Distance between slice starts, i.e. the size of every slice
        except possibly the last one.
    Yields
    ------
    list
        Slices of li in their original order; the final slice is shorter
        when len(li) is not a multiple of n.
    """
    total = len(li)
    # Walk the start offsets 0, n, 2n, ... and slice one window per offset.
    for start in range(0, total, n):
        stop = start + n
        yield li[start:stop]


def query(request, timeout=30):
    """
    Query Wikipedia API with specified parameters.
    Parameters
    ----------
    request : dict
        API call parameters. The dict itself is never modified; 'action'
        and 'format' are set on an internal copy (overriding any values
        supplied for those two keys).
    timeout : float or tuple, optional
        Forwarded to requests.get so that a stalled connection raises
        instead of blocking forever. The default is 30 seconds.
    Raises
    ------
    ValueError
        Raises error if returned by API.
    Yields
    ------
    dict
        Subsequent dicts of json API response ('query' section of each
        page of results).
    """
    # Build the fixed part of the request on a copy so the caller's dict
    # is left exactly as it was passed in.
    base_request = {**request, 'action': 'query', 'format': 'json'}
    last_continue = {}
    while True:
        # Merge in the values from the 'continue' section of the last
        # result (empty on the first call).
        req = {**base_request, **last_continue}
        # Call API
        result = requests.get(
            'https://en.wikipedia.org/w/api.php', params=req,
            timeout=timeout).json()
        if 'error' in result:
            print('ERROR')
            raise ValueError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            yield result['query']
        # No 'continue' section means this was the last page of results.
        if 'continue' not in result:
            break
        last_continue = result['continue']

In [3]:
def parse(request, timeout=30):
    """
    Query Wikipedia API with specified parameters to parse data.
    Parameters
    ----------
    request : dict
        API call parameters. The dict itself is never modified; 'action'
        and 'format' are set on an internal copy (overriding any values
        supplied for those two keys).
    timeout : float or tuple, optional
        Forwarded to requests.get so that a stalled connection raises
        instead of blocking forever. The default is 30 seconds.
    Raises
    ------
    ValueError
        Raises error if returned by API.
    Yields
    ------
    dict
        Subsequent dicts of json API response ('parse' section of each
        response).
    """
    # Build the fixed part of the request on a copy so the caller's dict
    # is left exactly as it was passed in.
    base_request = {**request, 'action': 'parse', 'format': 'json'}
    last_continue = {}
    while True:
        # Merge in the values from the 'continue' section of the last
        # result (empty on the first call).
        req = {**base_request, **last_continue}
        # Call API
        result = requests.get(
            'https://en.wikipedia.org/w/api.php', params=req,
            timeout=timeout).json()
        if 'error' in result:
            print('ERROR')
            raise ValueError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'parse' in result:
            yield result['parse']
        # No 'continue' section means there is nothing more to fetch.
        if 'continue' not in result:
            break
        last_continue = result['continue']


def _story_record(story, day, cat_name):
    """Build one event row (dict) from a story list item, or None if it has no text."""
    text = story.text.strip()
    # NOTE(review): the original loop reset its "previous text" to '' for
    # every item, so its "skip repeats" containment check only ever skipped
    # items with empty text. That observable behavior is kept here; nested
    # list items are still emitted both inside their parent's text and as
    # their own row. Confirm whether real de-duplication is wanted.
    if not text:
        return None

    # First anchor carrying an href, if any, is treated as the title link.
    first_link = story.find('a', href=True)
    title_link = first_link['href'] if first_link is not None else ''

    articles = []
    ext = []
    for link in story.find_all('a'):
        title = link.get('title')
        if title is not None:
            # Anchors with a title attribute are collected as wiki articles.
            articles.append(title)
            continue
        href = link.get('href')
        if href is not None:
            # Anchors without a title are collected as external links.
            ext.append(href)
        # Anchors with neither attribute are ignored (they used to raise
        # an uncaught KeyError).

    return {'Date': day,
            'Category': cat_name,
            'Text': text,
            'Title link': title_link,
            'Articles': articles,
            'Ext links': ext,
            'HTML': str(story)}


def wiki_news_articles(months):
    """
    Generate dataframe of events from the Wikipedia Current Events Portal.
    Parameters
    ----------
    months : iterable
        List of months to get events for, each formatted as the page-name
        suffix used in 'Portal:Current_events/<month>'.
    Returns
    -------
    DataFrame
        Events DF with Date, Category, Text, Title link, Articles,
        Ext links, HTML columns (one row per story list item). The columns
        are present even when no events were found.
    """
    columns = ['Date', 'Category', 'Text', 'Title link', 'Articles',
               'Ext links', 'HTML']
    # Rows are gathered in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and re-copied the
    # whole frame on every call.
    records = []

    for m in months:
        print(m)
        params = {'page': 'Portal:Current_events/%s' % m, 'prop': 'text'}
        # Get data from current events page for that month
        scr = list(parse(params))

        doc = bs(scr[0]['text']['*'], 'html.parser')  # BeautifulSoup the html

        # Tag for each day.
        # NOTE: the page markup changes between months; this was previously
        # "table", {"class": "vevent"}.
        days = doc.find_all("div", {"class": "vevent"})
        for d in days:
            day = d['id']  # date
            # Each paragraph inside the day block is used as a category heading.
            for c in d.find_all("p"):
                cat_name = c.text
                # The stories are looked up in the element two siblings
                # after the heading; a heading without such an element is
                # reported and skipped.
                try:
                    stories = c.next_sibling.next_sibling.find_all("li")
                except AttributeError:
                    print("Exception", day, c)
                    continue
                for s in stories:
                    record = _story_record(s, day, cat_name)
                    if record is not None:
                        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [4]:
# Page-name suffixes ("January_2022", ...) for the months to scrape.
# freq='MS' (month start) replaces the original month-end alias 'm':
# month-end stamps for an end date of 2022-12-30 stop at 2022-11-30, which
# left December out of the run. NOTE(review): if December was excluded on
# purpose, change the end date to '20221130'.
months_2022 = [x.strftime('%B_%Y') for x in
               pd.date_range('20220101', '20221230', freq='MS')]
sdf = wiki_news_articles(months_2022)

Output hidden; open in https://colab.research.google.com to view.

In [5]:
# Preview the first rows of the events table returned by wiki_news_articles.
sdf.head()

Unnamed: 0,Date,Category,Text,Title link,Articles,Ext links,HTML
0,2022_January_1,Armed conflicts and attacks\n,Haitian Prime Minister Ariel Henry survives an...,/wiki/Prime_Minister_of_Haiti,"[Prime Minister of Haiti, Ariel Henry, Gonaïves]",[https://www.bbc.com/news/world-latin-america-...,"<li><a href=""/wiki/Prime_Minister_of_Haiti"" ti..."
1,2022_January_1,Disasters and accidents\n,2021–2022 Boulder County fires\nAt least three...,/wiki/2021%E2%80%932022_Boulder_County_fires,"[2021–2022 Boulder County fires, Boulder Count...",[https://www.cnn.com/2022/01/01/us/colorado-wi...,"<li><a class=""mw-redirect"" href=""/wiki/2021%E2..."
2,2022_January_1,Disasters and accidents\n,"At least three people are confirmed missing, a...","/wiki/Boulder_County,_Colorado","[Boulder County, Colorado, United States]",[https://www.cnn.com/2022/01/01/us/colorado-wi...,<li>At least three people are confirmed missin...
3,2022_January_1,Disasters and accidents\n,Vaishno Devi Temple stampede\nTwelve people ar...,/wiki/Vaishno_Devi_Temple_stampede,"[Vaishno Devi Temple stampede, Stampede, Vaish...",[https://www.aljazeera.com/news/2022/1/1/12-de...,"<li><a href=""/wiki/Vaishno_Devi_Temple_stamped..."
4,2022_January_1,Disasters and accidents\n,Twelve people are killed and 13 others injured...,/wiki/Stampede#Human_stampedes,"[Stampede, Vaishno Devi Temple, India, Jammu a...",[https://www.aljazeera.com/news/2022/1/1/12-de...,<li>Twelve people are killed and 13 others inj...


In [7]:

from google.colab import files

# Write the events table to a CSV file, then pass that same file to the
# Colab files helper for download (this cell needs the google.colab package).
csv_name = 'events_en.csv'
sdf.to_csv(csv_name)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>