In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

### Processing Pages

In [2]:
url = "https://en.wikipedia.org/w/index.php?title=Talk:Inflation&action=history&offset=&limit=1000"
page = requests.get(url)

Selecting all rows

In [3]:
soup = BeautifulSoup(page.text, 'html.parser')
revTags = soup.findAll("li", {"data-mw-revid" : True})

A function for getting the date, size (in bytes), and score of each edit:

In [4]:
def decodeLi(tag, topic = None, category= None):
    """
    (bs4 tag, str) -> list of [str: topic, str: category, str: date, int: size_of_page, int: size_of_edit, int:abs_size_of_edit]
    """
    date = tag.select("[class*=mw-changeslist-date]")[0].get_text()
    date = date.split(',')
    date = date[1] + ' ' + date[0] #Reordering for JS date parsing

    size = tag.findChild("span", {"class" : "history-size"})["data-mw-bytes"]
    edit_size = tag.select("[class^=mw-plusminus]")[0].get_text().strip("+")
    abs_edit_size = edit_size.strip('−')
    return [topic, category, date, size, edit_size, abs_edit_size]

In [5]:
decodeLi(revTags[100], "foo")

['foo', None, ' 16 July 2021 22:57', '8716', '4', '4']

A function to process all the edits in a bs4 object of the revision history page:

In [6]:
def getEdits(pgSoup, topic=None, category=None):
    """
    (bs4 obj) -> list of lists of form [str, int, int]
    """
    revTags = pgSoup.findAll("li", {"data-mw-revid" : True})
    edits = []
    for tag in revTags:
        edits.append(decodeLi(tag, topic, category))
    return edits

### Fetching Edits for Different Pages

In [7]:
topics = [{'Name' : "Inflation",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Inflation&action=history&offset=&limit=1000",
        'Category': 'Economic Concepts'},
    {'Name' : "Monetary Policy",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Monetary_policy&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Economic Growth",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Economic_growth&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Karl Marx",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Karl_Marx&action=history&offset=&limit=1000",
        'Category': 'Person'},
    {'Name' : "Adam Smith",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Adam_Smith&action=history&offset=&limit=1000",
        'Category': 'Person'},
    {'Name' : "John Maynard Keynes",
    'URL' : "https://en.wikipedia.org/w/index.php?title=John_Maynard_Keynes&action=history&offset=&limit=1000",
        'Category': 'Person'},
    {'Name' : "Milton Friedman",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Milton_Friedman&action=history&offset=&limit=1000",
        'Category': 'Person'},
    {'Name' : "Human Development Index",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Human_Development_Index&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Gross Domestic Product",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Gross_domestic_product&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Bitcoin",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Bitcoin&action=history&offset=&limit=1000",
        'Category': 'Currency'},
    {'Name' : "United States Dollar",
        'URL' : "https://en.wikipedia.org/w/index.php?title=United_States_dollar&action=history&offset=&limit=1000",
        'Category': 'Currency'},
    {'Name' : "Euro",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Euro&action=history&offset=&limit=1000",
        'Category': 'Currency'},
    {'Name' : "COVID-19 Recession",
        'URL' : "https://en.wikipedia.org/wiki/Talk:COVID-19_recession&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "The Great Recession",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Great_Recession&action=history&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "Industrial Revolution",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Industrial_Revolution&action=history&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "The Wall Street Crash",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Wall_Street_Crash_of_1929&action=history&offset=&limit=1000",
        'Category': 'Event'}
    ]

topics = [
    {'Name' : "Economic Growth",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Economic_growth&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Human Development Index",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Human_Development_Index&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Gross Domestic Product",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Gross_domestic_product&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "COVID-19 Recession",
        'URL' : "https://en.wikipedia.org/wiki/Talk:COVID-19_recession&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "Industrial Revolution",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Industrial_Revolution&action=history&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "The Wall Street Crash",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Wall_Street_Crash_of_1929&action=history&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "The Great Recession",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Great_Recession&action=history&offset=&limit=1000",
        'Category': 'Event'},
    {'Name' : "Inflation",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Talk:Inflation&action=history&offset=&limit=1000",
        'Category': 'Economic Concept'},
    {'Name' : "Bitcoin",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Bitcoin&action=history&offset=&limit=1000",
        'Category': 'Currency'},
    {'Name' : "United States Dollar",
        'URL' : "https://en.wikipedia.org/w/index.php?title=United_States_dollar&action=history&offset=&limit=1000",
        'Category': 'Currency'},
    {'Name' : "Euro",
        'URL' : "https://en.wikipedia.org/w/index.php?title=Euro&action=history&offset=&limit=1000",
        'Category': 'Currency'}
    ]


In [15]:
download = True
if download:
    data = []
    for topic in topics:
        page = requests.get(topic['URL'])
        soup = BeautifulSoup(page.text, 'html.parser')
        data = data + getEdits(soup, topic["Name"],topic["Category"] )


In [16]:
df = pd.DataFrame(data)
df.columns = ['Page', 'Category', 'Date', 'Size', 'Edit Size','Abs Edit Size']

In [2]:
df = pd.read_csv('wiki_edits.csv')

In [4]:
df = df.iloc[:, 1:]

In [8]:
df = df.melt(id_vars=['Date'])

In [26]:
df.sort_values(by=['Category'])
df = df.drop('Edit Size', axis=1)
df

Unnamed: 0,Page,Category,Date,Size,Abs Edit Size
0,Economic Growth,Economic Concept,21 October 2022 02:31,125712,265
1,Economic Growth,Economic Concept,20 October 2022 11:25,125447,11
2,Economic Growth,Economic Concept,17 October 2022 15:23,125436,697
3,Economic Growth,Economic Concept,17 October 2022 14:40,124739,761
4,Economic Growth,Economic Concept,17 October 2022 13:47,125500,761
...,...,...,...,...,...
9995,Euro,Currency,31 March 2018 17:38,83186,18
9996,Euro,Currency,29 March 2018 00:04,83168,164
9997,Euro,Currency,28 March 2018 23:51,83004,84
9998,Euro,Currency,28 March 2018 13:35,82920,41


In [9]:
df.to_csv('wiki_edits23.csv')