# Mapping controversies script 7: Harvest revision timeline from Wikipedia pages

In Wikipedia's __[revision history](https://en.wikipedia.org/w/index.php?title=Circumcision&action=history)__ you can study how users delete, append and alter content on the pages. Each alteration is logged with a timestamp, user information and a complete record of the changes made.  
<img src="https://res.cloudinary.com/dra3btd6p/image/upload/v1549464567/Mapping%20controversies%202019/revision_history_cir.jpg" title="Category:circumcision" style="width: 800px;" /> 

We can reuse this information to indicate when a topic is disputed and by whom. 

In this tutorial we will harvest the revision timelines of the topics you are working on.  

The script will output two CSV files, each accompanied by a JSON file with the same data. The first includes all revisions (one revision per row) and the other counts how many revisions are made per time period.



## Step 1: Installing the right libraries
Libraries for Jupyter can be understood as preprogrammed script parts. This means that instead of writing many lines of code in order to, e.g., contact Wikipedia, you can do it in one command.


__Note: in this workbook we will be using the requests and geolite2 libraries. If you have already installed them once, there is no need to do it again. You may simply skip to step 2.__

In [None]:
# In this cell Jupyter checks whether you have the right libraries installed

import importlib
import subprocess
import sys


def _pip_install(package_name):
    """Install ``package_name`` with pip and print what pip reported.

    pip is started through ``sys.executable`` so that the package ends up in
    the same Python environment the notebook is running in.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", package_name],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    print(result.stdout)
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()


try:  # First, Jupyter tries to import a library
    import requests
    print("Requests library has been imported")
except ImportError:  # If it is not installed, it will try to install the library
    print("Requests library not found. Installing...")
    _pip_install("requests")
    try:  # ... and try to import it again
        import requests
    except ImportError:  # unless it fails, and an error is printed.
        print("Something went wrong in the installation of the requests library. Please check your internet connection and consult output from the installation below")

try:  # First, Jupyter tries to import a library
    import geolite2
    print("geolite2 library has been imported")
except ImportError:  # If it is not installed, it will try to install the library
    print("geolite2 library not found. Installing...")
    # The module is called geolite2, but the pip package is maxminddb-geolite2.
    _pip_install("maxminddb-geolite2")
    try:  # ... and try to import it again
        import geolite2
    except ImportError:  # unless it fails, and an error is printed.
        print("Something went wrong in the installation of the geolite2 library. Please check your internet connection and consult output from the installation below")
  

## Step 2: Harvest revision timeline from Wikipedia

The Wikipedia API for revisions hosts an array of different data points, as you can see in the image below. In this script, we only harvest timestamp, user, comment, slotsize, userid, ids and tags. You can think about how the other data points might be used in your controversy mapping project.      

<img src="https://res.cloudinary.com/dra3btd6p/image/upload/v1549470946/Mapping%20controversies%202019/revision_params.jpg" title="Category:circumcision" style="width: 500px;" /> 

In order to run the script, click on the cell below and press "Run" in the menu.

You can see the documentation for the revision API here: https://www.mediawiki.org/wiki/API:Revisions

In [None]:
import requests
import json
import csv
import datetime
import re
from geolite2 import geolite2
# Reader for the GeoLite2 database that comes with the geolite2 package.
# It is used further down to look up coordinates for IP addresses.
reader = geolite2.reader()

# Raw result pages from the API are collected in this list by the harvesting code.
Revisions = []

# One session is reused for every request to the Wikipedia API.
S = requests.Session()

print('Enter the desired language version of wikipedia (e.g. "en","da","fr",etc.) or leave blank to use default (english):')

# The language code becomes part of the host name, so stray whitespace is removed.
input_lan = input().strip()
lan = input_lan if input_lan else "en"

print('Enter the name of the Wikipedia page you wish to query for revisions ')
page = input().strip()
# Example: page = "circumcision"

print("Enter start date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2001-01-01")
input_date = input().strip()
if not input_date:
    start_date = "2001-01-01"
else:
    # strptime raises a ValueError for anything that is not a real yyyy-mm-dd
    # date, which stops the script before a broken request is sent.
    # isoformat() zero-pads the result (e.g. 2019-2-5 becomes 2019-02-05).
    start_date = datetime.datetime.strptime(input_date, "%Y-%m-%d").date().isoformat()

print("Define how you wan't to count the data (year or month). Leave blank to use default: month")
input_count = input().strip().lower()
if input_count in ("year", "month"):
    count_type = input_count
else:
    # Only "year" and "month" are understood by the counting code; anything
    # else would produce a count file without any rows.
    if input_count:
        print('Unknown count type "' + input_count + '". Using default: month')
    count_type = "month"

print("Harvesting revision history...")
URL = "https://"+lan+".wikipedia.org/w/api.php"

# Parameters shared by the first request and all follow-up requests.
PARAMS = {
    "action": "query",
    "prop": "revisions",
    "titles": page,
    "rvlimit": "500",
    "rvprop": "timestamp|user|comment|slotsize|userid|ids|tags",
    "rvdir": "newer",
    "rvstart": start_date+"T00:00:00Z",
    "formatversion": "2",
    "format": "json"
}

# The API hands out at most "rvlimit" revisions per request. As long as the
# answer contains a "continue" object, its values are sent back with the next
# request to fetch the following batch.
continue_params = {}
while True:
    request_params = dict(PARAMS)
    request_params.update(continue_params)

    R = S.get(url=URL, params=request_params)
    # Stop with a clear HTTP error instead of failing later while parsing JSON.
    R.raise_for_status()
    DATA = R.json()

    if "error" in DATA:
        raise RuntimeError("The Wikipedia API returned an error: " + str(DATA["error"]))

    for each in DATA.get("query", {}).get("pages", []):
        # A page that does not exist is still answered with HTTP status 200;
        # with formatversion 2 the page entry is flagged with "missing" instead.
        if each.get("missing"):
            print("The page does not exist")
        Revisions.append(each)

    if "continue" not in DATA:
        break
    continue_params = DATA["continue"]
# Flat list with one dictionary per revision, built from the raw API pages.
revisions=[]

# Keys that the rest of the script expects on every revision. A revision that
# lacks one of them gets the placeholder "n/a".
# "size" is included because the CSV export reads every["size"]; without a
# default a revision lacking it would raise a KeyError there.
# NOTE(review): the API appears to deliver the value requested as "slotsize"
# under the key "size" - confirm against the API documentation.
expected_keys = ("user", "userid", "comment", "slotsize", "size", "tags", "revid", "parentid")

for each in Revisions:
    # Pages without any revision (e.g. missing pages) have no "revisions" key.
    for every in each.get("revisions", []):
        for key in expected_keys:
            every.setdefault(key, "n/a")
        revisions.append(every)

import ipaddress

# Page titles can contain characters such as "/" or ":" that are not allowed
# in file names, so they are replaced before the title is used in a path.
safe_page = re.sub(r'[\\/:*?"<>|]', "_", page)
filename='MC_script_7_'+'revisions_'+safe_page+'_from_'+start_date+'_to_now'
json_path = filename+'_all.json'
csv_path=filename+'_all.csv'

print("Done harvesting revision history. Looking up geolocations for anonymous users")
for each in revisions:
    user=each["user"]
    lat=""
    long=""
    try:
        # Only user names that are IP addresses (IPv4 or IPv6) are looked up;
        # ip_address raises a ValueError for every other user name.
        ipaddress.ip_address(user)
        geo=reader.get(user)
    except (ValueError, TypeError):
        geo=None
    # The lookup gives None for addresses that are not in the database.
    if geo and "location" in geo:
        lat=geo["location"].get("latitude", "")
        long=geo["location"].get("longitude", "")
    each["latitude"]=lat
    each["longitude"]=long

with open(json_path, 'w') as outfile:
    json.dump(revisions, outfile)

# Column names of the revision table; the first row of the CSV file.
headers = [
    'revision_id',
    'parent_id',
    'user_name',
    'user_id',
    'timestamp',
    'yyyy-mm-dd',
    'yyyy',
    'mm',
    'dd',
    'size',
    'comment',
    'tags',
    'latitude',
    'longitude',
]
csv_list = [headers]

# One row per revision, with the date part of the timestamp split into
# separate year, month and day columns.
for rev in revisions:
    date = rev['timestamp'].partition('T')[0]
    date_parts = date.split('-')
    year = date_parts[0]
    month = date_parts[1]
    day = date_parts[2]
    csv_list.append([
        rev["revid"],
        rev["parentid"],
        rev["user"],
        rev["userid"],
        rev["timestamp"],
        date,
        year,
        month,
        day,
        rev["size"],
        rev["comment"],
        rev["tags"],
        rev["latitude"],
        rev["longitude"],
    ])

# The revision table is written with semicolons as column separators.
with open(csv_path, "w", newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file, delimiter=";").writerows(csv_list)
import os

# First year that gets a row in the count table.
start_year=int(start_date.split("-")[0])

# The table runs up to the current year and month, so that revisions made
# after any fixed year still have a slot to be counted in.
# NOTE(review): the timestamps presumably are in UTC (the request uses a "Z"
# suffix), which is why "now" is taken in UTC as well - confirm.
now=datetime.datetime.now(datetime.timezone.utc)
years=[str(each) for each in range(start_year, now.year+1)]

months=['01', '02','03','04','05','06','07','08','09','10','11','12']

csv_path_count=filename+'_count_'+count_type+'.csv'
json_path_count=filename+'_count_'+count_type+'.json'
headers=["Time period ("+count_type+")", "Revision count", "Unique users count", "Title"]

# dict_of_years[year][month]["users"] holds one user id per revision made in
# that month. Every month is created up front so that months without any
# revision still show up in the output with a count of zero.
dict_of_years={}
for year in years:
    if year==str(now.year):
        months_in_year=months[:now.month]
    else:
        months_in_year=months
    dict_of_years[year]={month: {"users": []} for month in months_in_year}

for revision in revisions:
    timestamp=revision["timestamp"]
    user_id=revision["userid"]
    year=timestamp.split('-')[0]
    month=timestamp.split('-')[1]
    # setdefault keeps a revision outside the prepared range from raising a KeyError.
    # NOTE(review): anonymous (IP) edits presumably all share one userid, so
    # they would count as a single unique user - confirm, and consider counting
    # by revision["user"] instead.
    dict_of_years.setdefault(year, {}).setdefault(month, {"users": []})["users"].append(user_id)

with open(json_path_count, 'w') as outfile:
    json.dump(dict_of_years, outfile)

csv_list=[]
if count_type.lower()=="month":
    for year in sorted(dict_of_years):
        for month in sorted(dict_of_years[year]):
            users=dict_of_years[year][month]["users"]
            csv_list.append([year+"-"+month, len(users), len(set(users)), page])
elif count_type.lower()=="year":
    for year in sorted(dict_of_years):
        users=[]
        for month in sorted(dict_of_years[year]):
            users.extend(dict_of_years[year][month]["users"])
        csv_list.append([year, len(users), len(set(users)), page])

# Header and rows are written in one go; the count table uses commas as separators.
with open(csv_path_count,"w", newline='',encoding='utf-8') as f:
    wr = csv.writer(f, delimiter=",")
    wr.writerow(headers)
    wr.writerows(csv_list)

# os.getcwd() gives the working directory on every operating system.
output_dir=os.getcwd()
print("The script is done. Find the outputs here:")
for path in (csv_path_count, json_path_count, csv_path, json_path):
    print(os.path.join(output_dir, path))



## Harvest revision timeline for all pages in one or multiple category JSON files

In [None]:
import requests
import json
import csv
import datetime
import re
from geolite2 import geolite2
# Reader for the GeoLite2 database that comes with the geolite2 package.
# It is used further down to look up coordinates for IP addresses.
reader = geolite2.reader()

Revisions = []

# One session is reused for every request to the Wikipedia API.
S = requests.Session()

print('Enter the desired language version of wikipedia (e.g. "en","da","fr",etc.) or leave blank to use default (english):')

# The language code becomes part of the host name, so stray whitespace is removed.
input_lan = input().strip()
lan = input_lan if input_lan else "en"

print('Enter the name of the Wikipedia category json you wish to query for revisions (e.g.category_members_circumcision_depth_2). If you want to harvest from multiple category json files, separate them with a comma')
cat = input()

# Titles of all pages to harvest, in the order they are first seen.
pages=[]
seen_titles=set()

# A single file name is handled the same way as a comma-separated list.
for each in cat.split(","):
    each=each.strip()
    if not each:
        continue
    # The ".json" extension is optional in the input.
    path=each if each.endswith(".json") else each+".json"

    with open(path, encoding="utf-8") as jsonfile:
        cat_members = json.load(jsonfile)

    for every in cat_members:
        title=every["title"]
        # A page that is a member of several categories is harvested only
        # once; otherwise its revisions would appear twice in the output.
        if title not in seen_titles:
            seen_titles.add(title)
            pages.append(title)

if not cat.strip(" ,"):
    raise ValueError("No category json file name was entered")

print("Enter start date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2001-01-01")
input_date = input().strip()
if not input_date:
    start_date = "2001-01-01"
else:
    # strptime raises a ValueError for anything that is not a real yyyy-mm-dd
    # date; isoformat() zero-pads the result (e.g. 2019-2-5 becomes 2019-02-05).
    start_date = datetime.datetime.strptime(input_date, "%Y-%m-%d").date().isoformat()
# Flat list with one dictionary per revision, across all harvested pages.
revisions=[]
print("Starting harvest of revision history for "+str(len(pages))+" pages")

# The API address is the same for every page.
URL = "https://"+lan+".wikipedia.org/w/api.php"

# Keys that the rest of the script expects on every revision. A revision that
# lacks one of them gets the placeholder "n/a".
# "size" is included because the CSV export reads each["size"].
# NOTE(review): the API appears to deliver the value requested as "slotsize"
# under the key "size" - confirm against the API documentation.
expected_keys = ("user", "userid", "comment", "slotsize", "size", "tags", "revid", "parentid")

for page in pages:
    # Raw API result pages for the current page only.
    Revisions=[]

    print("Harvesting revision history for "+page)

    PARAMS = {
        "action": "query",
        "prop": "revisions",
        "titles": page,
        "rvlimit": "500",
        "rvprop": "timestamp|user|comment|slotsize|userid|ids|tags",
        "rvdir": "newer",
        "rvstart": start_date+"T00:00:00Z",
        "formatversion": "2",
        "format": "json"
    }

    # The API hands out at most "rvlimit" revisions per request. As long as
    # the answer contains a "continue" object, its values are sent back with
    # the next request to fetch the following batch.
    continue_params = {}
    while True:
        request_params = dict(PARAMS)
        request_params.update(continue_params)

        R = S.get(url=URL, params=request_params)
        # Stop with a clear HTTP error instead of failing later while parsing JSON.
        R.raise_for_status()
        DATA = R.json()

        if "error" in DATA:
            raise RuntimeError("The Wikipedia API returned an error: " + str(DATA["error"]))

        for each in DATA.get("query", {}).get("pages", []):
            # A page that does not exist is still answered with HTTP status
            # 200; with formatversion 2 the entry is flagged with "missing".
            if each.get("missing"):
                print("The page does not exist")
            Revisions.append(each)

        if "continue" not in DATA:
            break
        continue_params = DATA["continue"]

    for each in Revisions:
        # Pages without any revision (e.g. missing pages) have no "revisions" key.
        for every in each.get("revisions", []):
            for key in expected_keys:
                every.setdefault(key, "n/a")
            # Remember which page the revision belongs to.
            every["page"]=page

            revisions.append(every)
import ipaddress

# The category input may contain path separators or other characters that are
# not allowed in file names, so they are replaced before it is used in a path.
safe_cat = re.sub(r'[\\/:*?"<>|]', "_", cat)
filename='MC_script_7_'+'revisions_'+safe_cat+'_from_'+start_date+'_to_now'
json_path = filename+'_all.json'
csv_path=filename+'_all.csv'

for each in revisions:
    user=each["user"]
    lat=""
    long=""
    try:
        # Only user names that are IP addresses (IPv4 or IPv6) are looked up;
        # ip_address raises a ValueError for every other user name.
        ipaddress.ip_address(user)
        geo=reader.get(user)
    except (ValueError, TypeError):
        geo=None
    # The lookup gives None for addresses that are not in the database.
    if geo and "location" in geo:
        lat=geo["location"].get("latitude", "")
        long=geo["location"].get("longitude", "")
    each["latitude"]=lat
    each["longitude"]=long

with open(json_path, 'w') as outfile:
    json.dump(revisions, outfile)

import os

# Column names of the revision table; the first row of the CSV file.
headers=['revision_id','parent_id', 'user_name', 'user_id', 'timestamp','size','comment', 'tags','latitude','longitude',"page"]
csv_list=[headers]

# One row per revision, in the same order as the headers.
for each in revisions:
    entry=[each["revid"], each["parentid"],each["user"],each["userid"],each["timestamp"],each["size"],each["comment"],each["tags"],each["latitude"],each["longitude"],each["page"]]
    csv_list.append(entry)

# The revision table is written with semicolons as column separators.
with open(csv_path,"w", newline='',encoding='utf-8') as f:
    wr = csv.writer(f, delimiter=";")
    wr.writerows(csv_list)

# os.getcwd() gives the working directory on every operating system.
output_dir=os.getcwd()

print(os.path.join(output_dir, csv_path))
print(os.path.join(output_dir, json_path))


