# Mapping controversies script 8: Explore revision timelines and geographical location of anonymous users

The first time you run the script, the aim is to make a timeline covering the entire history of the Wikipedia page.
You can use this timeline to identify interesting spikes, which you can then use to narrow down your search and output a more specific time period when you run the script again.

- Step 1: Installing the libraries
- Step 2: Load the data
- Step 3: Make a timeline of user revisions
- Step 4: Make a timeline of unique users making revisions
- Step 5: Output the most active users with their stats
- Step 6: Review the "Talk" page to see what the revision users are talking about

## Step 1: Installing the right libraries
Libraries for Jupyter can be understood as preprogrammed pieces of script. This means that instead of writing many lines of code in order to, e.g., contact Wikipedia, you can do it with a single command.



In [None]:
import sys

try: #First, Jupyter tries to import a library
    import matplotlib
    print("matplotlib library has been imported")
except: #If it fails, it will try to install the library
    print("matplotlib library not found. Installing...")
    !pip install matplotlib
    try:#... and try to import it again
        import matplotlib
    except: #unless it fails, and raises an error.
        print("Something went wrong in the installation of the matplotlib library. Please check your internet connection and consult output from the installation below")


## Step 2: Load your data

In order to run the script, click on the cell below and press "Run" in the menu.


In [None]:
import requests
import json
import csv
import datetime


print("Enter the name of the page revision json file you wish to explore further (e.g.: MC_script_7_revisions_circumcision_from_2001-01-01_to_now_all) ")
print("")
filename= input()
print(" ")

print("READ THIS!: You can now choose to narrow down the period of exploration or keep the original interval contained in the revision json file. If this is the first time you run the script, you should keep the original interval. You cannot go beyond the interval of the json file...")

print("Do you wan't to narrow down the period of exploration (y/n)?")
narrow=input()
if narrow.lower()=="y":
    print("What is the new start date (yyyy-mm-dd)?")
    start_date_input=input()
    print("What is the new end date (yyyy-mm-dd)?")
    end_date_input=input()
    
if not narrow:
    narrow="n"
if not filename.endswith(".json"):
    path=filename+".json"
else: 
    path=filename
    filename=filename.split(".")[0]
    
with open(path) as jsonfile:
    revisions = json.load(jsonfile)

dict_of_years={}
if narrow.lower()=="n":
    start_date=filename.split("from_")[1].split("_")[0]
    start_year=int(start_date.split("-")[0])
    end_year="2020"
    end_month=(str(datetime.datetime.now()).split("-")[1])
    end_date=end_year+"-"+end_month+"-01"
    start_month=(start_date.split("-")[1])
if narrow.lower()=="y":
    start_date=start_date_input
    start_year=int(start_date.split("-")[0])
    start_month=(start_date.split("-")[1])
    end_date=end_date_input
    end_year=int(end_date.split("-")[0])
    end_month=(end_date.split("-")[1])
years=[]
for each in range(start_year,int(end_year)+1):
    years.append(str(each))
    
months=['01', '02','03','04','05','06','07','08','09','10','11','12']




for year in years:

    dict_of_years[year]={}
    if year==str(start_year):
        for month in months[months.index(str(start_month)):]:
            dict_of_years[year][month]={"users":[]}
    elif year==str(end_year):
        for month in months[:months.index(str(end_month))]:
            dict_of_years[year][month]={"users":[]}
    else: 
        for month in months:
            dict_of_years[year][month]={"users":[]}


user_ids=[]
for revision in revisions:
    timestamp=revision["timestamp"]
    user_id=revision["userid"]
    user_ids.append(user_id)
    year=timestamp.split('-')[0]
    month=timestamp.split('-')[1]
    if year not in years:
        continue
    if month not in dict_of_years[year]:
        continue
    
    dict_of_years[year][month]["users"].append(user_id)
locale=!pwd
print("The data has been loaded. Continue to step 3, 4 or 5")




## Step 3: Make a simple timeline of user revisions
Note: The timeline will be based on the interval you chose in step 2!

In [None]:
import matplotlib.pyplot as plt

print("Define how you wan't to count the data (year or month)")
input_count=input()
if input_count:
    count_type=input_count
else:
    count_type="month"
print("What would you like to call the timeline?")
timeline_name=input()
timeline_name=timeline_name+"_"+start_date+"_"+end_date+"_UserRevisions.png"
print("Making timeline..")
time_list=[]
revisions_list=[]
if count_type.lower()=="month":
    for year in dict_of_years:
        for month in dict_of_years[year]:
            time_list.append(year+"-"+month)
            revisions_list.append(len(dict_of_years[year][month]["users"]))#, len(set(dict_of_years[year][month]["users"])),page ]
if count_type.lower()=="year":
    for year in dict_of_years:
        users=[]
        
        for month in dict_of_years[year]:
            users=users+dict_of_years[year][month]["users"]
        revisions_list.append(len(users))
        time_list.append(year)

            
%matplotlib inline
plt.style.use('seaborn-whitegrid')


fig = plt.figure(figsize=(len(time_list)*0.30,14*len(time_list)*0.007))
ax = plt.axes()
ax.plot(time_list, revisions_list);
fig.suptitle("Number of revisions", fontsize=round(60*len(time_list)*0.009))

fig.autofmt_xdate()
fig.set
fig.savefig(timeline_name, dpi=100)

print("Your timeline has been saved. Look in the folder of this script")

## Step 4: Make a simple timeline of unique users making revisions
Note: The timeline will be based on the interval you chose in step 2!

In [None]:

print("Define how you wan't to count the data (year or month)")
input_count=input()
if input_count:
    count_type=input_count
else:
    count_type="month"
print("What would you like to call the timeline?")
timeline_name=input()
timeline_name=timeline_name+"_"+start_date+"_"+end_date+"_UniqueUsersRevising.png"
print("Making timeline..")
time_list=[]
revisions_list=[]
if count_type.lower()=="month":
    for year in dict_of_years:
        for month in dict_of_years[year]:
            time_list.append(year+"-"+month)
            revisions_list.append(len(set(dict_of_years[year][month]["users"])))#, len(set(dict_of_years[year][month]["users"])),page ]
if count_type.lower()=="year":
    for year in dict_of_years:
        users=[]
        
        for month in dict_of_years[year]:
            users=users+dict_of_years[year][month]["users"]
        revisions_list.append(len(set(users)))
        time_list.append(year)

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

fig = plt.figure(figsize=(len(time_list)*0.30,14*len(time_list)*0.007))
ax = plt.axes()
ax.plot(time_list, revisions_list);
fig.autofmt_xdate()
fig.suptitle("Number of unique users making revisions", fontsize=round(60*len(time_list)*0.009))
fig.set
fig.savefig(timeline_name, dpi=100)
print("Your timeline has been saved. Look in the folder of this script")

## Step 5: Output the most active users with their stats:
Note: The output will be based on the interval you chose in step 2!
Stats include: number of revisions, the groups they are part of, gender (if entered) and whether they have been blocked.

In [None]:
from collections import Counter
import requests

# Step 5: look up the most active users (by number of revisions in user_ids
# from step 2) through the Wikipedia API and write their stats to a csv file.

print("How many users to you wan't information on (max 50)?")
user_count = int(input())
if user_count > 50:
    user_count = 50
    print("You have asked for too many user stats. Lowering to 50..")
print("What would you like to call your file?")
input_name = input()
csv_path = "Top_" + str(user_count) + "_most_active_users_" + input_name + ".csv"

# Count the revisions per user id and keep only as many users as requested.
cnt = Counter(user_ids)
mc = cnt.most_common(user_count)
user_dict = {}
usr_list = []
for user_id, revision_count in mc:
    user_dict[user_id] = {"revisions_in_set": revision_count}
    usr_list.append(user_id)

# The API takes the ids as one "|"-separated string. User id 0 is left out of
# the lookup (presumably it stands for anonymous edits, which have no account).
long_user = '|'.join(str(user) for user in usr_list if str(user) != "0")

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "list": "users",
    "ususerids": long_user,
    "usprop": "groups|editcount|registration|gender|blockinfo"
}

R = S.get(url=URL, params=PARAMS, timeout=30)
R.raise_for_status()  # stop with a clear error instead of parsing an error page
DATA = R.json()

csv_headers = ["user_id", "user_name", "groups", "gender", "edit_count_total", "edit_count_current_page", "registration_date"]
# The file is opened once and all rows are written through the same writer.
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    wr = csv.writer(f, delimiter=",")
    wr.writerow(csv_headers)
    # The answer has no "query" part when there was nothing to look up.
    for each in DATA.get("query", {}).get("users", []):
        user_id = each.get("userid")
        stats = user_dict.setdefault(user_id, {})
        # Fields the API leaves out (e.g. for accounts it cannot find) become
        # empty cells instead of stopping the script.
        stats["edit_count"] = each.get("editcount", "")
        stats["registration"] = each.get("registration", "")
        stats["groups"] = each.get("groups", "")
        stats["gender"] = each.get("gender", "")
        stats["name"] = each.get("name", "")
        wr.writerow([
            user_id,
            stats["name"],
            stats["groups"],
            stats["gender"],
            stats["edit_count"],
            stats.get("revisions_in_set", ""),
            stats["registration"],
        ])
print("Your file has been saved in the same folder as this script is running from. ")

## Step 6: Review the "Talk" page to see what the revision users are talking about.
The Talk page of a Wikipedia page is where the users discuss and argue about changes and revisions. You can see an example [here](https://en.wikipedia.org/wiki/Talk:Circumcision). If you open the csv file with all the revisions (e.g. MCTutorial4_revisions_circumcision_from_2001-01-01_to_now_all.csv), you might notice that the users often refer to the "Talk" page in their comments on revisions. We can review the Talk pages to get a better understanding of what is at stake!
In order to limit the amount of "Talk" we need to review, you can enter a start and end date.


In [None]:
import requests
import csv
# Step 6: collect the revision ids of a page's Talk page within an interval
# through the Wikipedia API and write one link per Talk revision to a csv file.

print("What page do you want to harvest Talk from?")
page_name = input()
print("")
print("Please be careful aout choosing a long interval, as it might take a while to collect the data. Recommended interval is 1 month.")
print("Enter the start date for Talk on "+page_name+" (yyyy-mm-dd):")
start_date = input()
print("Enter the end date for Talk on "+page_name+" (yyyy-mm-dd):")
end_date = input()

S = requests.Session()

# Same host as the links written to the csv file below.
URL = "https://en.wikipedia.org/w/api.php"
page_name_under = page_name.replace(" ", "_")

PARAMS = {
    "action": "query",
    "prop": "revisions",
    "titles": "talk:"+page_name,
    "rvprop": "timestamp|ids",
    "rvslots": "main",
    "rvdir": "newer",  # oldest revision first
    "rvlimit": "500",
    "rvstart": start_date+"T00:00:00Z",
    "rvend": end_date+"T00:00:00Z",
    "formatversion": "2",
    "format": "json"
}

# The API answers in batches of at most rvlimit revisions. As long as an answer
# contains a "continue" part, its values are added to the same request
# parameters to fetch the next batch.
rev_ids = []
while True:
    R = S.get(url=URL, params=PARAMS, timeout=30)
    R.raise_for_status()  # stop with a clear error instead of parsing an error page
    DATA = R.json()
    for page in DATA.get("query", {}).get("pages", []):
        # A page without revisions in the interval has no "revisions" entry.
        for each in page.get("revisions", []):
            rev_ids.append(each["revid"])
    if "continue" not in DATA:
        break
    PARAMS.update(DATA["continue"])

# Remove duplicates but keep the order in which the API returned the revisions.
unique_rev_ids = list(dict.fromkeys(rev_ids))
if len(unique_rev_ids) == 0:
    print("The script harvested "+str(len(unique_rev_ids))+" unique talks. Try another interval")
else:
    print("The script harvested "+str(len(unique_rev_ids))+" unique talks.")
csv_path = "Talk_from_"+page_name+"_from_"+start_date+"_to_"+end_date+".csv"

csv_headers = ["Talk id", "Talk url", "page name"]
# The file is opened once and all rows are written through the same writer.
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    wr = csv.writer(f, delimiter=",")
    wr.writerow(csv_headers)
    for each in unique_rev_ids:
        url_ = "https://en.wikipedia.org/w/index.php?title=Talk:"+page_name_under+"&oldid="+str(each)
        wr.writerow([each, url_, page_name_under])

print("Script is done. You can find your file in the folder.")