# Mapping controversies script 9 extended more: Users' revisions outside category members sphere
This set of scripts will help you extend the user-revision networks to include all the revisions users make on pages outside the Wikipedia category.

__BEFORE YOU START! Take a moment to draw what this actually means.__

## Step 1: Installing the right libraries
Libraries for Jupyter can be understood as preprogrammed script parts. This means that instead of writing many lines of code in order to, for example, connect to Wikipedia, you can do it in one command.


__Note: in this workbook we will be using the requests and networkx libraries. If you have already installed them once, there is no need to do it again. You may simply skip to step 2.__

In [None]:
# In this cell Jupyter checks whether you have the right libraries installed.
# Each library is imported; if the import fails it is installed with pip and
# the import is attempted once more.

import importlib
import subprocess
import sys


def _ensure_library(module_name, display_name):
    """Make sure *module_name* can be imported, installing it if needed.

    module_name is the name used both for the import and for pip;
    display_name is the name shown in the messages printed to the user.
    Returns True when the library is importable afterwards, False otherwise.
    """
    try:  # First, try to import the library
        importlib.import_module(module_name)
    except ImportError:  # If it is missing, try to install it
        print(display_name+" library not found. Installing...")
        # pip is run through the interpreter that executes this cell, so the
        # package is installed for that interpreter and not for another
        # Python that happens to be first on the PATH.
        installation = subprocess.run(
            [sys.executable, "-m", "pip", "install", module_name],
            capture_output=True,
            text=True,
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        try:  # ... and try to import it again
            importlib.import_module(module_name)
        except ImportError:  # still missing: report it and show pip's output
            print("Something went wrong in the installation of the "+display_name+" library. Please check your internet connection and consult output from the installation below")
            print(installation.stdout)
            print(installation.stderr)
            return False
    else:
        print(display_name+" library has been imported")
    return True


# The plain import statements keep the module names available in the
# notebook, exactly as a direct "import requests" would.
if _ensure_library("requests", "Requests"):
    import requests
if _ensure_library("networkx", "NetworkX"):
    import networkx

        

## Step 2: Make a user list based on revisions on pages in a wikipedia category json file or single wikipedia pages
This step will make a list of all the users who make revisions on the pages you input. The list will be used later in the script.

In order to run the script, click on the cell below and press "Run" in the menu.

In [1]:
import requests
import json
import csv
import networkx as nx
# Collect the titles of the pages whose revision histories will be harvested.
# The titles come from category member json files, from manual input, or both.


def _split_entries(raw_text):
    """Split a comma separated answer into stripped, non-empty entries."""
    return [entry.strip() for entry in raw_text.split(",") if entry.strip()]


filename=""
print("How do you want to input the pages to use to make the user list?")
print("Enter '1' if you want to use a category members json file.")
print("Enter '2' if you want to enter the pages manually.")
print("Enter '0' if you want to use category members json file AND enter pages manually.")
pages=[]
# input() always returns a string, so the choice is compared with strings only.
input_style=input().strip()
if input_style not in ("0", "1", "2"):
    print("'"+input_style+"' is not one of the options 0, 1 or 2. No pages were added.")

if input_style in ("1", "0"):
    print("Enter the name of the category members json file you wish to make a user list from (e.g.cat_members_circumcision_depth_2). If you have multiple files separate them with a comma")
    filename= input()
    # A single file name is simply a list with one entry, so one loop covers
    # both the single-file and the multi-file case.
    for entry in _split_entries(filename):
        path = entry if entry.endswith(".json") else entry+".json"
        with open(path, encoding="utf-8") as jsonfile:
            cat_members = json.load(jsonfile)
        for member in cat_members:
            pages.append(member['title'])

if input_style in ("2", "0"):
    print("Enter the names of the pages you wish to make a user list from. If multiple pages use comma separation (e.g. circumcision,Female genital mutilation etc)")
    raw_input=input()
    # Surrounding spaces ("a, b") are removed so they do not end up in a title.
    pages.extend(_split_entries(raw_input))

# Ask for the harvest settings: language version, date range and blacklist.
import datetime


def _ask_date(default):
    """Read a yyyy-mm-dd date from the user; blank input returns *default*.

    The question is repeated until the answer can be parsed as a date. The
    returned string is always zero padded (yyyy-mm-dd).
    """
    while True:
        answer = input().strip()
        if not answer:
            return default
        try:
            parsed = datetime.datetime.strptime(answer, "%Y-%m-%d")
        except ValueError:
            print("The date must be in the format yyyy-mm-dd. Please try again:")
        else:
            return parsed.strftime("%Y-%m-%d")


S = requests.Session()

print('Enter the desired language version of wikipedia (e.g. "en","da","fr",etc.) or leave blank to use default (english):')

input_lan = input().strip()
if not input_lan:
    lan="en"
else:
    lan=input_lan

print("Enter start date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2001-01-01")
start_date=_ask_date("2001-01-01")
print("Enter end date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2020-03-01")
end_date=_ask_date("2020-03-01")
blacklist=[]

print("Enter user names you want to blacklist. If you want to blacklist multiple users, use comma separation.")
# The answer is used as typed; it is no longer overwritten with an empty
# string before it is parsed.
raw_blacklist=input()
blacklisted_users=[name.strip() for name in raw_blacklist.split(",") if name.strip()]

# Harvest the revision history of every input page. `users` receives one
# entry per revision (so it can be counted later) and `user_dict` maps each
# user name to the timestamps of that user's revisions.
print("Harvesting data from "+str(len(pages))+" input pages...")
count=0
users=[]
user_dict={}
URL = "https://"+lan+".wikipedia.org/w/api.php"
for page in pages:
    count=count+1
    Revisions = []
    PARAMS = {
        "action": "query",
        "prop": "revisions",
        "titles": page.strip(),
        "rvlimit": "500",
        "rvprop": "user|timestamp",
        "rvdir": "newer",
        "rvstart": start_date+"T00:00:00Z",
        "rvend": end_date+"T00:00:00Z",
        "formatversion": "2",
        "format": "json"
    }

    # One request per batch of revisions; the loop ends when the reply no
    # longer contains a 'continue' element or when a request fails.
    while True:
        R = S.get(url=URL, params=PARAMS)
        if R.status_code != 200:
            print("The request for '"+page+"' failed with HTTP status "+str(R.status_code)+". Skipping...")
            break
        DATA = R.json()
        if "error" in DATA:
            # An error reply has no 'query' element to read.
            print("The API reported an error for '"+page+"': "+str(DATA["error"])+". Skipping...")
            break
        Revisions.extend(DATA.get("query", {}).get("pages", []))
        if "continue" not in DATA:
            break
        # Feed every continuation value back into the next request.
        PARAMS.update(DATA["continue"])

    for page_entry in Revisions:
        if page_entry.get("missing"):
            print("The page '"+str(page_entry.get("title", page))+"' does not exist. Skipping...")
            continue
        for revision in page_entry.get("revisions", []):
            editor = revision.get("user")
            # Revisions without a visible user name carry no 'user' key.
            if editor is None or editor in blacklisted_users:
                continue
            users.append(editor)
            user_dict.setdefault(editor, {"timestamps": []})["timestamps"].append(revision["timestamp"])

    if count % 50 == 0:
        print("Data harvested from "+str(count)+" pages out of "+str(len(pages))+". Continuing harvest...")
print("Done harvesting users")

# Report how many distinct users were found and save the complete user list
# (one entry per revision) as a JSON file named after the date range.
unique_user_total = len(set(users))
for message in (
    "There are "+str(unique_user_total)+" unique users making revisions in the category/page",
    "The users are stored in memory, and you may continue to step 3 or 4",
    "Dumping JSON file with all users. ",
):
    print(message)

filename_json = "".join(("users_making_revisions_", start_date, "_to_", end_date, ".json"))

with open(filename_json, 'w') as outfile:
    outfile.write(json.dumps(users))

How do you want to input the pages to use to make the user list?
Enter '1' if you want to use a category members json file.
Enter '2' if you want to enter the pages manually.
Enter '0' if you want to use category members json file AND enter pages manually.
2
Enter the names of the pages you wish to make a user list from. If multiple pages use comma separation (e.g. circumcision,Female genital mutilation etc)
COVID-19 pandemic lockdowns
Enter the desired language version of wikipedia (e.g. "en","da","fr",etc.) or leave blank to use default (english):
en
Enter start date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2001-01-01

Enter end date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2020-03-01

Enter user names you want to blacklist. If you want to blacklist multiple users, use comma separation.

Harvesting data from 1 input pages...
Done harvesting users
There are 0 unique users making revisions in the category/page
The users 

## Step 3: Export timeline information on top active users
This script will output a CSV file with the monthly revision counts of the top n users (ranked by their number of revisions), so that you can make a simple timeline in Tableau or a spreadsheet editor.

In [3]:
from collections import Counter
import datetime
# Count the revisions per month for the most active users and export the
# counts as a semicolon separated CSV file.
cnt=Counter(users)
print("How many top users do you wan't to include?")
# Repeat the question until a whole number is entered.
while True:
    try:
        top=int(input().strip())
        break
    except ValueError:
        print("Please enter a whole number.")

start_year=int(start_date.split("-")[0])
end_year=int(end_date.split("-")[0])
years=[str(each) for each in range(start_year,end_year+1)]

months=['01', '02','03','04','05','06','07','08','09','10','11','12']
top_users=[each[0] for each in cnt.most_common(top)]

# Months of the running year that have not started yet are left out of the
# table; the running year is read from the clock instead of being fixed.
now=datetime.datetime.now()
current_year=str(now.year)

dict_of_years={}
csv_path_count='top_'+str(top)+'_user_revisions_count_month.csv'
headers=["user","Time period (months)", "Revision count"]

csv_total=[headers]
for user in top_users:
    user_years={}
    for year in years:
        listed_months = months[:now.month] if year==current_year else months
        user_years[year]={month:{"count":0} for month in listed_months}
    dict_of_years[user]={"years":user_years}

for user in top_users:
    user_years=dict_of_years[user]["years"]
    for timestamp in user_dict[user]["timestamps"]:
        # Timestamps start with yyyy-mm, e.g. 2020-03-15T12:00:00Z
        year, month = timestamp.split('-')[:2]
        # A period missing from the prepared table is added on demand
        # instead of raising a KeyError.
        bucket=user_years.setdefault(year,{}).setdefault(month,{"count":0})
        bucket["count"]=bucket["count"]+1

for user in dict_of_years:
    for year in dict_of_years[user]["years"]:
        for month in dict_of_years[user]["years"][year]:
            count=dict_of_years[user]["years"][year][month]["count"]
            period=str(year)+"-"+str(month)
            csv_total.append([user,period,count])
with open(csv_path_count,"w", newline='',encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerows(csv_total)
print("done")

How many top users do you wan't to include?
10
done


## Step 4: Harvest ALL revisions from the users on the user list

This script will harvest all revisions made by the users, who make revisions within the category, categories or pages you inputted in step 2. 

__Be aware that including many users in combination with a wide date range might increase the time it takes to run the script dramatically.__ 

It is impossible to say how long a harvest will take, as we don't know in advance how many revisions users make. But as a rule of thumb, if you want to harvest activity from all the users, you should limit the date range to a week or so, and evaluate how long it takes. You can then increase the date range to e.g. a month. __You will be able to set a new date range below__.
Alternatively, the script allows you to filter out users from the list based on their activity (high/low), and make a randomized sample. 

It might not always be the most active users, that are the most interesting. What do you think characterises the most active users? 


In [None]:
from collections import Counter

import random
import sys
import json
import requests
import os
import time

# Ask for the language version and the blacklist, then make sure the user
# list from step 2 is available before going on.
print("Before you start, please read the text above!")

print('Enter the desired language version of wikipedia (e.g. "en","da","fr",etc.) or leave blank to use default (english):')

input_lan = input()
lan = input_lan if input_lan else "en"

print("Enter user names you want to blacklist. If you want to blacklist multiple users, use comma separation.")
raw_blacklist = input()

# Splitting an answer without a comma yields the answer itself as the only
# item, so one expression covers single and multiple names. Names are stored
# stripped and lower-cased.
blacklisted_users = [entry.strip().lower() for entry in raw_blacklist.split(",")]

if 'users' not in locals():
    sys.exit("Go to step 2")

print("The user list currently include "+str(len(set(users)))+" unique users.")
    


# Optionally remove users with 'bot' in their name from the user list.
print("Do you wan't to exclude users that has 'bot' in its name (y/n)?")
bot=input()
if bot.strip().lower()=="y":
    # `users` holds one entry per revision. The entries are filtered in place
    # rather than deduplicated, so the per-user revision counts used for the
    # activity ranking further down stay intact.
    users=[user for user in users if "bot" not in user.lower()]
    print("The user list now contains "+str(len(set(users)))+" users")
# When the answer is not 'y' the user list is left untouched.

# Filled further down with the users that will actually be harvested.
new_users=[]
# Ask how the users should be filtered, how many revisions to fetch per user
# and which date range to harvest.


def _ask_whole_number(default):
    """Read a whole number from the user; blank input returns *default*.

    The question is repeated until the answer is blank or a valid integer.
    """
    while True:
        answer = input().strip()
        if not answer:
            return default
        try:
            return int(answer)
        except ValueError:
            print("Please enter a whole number or leave blank.")


print("Would you like to filter the users (y/n)? (Most likely yes!)")

filter_users=input()
unique_user_count=len(set(users))
if filter_users.strip().lower()=="y":
    print("Select how you wan't to filter the users")
    print("Enter '1' if you ONLY wan't to include the top n most active users (where 'n' will be set later)")
    print("Enter '2' if you ONLY wan't to include the n least active users (where 'n' will be set later)")
    print("Enter '3' if you wan't to make a randomized sample of n users (where 'n' will be set later)")
    choice=input().strip()
    # Any other answer would select no users at all, so ask again.
    while choice not in ("1", "2", "3"):
        print("Please enter 1, 2 or 3")
        choice=input().strip()
    print("Enter the number of users you wan't to include")
    limit=_ask_whole_number(unique_user_count)
    # Keep the number between zero and the number of available users.
    limit=max(0, min(limit, unique_user_count))

else:
    choice=""
    for user in list(set(users)):
        if user.lower() not in blacklisted_users:
            new_users.append(user)
print("Enter a limit for how many revisions you would like to include per user (will round up to nearest 500 per user). Leave blank for no limit:")
# A very large number stands in for "no limit".
user_limit=_ask_whole_number(100000000)


print("Enter start date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2001-01-01")
start_date=input().strip()
if not start_date:
    start_date="2001-01-01"
print("Enter end date for revisions in the format: yyyy-mm-dd. Leave blank to use default date: 2020-03-01")
end_date=input().strip()
if not end_date:
    end_date="2020-03-01"

# Rank the users by number of revisions, create the export folder and pick
# the users to harvest according to the chosen filter.
cnt=Counter(users)

print("The script will make a folder to which the exports will be saved. What would you like to call it? Leave blank for default 'json dumps user revisions'.")
print("IMPORTANT NOTE: if you have run the script before, DO NOT use the same folder, as the script will overwrite the content!!")


# Keep asking until a folder that does not exist yet has been created.
K=True
while K:
    raw_dump_path=input()
    if not raw_dump_path:
        raw_dump_path="json dumps user revisions"
    dump_path=raw_dump_path+"/"
    try:
        # makedirs also creates missing parent folders of a nested name.
        os.makedirs(dump_path)
        K=False
    except FileExistsError:
        print("The folder already exists. Please enter another name")


# (user, revision count) pairs, most active user first
ranged_cnt=cnt.most_common()
if choice=="1":
    selected=ranged_cnt[:limit]
elif choice=="2":
    selected=ranged_cnt[len(cnt)-limit:]
elif choice=="3":
    # Sample without replacement, so no user is picked twice and the sample
    # really consists of distinct users.
    sample_size=max(0, min(limit, len(ranged_cnt)))
    selected=random.sample(ranged_cnt, sample_size)
else:
    selected=[]

for each in selected:
    if each[0].lower() not in blacklisted_users:
        new_users.append(each[0])

    
# Harvest the revisions made by every selected user and write them to JSON
# dump files in the export folder (a new file for every 1000 users).
S = requests.Session()
URL = "https://"+lan+".wikipedia.org/w/api.php"

# Lower-cased titles of the input pages, used to mark whether a harvested
# page belongs to the category/pages from step 2.
pages_in_cat = {title.strip().lower() for title in pages}
revision_cap = int(user_limit)


def _fetch_json(params):
    """Send *params* to the API and return the decoded JSON reply.

    A failed request or an unreadable reply is retried once after five
    seconds; if the retry fails too, the script exits.
    """
    for attempt in range(2):
        try:
            return S.get(url=URL, params=params).json()
        except (requests.RequestException, ValueError):
            if attempt == 0:
                time.sleep(5)
    sys.exit("Something went wrong. Exiting...")


def _store_pages(user_pages, api_pages):
    """Add the revisions in *api_pages* to *user_pages*.

    user_pages maps a page id to the record kept for that page; api_pages is
    the 'allrevisions' list of one API reply. Returns the number of revisions
    that were stored.
    """
    stored = 0
    for page in api_pages:
        page_id = page["pageid"]
        page_title = page["title"]
        # Titles containing 'talk' are reduced to the lower-cased part after
        # the last colon, so that a talk page is compared by article name.
        if "talk" in page_title.lower():
            page_title = page_title.lower().split(":")[-1]
        in_cat = "yes" if page_title.lower() in pages_in_cat else "no"
        page_revisions = list(page["revisions"])
        stored += len(page_revisions)
        if page_id not in user_pages:
            user_pages[page_id] = {"page_title": page_title, "is_page_in_cat": in_cat, "page_revisions": page_revisions}
        else:
            user_pages[page_id]["page_revisions"].extend(page_revisions)
    return stored


print("Harvesting data from "+str(len(new_users))+" users")
total_rev_counts=0
dumpcount=1
# Start from an empty dictionary so the timestamp records made in step 2 do
# not end up in the dump files.
user_dict={}
for position, user in enumerate(new_users):
    if position % 1000 == 0 and position != 0:
        print("The script has harvested "+str(total_rev_counts)+" revisions from "+str(position)+" users.")

        json_dump_name=dump_path+"json_dump_"+str(dumpcount)+".json"

        with open(json_dump_name, 'w') as outfile:
            json.dump(user_dict, outfile)
        dumpcount=dumpcount+1
        user_dict={}
    if user.lower() in blacklisted_users:
        continue

    user_count=0
    user_dict[user]={}
    PARAMS = {
        "action": "query",
        "list": "allrevisions",
        "arvlimit": "500",
        "arvprop": "user|ids|comment|timestamp|size",
        "arvdir": "newer",
        "arvstart": start_date+"T00:00:00Z",
        "arvend": end_date+"T00:00:00Z",
        "arvuser": user,
        "formatversion": "2",
        "format": "json"
    }

    # One request per batch; stop when the API has nothing more to return
    # or when the per-user revision limit has been reached.
    while True:
        DATA = _fetch_json(PARAMS)
        if "error" in DATA:
            # An error reply has no 'query' element to read.
            print("The API reported an error for user '"+user+"': "+str(DATA["error"])+". Skipping...")
            break
        harvested = _store_pages(user_dict[user], DATA.get("query", {}).get("allrevisions", []))
        user_count = user_count + harvested
        total_rev_counts = total_rev_counts + harvested
        if "continue" not in DATA or user_count >= revision_cap:
            break
        # Feed every continuation value back into the next request.
        PARAMS.update(DATA["continue"])

json_dump_name=dump_path+"json_dump_"+str(dumpcount)+".json"

with open(json_dump_name, 'w') as outfile:
    json.dump(user_dict, outfile)

print("You have harvested "+str(total_rev_counts)+" revisions. ")

print("The script is done! Go to step 5 to export networks")

## Step 5: Build bipartite network

In [None]:
# Build the bipartite user-page network from the JSON dump files written in
# step 4 and save it as a GEXF file.
import json
import os

import networkx as nx


def _as_folder(raw_name):
    """Return *raw_name* as a folder path ending in '/', or '' when blank."""
    cleaned = raw_name.strip()
    return cleaned+"/" if cleaned else ""


def _ask_weight(default):
    """Read an edge weight bound; blank or invalid input returns *default*."""
    answer = input().strip()
    if not answer:
        return default
    try:
        return int(answer)
    except ValueError:
        print("'"+answer+"' is not a whole number. No filter is applied for this bound.")
        return default


if "dump_path" in locals() and os.path.exists(dump_path):
    print("The script has identified this folder containing json files: "+dump_path)

    print("If it is not correct, please enter the correct foldername below. If it is correct, leave it blank")
    hg=input()
    if hg.strip():
        dump_path=_as_folder(hg)
else:
    print("The script could not identify the folder containing the json files from step 4 ")

    print("Please enter the correct foldername below")
    hg=input()
    dump_path=_as_folder(hg)

# Keep asking until the folder really exists; a blank answer is not accepted.
while not os.path.isdir(dump_path):
    print("The folder could not be found. Please enter the correct foldername below")
    hg=input()
    dump_path=_as_folder(hg)

# Without an edge filter every edge is kept.
edge_filter_low=0
edge_filter_high=1000000000
print("Do you wan't to introduce an edge filter (y/n)?")
filter_c=input()
if filter_c.strip().lower()=="y":
    print("Enter the minimum weight between a user and a page (leave blank for no filter): ")
    edge_filter_low=_ask_weight(edge_filter_low)
    print("Enter the maximum weight between a user and a page (leave blank for no filter): ")
    edge_filter_high=_ask_weight(edge_filter_high)

print("Do you wan't to filter out Wikipedia About pages (y/n)? (e.g. https://en.wikipedia.org/wiki/Wikipedia:Reliable_sources/Noticeboard)")
filter_about=input()
skip_about_pages = filter_about.strip().lower()=="y"
print("What would you like to call the network?")
name=input()
if name:
    name=name+"_"
else:
    name=""

G = nx.Graph()
users_=[]
print("Generating network...")
filenames=os.listdir(dump_path)
for filename in filenames:
    if not filename.endswith(".json"):
        continue
    path_to_json=dump_path+filename
    with open(path_to_json) as jsonfile:
        user_dict = json.load(jsonfile)

    users_.extend(user_dict)
    for user, user_pages in user_dict.items():
        if not isinstance(user_pages, dict):
            continue
        for page, page_info in user_pages.items():
            # A dump may contain entries that are not page records (for
            # example a 'timestamps' list left over from step 2); skip them.
            if not isinstance(page_info, dict) or "page_revisions" not in page_info:
                continue
            page_title=page_info["page_title"]
            if skip_about_pages and "wikipedia:" in page_title.lower():
                continue
            weight=len(page_info["page_revisions"])
            # The minimum and maximum weights themselves are kept.
            if weight<edge_filter_low or weight>edge_filter_high:
                continue
            # NOTE(review): user names and page ids share one node namespace;
            # a user whose name equals a page id would be merged with that
            # page node.
            G.add_edge(user, page, weight=weight)
            G.nodes[page]["is_page_in_cat"]=page_info["is_page_in_cat"]
            G.nodes[page]["page_title"]=page_title
            G.nodes[page]["label"]=page_title
            G.nodes[page]["type"]="page"
            G.nodes[user]["type"]="user"
            G.nodes[user]["label"]=user
            G.nodes[user]["is_page_in_cat"]="N/A"


network_path=name+'user2page_revision_bipartite.gexf'
nx.write_gexf(G, network_path)
print('Bipartite network saved. You can find the network by following this path: '+os.path.abspath(network_path))