## Imports and configuration

In [1]:
import os, re, requests, threading, time, tldextract
import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
import wikipedia as wp
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from scipy import stats
from tqdm import tqdm

# Importing our custom functions
from wiki_workers import *

# Configuring matplotlib to output opaque images to avoid issues with dark mode
import matplotlib as mpl
mpl.rcParams["savefig.facecolor"] = "w"

# We do only read operations, therefore no user config is necessary.
# Normally the system crashes when there is no user config unless we tell it otherwise with this environment variable.
#   0 is default
#   1 means ignore the config
#   2 means ignore the config and don't throw warnings
os.environ["PYWIKIBOT_NO_USER_CONFIG"] = "2"

# Now we can import pywikibot
import pywikibot as pwb
import pywikibot.data.api as api

# Then we can setup references for Wikipedia and Wikidata
wiki_site = pwb.Site(code="en", fam="wikipedia")
data_site = pwb.Site(code="wikidata", fam="wikidata")
repo = data_site.data_repository()

# Setting paths
im_path = "graphs"

In [2]:
# We list here the search terms for EPFL
epfl_alts = [
    "EPFL",
    "École Polytechnique Fédérale de Lausanne",
    "Swiss Federal Institute of Technology in Lausanne",
    "EPF Lausanne",
    "ETH Lausanne",
    "Poly Lausanne",
]

# And here are some other universities so we can compare data
unil_alts = [
    "Unil",
    "Université de Lausanne",
    "Uni Lausanne",
    "University of Lausanne",
    "Lausanne University",
]

# Search terms for ETH Zurich. The French name must be the Zurich one: the
# Lausanne name belongs to EPFL and would pull EPFL pages into the ETHZ set.
ethz_alts = [
    "ETHZ",
    "EPFZ",
    "Eidgenössische Technische Hochschule Zürich",
    "ETH Zurich",
    "Swiss Federal Institute of Technology in Zurich",
    "École Polytechnique Fédérale de Zurich",
    "Poly Zurich",
]

mit_alts = [
    "MIT",
    "Massachusetts Institute of Technology",
    "Boston Tech",
]

Let's generate a list of all the pages that mention EPFL. We set `namespace=0` as this namespace is the one that contains regular pages. Note that searches from Pywikibot are ordered alphabetically by default, unlike what the Wikipedia API does.

In [None]:
epfl_pages = collect_pages(wiki_site, epfl_alts)

It is important to note that Wikipedia only started storing page view statistics on July 1st, 2015. This means we will not have any data available before that date.

In [None]:
savePages("EPFL", epfl_pages)

In [3]:
epfl_pages = loadPages("EPFL")

In [None]:
[x for x in list(tableau[1])]

In [14]:
import sys
[sys.getsizeof(x) for x in tableau[0]]

[]

In [17]:
done = []
pulled_data = []
left = list(epfl_pages.items())

In [18]:
i = 0
print("Start")

try:
    while left:
        key, item = left.pop()
        pulled_data.append(item.revisions(content=True))
        done.append(key)
        i += 1
        print("Processed", i, end="\r")
except KeyboardInterrupt:
    print('interrupted!')

Start
Processed 11

Sleeping for 5.0 seconds, 2021-03-06 20:56:52


Processed 46



Processed 54



Processed 66



Processed 104



Processed 122



Processed 130

Sleeping for 5.0 seconds, 2021-03-06 22:58:26


Processed 132

Sleeping for 5.0 seconds, 2021-03-06 23:01:58


interrupted!3


In [8]:
tableau = []
for c, p in tqdm(epfl_pages.items()):
    tableau.append(p.revisions(content=True))

  0%|                                                                              | 3/5083 [01:33<44:00:30, 31.19s/it]


KeyboardInterrupt: 

In [None]:
epfl_pages_sample = dict([(key, epfl_pages[key]) for key in epfl_pages][40:60])

In [None]:
epfl_pages_sample = collect_pages(wiki_site, epfl_alts, limit=20)

In [None]:
%load_ext autoreload
%autoreload 2
from wiki_workers import *

In [None]:
for b in batches(list(epfl_pages_sample.items()), 2):
    x = b

In [None]:
uts = updateTimeSeries(epfl_pages_sample, epfl_alts, batch_size=20)

In [None]:
uts

In [None]:
print(len(uts))
uts[0]

In [None]:
list(epfl_pages_sample.items())[1][1].getOldVersion()

In [None]:
for _, page in epfl_pages_sample.items():
    r = list(page.revisions(reverse=False, content=False))

    name = "test"
    fn = name + ".pkl"
    path = fn

    with open(path, "wb") as f:
        pkl.dump(r, f)

    print(os.path.getsize(path))
    print(page.revision_count())

In [None]:
list(epfl_pages_sample.items())[1][1].getOldVersion(1006458075)

In [None]:
list(list(epfl_pages_sample.items())[1][1].revisions(content=False))

In [None]:
temppage = list(epfl_pages_sample.items())[2][1]
temppage

In [None]:
temprevs = list(temppage.revisions(content=False))

In [None]:
tempseries = pd.Series({r["timestamp"] : r["revid"] for r in temprevs}).groupby(pd.Grouper(freq="1M")).nth(-1)
tempseries

In [None]:
# Useful to get extra data but not for initial sweep
start = time.time()
[temppage.getOldVersion(x) for x in tempseries]
end = time.time()

print(end-start)

start = time.time()
temppage.revisions(content=True)
end = time.time()
print(end-start)

## Accessing a page

Let's look at the different ways we can refer to a given page. We will be using Martin Vetterli's page for our examples.

In [None]:
# We can get a page by name
page = pwb.Page(wiki_site, u"Martin Vetterli")
page

In [None]:
# We can get its data reference
item = page.data_item()
item

In [None]:
# We can get the reference directly
item = pwb.ItemPage(repo, "Q6776811")
item

In [None]:
# And we can get all the pages linked to this reference through WikiData
for k, v in dict(item.sitelinks).items():
    print(k + "\n\t" + v.ns_title())

## Page Views

In [None]:
req = api.Request(site=data_site, parameters={'action': 'query',
                                                'titles': item,
                                                'prop': 'pageviews',
                                                'pvipdays': 1000})

print("As the warning says, the default API request gives us a maximum of", len(req.submit()['query']['pages'][str(item.pageid)]['pageviews']), "days of data. This is insufficient.")

In [None]:
# Set dates
pp_first = datetime.strptime(stime, "%Y%m%d00").strftime("%B %d, %Y").replace(" 0", " ")
pp_today = datetime.today().strftime("%B %d, %Y").replace(" 0", " ")

# Request data
page_name = "Martin_Vetterli"
r = requests.get(pv_url % (page_name, stime, etime), headers=pv_head)
print("Digging deeper into the API, we see we have access to", len(r.json()['items']), "days of pageview data since", pp_first, "as of", pp_today + ".")

## Mentions of EPFL in a page

In [None]:
def pagecounts(page, strings):
    """Count the occurrences of each search string in the page's current text.

    Args:
        page: object exposing a ``text`` attribute (string).
        strings: iterable of substrings to count.

    Returns:
        list of int, one count per entry of ``strings``, in the same order.
    """
    def occurrences(term):
        # Non-overlapping, case-sensitive substring count.
        return page.text.count(term)

    return list(map(occurrences, strings))

In [None]:
pcs = pagecounts(page, epfl_alts)
dict(zip(epfl_alts, pcs))

## Mentions over time

For `getPageChanges`, we recover all revisions at once, as it is significantly faster to do that than to call `page.getOldVersion` repeatedly.

It is important to note that our process is simplified to improve performance by minimizing the number of revisions requested; if a page had 5 mentions in 2008, 5 in 2010, but 4 in 2009, the 4 will be glossed over as we assume the amount of such cases will be rare and insignificant.

Getting the revisions without the content is approximately 7 times faster on large pages (35 vs. 5 seconds), while getting the text from the revisions or from `getOldVersion` takes the same amount of time. However we are still getting faster results for revisions with content when it is part of our functions.

In [None]:
page.revision_count()

start = time.time()
list(page.revisions())
end = time.time()

print(end-start)

start = time.time()
[x for x in page.revisions()]
end = time.time()

print(end-start)

In [None]:
def getMentionCounts(rev, strings):
    """Placeholder: not implemented yet, always returns None."""
    return None

In [None]:
def getCounts(revs, strings, idx):
    """Return the total number of occurrences of all `strings` in revision `idx`.

    Args:
        revs: indexable sequence of revision objects exposing a ``text`` attribute.
        strings: iterable of substrings to count.
        idx: position of the revision in ``revs``.
    """
    # A revision without retrievable text is counted as empty
    # (assumes ``text`` may be None in that case -- TODO confirm).
    text = revs[idx].text or ""
    return sum(text.count(s) for s in strings)


def getOrUpdate(revs, strings, counts, idx, changes):
    """Return the mention count of revision `idx`, computing and caching it if needed.

    Args:
        revs: revisions ordered oldest first.
        strings: search strings.
        counts: cache ``{index: count}``, updated in place.
        idx: index of the revision to look up.
        changes: ``{count: earliest index seen with that count}``, updated in place.
    """
    if idx not in counts:
        temp = getCounts(revs, strings, idx)
        counts[idx] = temp

        # Do not consider the count if an earlier revision had more
        if not any(cnt > temp for k, cnt in counts.items() if k < idx):
            # Use an explicit default: index 0 is falsy, so `get(...) or idx`
            # would wrongly replace a stored 0 with a later index.
            changes[temp] = min(changes.get(temp, idx), idx)

    return counts[idx]


def getMentions(revs_flip, strings, code):
    """Build a monthly series of mention counts from a page's revision history.

    The history is bisected: a range is only explored further when the counts
    at its two ends differ, which limits the number of revisions inspected.

    Args:
        revs_flip: revisions ordered newest first; each exposes ``.text`` and
            supports ``["timestamp"]`` returning a datetime.
        strings: search strings whose occurrences are summed.
        code: page code (currently unused, kept for a uniform signature).

    Returns:
        pandas Series named "Mentions", indexed by month end, holding the last
        recorded count of each month that has one; or None when the history is
        empty or the newest revision contains no mention.
    """
    if len(revs_flip) == 0 or not getCounts(revs_flip, strings, 0):
        return None

    # Reversing revisions (oldest first)
    revs = revs_flip[::-1]

    # Start with whole scope
    queue = [(0, len(revs) - 1)]

    # To avoid double checking revisions we store the counts here
    cnts = {}

    # And here we store the count-index pairs
    changes = {}

    while queue:
        # Process first element
        r0, r1 = queue.pop(0)

        # Always count the lower bound so that a single-revision history
        # still produces one data point.
        v0 = getOrUpdate(revs, strings, cnts, r0, changes)
        if r0 == r1:
            continue

        v1 = getOrUpdate(revs, strings, cnts, r1, changes)

        # Only proceed if there is a change of count in the current scope
        if v0 != v1 and abs(r1 - r0) > 1:
            mid = (r0 + r1) // 2
            queue.extend([(r0, mid), (mid, r1)])

    # Map each recorded count to the (plain datetime) timestamp of its revision
    stamped = {}
    for count, idx in changes.items():
        ts = revs[idx]["timestamp"]
        stamped[datetime.combine(ts.date(), ts.time())] = count

    # Here we simplify our data to a maximum of one point per month (we take the last one).
    # An offset object is used instead of the deprecated "1M" alias, and
    # last()/dropna() keeps month-end labels regardless of the pandas version.
    monthly = (
        pd.Series(stamped, name="Mentions")
        .sort_index()
        .groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))
        .last()
        .dropna()
        .astype("int64")
    )
    return monthly

In [None]:
pms = getMentions(page, epfl_alts)
pms

In [None]:
pms.plot(linestyle='--', marker='o')

## TEMPORARY, NEED TO ABSTRACT THE ARRAY CREATION PROCESS 

In [None]:
path = os.path.join("pickles", "en_mentions.pkl")
prev = pd.read_pickle(path)
prev

In [None]:
page.title()

In [None]:
d = {1:2,3:4}

for a, b in d.items():
    print(a, b)

In [None]:
revs = list(page.revisions(reverse=False, content=True))

In [None]:
def subGetRevisions(page, ret, minDate=None):
    """Thread target: fetch all revisions of `page` (with content) and append the list to `ret`."""
    # TODO: minDate is accepted but not used yet
    history = list(page.revisions(reverse=False, content=True))
    ret.append(history)
    print("Revs")


def subGetMentions(revs, code, strings, ret):
    """Thread target: append the result of getMentions(revs, strings, code) to `ret`."""
    mentions = getMentions(revs, strings, code)
    ret.append(mentions)
    print("Mentions")


def subGetSizes(revs, code, ret):
    """Thread target: append the result of getSizes(revs, code) to `ret`."""
    sizes = getSizes(revs, code)
    ret.append(sizes)
    print("Sizes")


def subGetEdits(revs, code, ret):
    """Thread target: append the result of getEdits(revs, code) to `ret`."""
    edits = getEdits(revs, code)
    ret.append(edits)
    print("Edits")


def subGetViews(page, code, ret):
    """Thread target: append the result of getViews(page, code) to `ret`."""
    views = getViews(page, code)
    ret.append(views)
    print("Views")
    
def getSizes(revs, code):
    """Return the page size at the end of each month as a Series named `code`.

    Args:
        revs: iterable of revisions convertible with ``dict(r)``, each providing
            a datetime ``"timestamp"`` and a ``"size"`` value.
        code: name given to the returned Series.

    Returns:
        pandas Series of sizes with one entry per month between the first and
        last revision. Months without revisions carry the previous size
        forward. Each value is labelled with the 1st of the following month.
    """
    df = pd.DataFrame([dict(r) for r in revs]).set_index("timestamp")

    # Sample every month and shift by 1 day to get 1st of month.
    # resample(...).last().ffill() replaces groupby/nth + Resampler.pad()
    # (pad() no longer exists in pandas 2.x), and the offset object avoids
    # the deprecated "1M" alias.
    se = df["size"].resample(pd.offsets.MonthEnd()).last().ffill()
    se.index = se.index.shift(1, freq="D")

    return se.rename(code)

def getEdits(revs, code):
    """Return the number of revisions per month as a Series named `code`.

    Args:
        revs: iterable of revisions convertible with ``dict(r)``, each providing
            a datetime ``"timestamp"``.
        code: name given to the returned Series.

    Returns:
        pandas Series of monthly edit counts (0 for months without edits
        between the first and last revision), labelled with the 1st day of
        the month the edits were made in.
    """
    stamps = pd.DataFrame([dict(r) for r in revs]).set_index("timestamp").index

    # Count one unit per revision instead of overwriting the real "size" column.
    month_end = pd.offsets.MonthEnd()
    se = pd.Series(1, index=stamps).resample(month_end).count()

    # Month-end label -> previous month end -> +1 day = first day of the month.
    # Offset objects are used because the "M" alias is deprecated.
    se.index = se.index.shift(-1, freq=month_end).shift(1, freq="D")

    return se.rename(code)

# Need to make it wiki-independent
def getViews(page, code):
    """Download the pageviews of `page` and return them as a Series named `code`.

    The request URL is built from ``pv_url``, ``stime`` and ``etime`` and sent
    with the ``pv_head`` headers; these names are not defined in this notebook
    (presumably provided by ``wiki_workers`` -- confirm).

    Returns:
        pandas Series of view counts indexed by day.
    """
    response = requests.get(pv_url % (page.title(), stime, etime), headers=pv_head)

    daily = {}
    for item in response.json()['items']:
        # The timestamp's last two characters are dropped before parsing as a date
        day = datetime.strptime(str(item['timestamp'])[:-2], "%Y%m%d")
        daily[day] = item['views']

    return pd.Series(daily, name="Views").rename(code)

In [None]:
testcol = pd.read_pickle(os.path.join("pickles", "en_mentions.pkl"))["Q50785019"]
testcol

In [None]:
ts1 = testcol.last_valid_index()

In [None]:
ret = []
subGetRevisions(page, ret)

In [None]:
len(ret[0])

In [None]:
revs = ret[0]

In [None]:
ret = []
subGetMentions(revs, "blah", epfl_alts, ret)

In [None]:
ret[0]

In [None]:
ret = []
subGetSizes(revs, "blah", ret)

In [None]:
ret[0]

In [None]:
ret = []
subGetEdits(revs, "blah", ret)

In [None]:
ret[0]

In [None]:
ret = []
subGetViews(page, "blah", ret)

In [None]:
ret[0]

In [None]:
ret = []
t_pszs = threading.Thread(target=subGetSizes, args=(revs, "blah", ret,))
t_pszs.start()
t_pszs.join()
print("Done")

In [None]:
ret

In [None]:
ts2 = ret[0][0]["timestamp"]

In [None]:
print(ts1)
print(ts2)
print(ts1 - ts2)

In [None]:
uts = updateTimeSeries(epfl_pages_sample, epfl_alts)

In [None]:
pd.concat(uts[0], axis=1)

In [None]:
def updateTimeSeries(pages, strings, rescan=False, flush=False):
    """Collect the mention, size, edit and view series of every page.

    The work is pipelined: while the revisions of the current page are being
    downloaded in a background thread, the four metrics of the previously
    downloaded page are computed in parallel threads.

    Args:
        pages: mapping ``{code: page}``; iterated with ``.items()``.
        strings: search strings forwarded to ``subGetMentions``.
        rescan: accepted for interface compatibility, not used yet.
        flush: accepted for interface compatibility, not used yet.

    Returns:
        list of four lists ``[mentions, sizes, edits, views]``; each inner list
        receives one entry per page whose metric could be computed.
    """
    # TODO: load previously pickled data (pickles/en_mentions.pkl), only fetch
    # revisions newer than what is stored, merge old and new data per page
    # (monthly resampling + combine_first), and save the result. `rescan`
    # should re-process stored pages and `flush` should ignore the stored file.

    dfs = [[] for _ in range(4)]
    fail = []

    def _process(code, page, revs):
        """Compute the four metrics of one page in parallel and store them."""
        results = [[] for _ in range(4)]
        workers = [
            threading.Thread(target=subGetMentions, args=(revs, code, strings, results[0],)),
            threading.Thread(target=subGetSizes, args=(revs, code, results[1],)),
            threading.Thread(target=subGetEdits, args=(revs, code, results[2],)),
            threading.Thread(target=subGetViews, args=(page, code, results[3],)),
        ]

        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # A worker that raised leaves its result list empty
        for df_set, col in zip(dfs, results):
            if col:
                df_set.append(col[0])
            else:
                fail.append(code)

    # Page whose revisions are downloaded but whose metrics are not computed yet
    pending = None

    for curr_code, curr_page in tqdm(pages.items()):
        curr_revs = []

        t_revs = threading.Thread(target=subGetRevisions, args=(curr_page, curr_revs,))
        t_revs.start()

        # Overlap the download above with the processing of the previous page
        if pending is not None:
            _process(*pending)

        t_revs.join()

        if curr_revs:
            pending = (curr_code, curr_page, curr_revs[0])
        else:
            # The download thread failed; nothing to process for this page
            fail.append(curr_code)
            pending = None

    # The last downloaded page still has to be processed
    if pending is not None:
        _process(*pending)

    if fail:
        print("Failed to compute some metrics for:", sorted(set(map(str, fail))))

    return dfs

In [None]:
epfl_pages[:10]
pcodes = [p.data_item().title() for p in epfl_pages[:1000]]

In [None]:
bulk_mentions = updateMentions(pcodes[500:1000])

In [None]:
bulk_mentions

In [None]:
total_mtns = bulk_mentions.sum(axis=1)
total_mtns.index.name = "Date"
total_mtns.name = "Total mentions"
total_mtns

In [None]:
ax = total_mtns.plot(legend=True, title="Mentions of EPFL on Wikipedia")
ax.set_xlabel('Time')
ax.set_ylabel('Mentions')
ax.figure.set_size_inches((9, 6))
    
ax.figure.savefig(os.path.join(im_path, "mentions.png"))

## NEED TO MAKE THE PCODES FASTER

Par exemple en mémorisant les codes des pages  
Stocker les bulk en sparse matrix avec des valeurs uniquement la ou ca change et pour generer le graphe on fait juste un pulldown  
Sauver régulièrement, mettre un intervalle par défaut  
PARALLELIZE  
Si la reference existe deja, aller avec des previous revisions au lieu d'appeler revisions (faire un test pour voir la différence de performance), on assume que les collectes de données se font régulièrement

## Views of a page over time

Here we compute the pageviews for each day and highlight the outliers.
In the future we will compute outliers based on the local average.

In [None]:
def pageviews(page, stime, etime):
    """Download the pageviews of `page` between `stime` and `etime`.

    The request URL is built from ``pv_url``, which is not defined in this
    notebook (presumably provided by ``wiki_workers`` -- confirm).

    Returns:
        pandas Series named "Views" with view counts indexed by day.
    """
    response = requests.get(pv_url % (page.title(), stime, etime))

    daily = {}
    for item in response.json()['items']:
        # The timestamp's last two characters are dropped before parsing as a date
        day = datetime.strptime(str(item['timestamp'])[:-2], "%Y%m%d")
        daily[day] = item['views']

    return pd.Series(daily, name="Views")

def outliers(serie):
    """Return the entries of `serie` whose absolute z-score exceeds 3, as a Series named "Outliers"."""
    z_scores = stats.zscore(serie)
    is_outlier = np.abs(z_scores) > 3
    return pd.Series(serie[is_outlier], name="Outliers")

In [None]:
pvs = pageviews(page, stime, etime)
pvs

In [None]:
ols = outliers(pvs)
ols

In [None]:
ax = pvs.plot(legend=True, title="Daily views for 'Martin Vetterli' on Wikipedia")
ax = ols.plot(legend=True, ax=ax, linestyle="", marker="o")
ax.set_xlabel('Time')
ax.set_ylabel('Views')
ax.figure.set_size_inches((9, 6))

for i, each in enumerate(ols.index):
    y = ols[each]
    ax.text(each + timedelta(25), y, y)
    
ax.figure.savefig(os.path.join(im_path, "views.png"))

With that out of the way, let's generate a list of all the pages that mention EPFL. We set `namespace=0` as this namespace is the one that contains regular pages. Note that searches from Pywikibot are ordered alphabetically by default, unlike what the Wikipedia API does.

In [None]:
pvs2 = pageviews(page, stime, etime)
to_rem = ols.index
pvs2[to_rem] = 0

In [None]:
# Would be good to cover the whole dataset

v = 100
ax = pvs2.plot(legend=True)
(pvs2.rolling(v, center=True).sum() / v).plot(ax=ax, legend=True, label="100-day average", title="Daily views for 'Martin Vetterli' on Wikipedia, 100-day average without outliers")
ax.set_xlabel('Time')
ax.set_ylabel('Views')
ax.figure.set_size_inches((9, 6))
    
ax.figure.savefig(os.path.join(im_path, "views__avg.png"))

## Backlinks

So far I haven't found an efficient way to account for backlinks in page revisions. Therefore this will be skipped for now.

## Edits

In [None]:
temp = list(page.revisions(reverse=True))

In [None]:
def getPageSize(page):
    """Return the size of `page` at the end of each month.

    Args:
        page: object providing ``revisions(reverse=True)`` (revisions
            convertible with ``dict(r)``, holding ``"timestamp"`` and
            ``"size"``) and ``data_item().title()``.

    Returns:
        pandas Series named after ``page.data_item().title()``, with one entry
        per month between the first and last revision. Months without
        revisions carry the previous size forward. Each value is labelled
        with the 1st of the following month.
    """
    revs = list(page.revisions(reverse=True))
    df = pd.DataFrame([dict(r) for r in revs]).set_index("timestamp")

    # Sample every month and shift by 1 day to get 1st of month.
    # resample(...).last().ffill() replaces groupby/nth + Resampler.pad()
    # (pad() no longer exists in pandas 2.x), and the offset object avoids
    # the deprecated "1M" alias.
    sizes = df["size"].resample(pd.offsets.MonthEnd()).last().ffill()
    sizes.index = sizes.index.shift(1, freq="D")

    return sizes.rename(page.data_item().title())

In [None]:
psz = getPageSize(page)
psz

In [None]:
ax = psz.plot()
ax.set_title("Size of Martin Vetterli's page")
ax.set_xlabel("Time")
ax.set_ylabel("Size in Bytes")
ax.figure.savefig(os.path.join(im_path, "mirko.png"))

In [None]:
def getPageScore(page):
    """Combine views, size and mentions of `page` into a daily score table.

    Returns:
        DataFrame indexed like the pageview series with the columns "Views",
        "Size", "Mentions" and "Score" (= Mentions * Views / Size), or None
        when ``getMentions`` reports no mentions.
    """
    # Get mentions. getMentions works on the revision list (newest first, with
    # content) and takes the page code as third argument.
    revs = list(page.revisions(content=True))
    pms = getMentions(revs, epfl_alts, page.title())

    if pms is None:
        return None

    # Get page views
    pvs = pageviews(page, stime, etime)

    # Get page size
    psz = getPageSize(page)

    def _align(series, name):
        """Carry the latest known value onto every pageview date (0 before any data)."""
        aligned = series.sort_index().reindex(pvs.index, method="ffill")
        return aligned.fillna(0).rename(name)

    # Combine the data. The size series is named after the page's Wikidata
    # code, so it is renamed explicitly to the column name used below.
    # Forward-fill reindexing also picks up the last value recorded before
    # the first pageview date.
    df = pd.concat([pvs, _align(psz, "Size"), _align(pms, "Mentions")], axis=1)

    # Generate score (a size of 0 yields inf/NaN, as plain pandas division does)
    df["Score"] = df["Mentions"] * df["Views"] / df["Size"]

    return df

In [None]:
psc = getPageScore(page)
psc

In [None]:
psc.plot()

## Editors

We will consider a user that changed their name as a new user for simplicity's sake.  
We do not use the function `page.contributors()` as it makes no distinction between regular and minor edits, and since we're going through the revisions we might as well extract that information in the process.

This can be improved by weighing the edits depending on the size increase of the page.

In [None]:
revs = list(page.revisions(reverse=True, content=True))

In [None]:
# Regular expressions to detect IP addresses and Bots.
# Raw strings avoid invalid-escape warnings, and both alternatives are anchored
# so that a name merely starting with an address is not classified as an IP.
pat_ip = re.compile(
    r'^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    r'|(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4})$'
)
# Heuristic: the name contains a word ending in "bot" (case-insensitive)
pat_bot = re.compile(r'bot\b', re.IGNORECASE)

# Classify a user by its name into bots, real users, and IPs (unregistered users)
def classify_user(name):
    """Return "IP", "Bot" or "Real" depending on what the user name looks like."""
    if pat_ip.match(name):
        return "IP"
    elif pat_bot.search(name):
        return "Bot"
    else:
        return "Real"

# Get user edit data from a page
def users(revs):
    """Count major and minor edits per user.

    Args:
        revs: iterable of revisions supporting ``r["user"]`` and ``r["minor"]``.

    Returns:
        DataFrame with the columns "Major" and "Minor" (edit counts), indexed
        by a two-level MultiIndex (user type from ``classify_user``, user name)
        and sorted by that index.
    """
    # Get usernames and edit type
    df = pd.DataFrame(
        [(r["user"], bool(r["minor"])) for r in revs],
        columns=["user", "minor"],
    )

    if df.empty:
        return pd.DataFrame(
            columns=["Major", "Minor"],
            index=pd.MultiIndex.from_arrays([[], []]),
        )

    # Group by user and edit type and split into columns
    counts = df.groupby(["user", "minor"]).size().unstack("minor", fill_value=0)

    # Make sure both columns exist even when every edit is of the same type,
    # otherwise renaming to two column names fails.
    counts = counts.reindex(columns=[False, True], fill_value=0)
    counts.columns = ["Major", "Minor"]

    # Add user types as first index level
    counts.index = pd.MultiIndex.from_tuples(
        [(classify_user(name), name) for name in counts.index]
    )

    return counts.sort_index(axis=0)

In [None]:
users(revs)

In [None]:
ueds = users(revs)
ueds["Edits"] = ueds["Minor"] + ueds["Major"]

ax = sns.swarmplot(x=ueds.index.get_level_values(0), y="Edits", data=ueds)
ax.set_title("Editors for 'Martin Vetterli' on Wikipedia")
ax.figure.set_size_inches((9, 6))
    
ax.figure.savefig(os.path.join(im_path, "edits.png"))

In [None]:
def mass_users(pages):
    """Build a table of total edits per user (rows) and page (columns).

    Args:
        pages: iterable of page objects providing ``revisions(...)`` and ``title()``.

    Returns:
        DataFrame obtained by concatenating, column-wise, one total-edits
        Series per page (named after the page title).
    """
    # We'll want to get the bulk of the data out in the future

    def _total_edits(p):
        history = list(p.revisions(reverse=True, content=False))
        udata = users(history)
        udata[p.title()] = udata["Minor"] + udata["Major"]
        return udata[p.title()]

    return pd.concat([_total_edits(p) for p in pages], axis=1)

In [None]:
udata = mass_users(epfl_pages[:100])
udata

In [None]:
usr_sum = udata.sum(axis=1)
usr_sum

In [None]:
usr_cnt = udata.count(axis=1)
usr_cnt

In [None]:
fig, ax = plt.subplots()
ax.set_title("Scatterplot of user edits")
ax.set_xlabel("Number of edits")
ax.set_ylabel("Number of pages")
x_vals = np.linspace(0, 100)
ax.plot(x_vals, x_vals)
ax.annotate("y = x", (120, 95))
fig.set_size_inches((9, 6))
artists = []
plot_names = []
for (ns, s), (nc, c) in zip(usr_sum.groupby(level=0), usr_cnt.groupby(level=0)):
    artists.append(plt.scatter(s, c))
    plot_names.append(ns)
ax.legend(artists, plot_names)
ax.figure.savefig(os.path.join(im_path, "users.png"))

In [None]:
for (ns, s), (nc, c) in zip(usr_sum.groupby(level=0), usr_cnt.groupby(level=0)):
    print(s)

# WORK IN PROGRESS

We need to keep a record of which pages actually contain mentions and which do not, so that we can refer to it later.

In [None]:
def saveData(page):
    """Compute the score table of `page` and pickle it under its data item title.

    Returns:
        ``(idx, table)`` where ``idx`` is ``page.data_item().title()``, or
        ``(None, None)`` when ``getPageScore`` returns None.
    """
    score = getPageScore(page)

    if score is not None:
        idx = page.data_item().title()
        score.to_pickle(os.path.join("pickles", idx))
        return idx, score

    return None, None

def loadData(idx):
    """Load the object pickled under the name `idx` in the "pickles" directory."""
    target = os.path.join("pickles", idx)
    return pd.read_pickle(target)

In [None]:
'''

fns = os.listdir("pickles")
for p in epfl_pages:
    try:
        if p.data_item().title() not in fns:
            saveData(p)
            print(p)
    except:
        pass
'''

[[en:Whistleblower]]
WARNING: API warning (result): This result was truncated because it would otherwise be larger than the limit of 12,582,912 bytes.

In [None]:
banane = pwb.Page(wiki_site, u"Fréquence Banane")
banane_s = getPageScore(banane)

In [None]:
banane.data_item().title()

In [None]:
banane_s.to_pickle(banane.data_item().title())

In [None]:
pd.read_pickle(banane.data_item().title())

In [None]:
saveData(banane)

In [None]:
loadData("Q3090425")

In [None]:
a = testdata[0]
b = testdata[1]
pd.concat([a, b], axis=1, keys=["u", "v"]).fillna(0).sum(axis=1, level=1)

In [None]:
fns = [n for n in os.listdir("pickles") if n[0] == "Q"]
testdata = [loadData(fn) for fn in fns]

init = testdata[0]

for td in testdata[1:]:
    init = pd.concat([init, td], axis=1, keys=["l", "r"]).fillna(0).sum(axis=1, level=1)
    
init.plot()

In [None]:
init["Views"].plot()

In [None]:
init["Mentions"].plot()

In [None]:
init["Score"].plot()

## Combining Data

In [None]:
cl = pd.DataFrame(columns=["views", "mentions", "edits"])
cl.loc[495998] = [1, 2, 4]
cl

In [None]:
for i, p in enumerate(epfl_pages[:10]):
    print(i)
    getPageScore(p)

## Editors

In [None]:
for uid, dat in df.groupby("userid"):
    print(uid)
    for val, x in dat.groupby("minor"):
        print(val, sum(x["diff"]))

entry over time
* list of incoming links
* list of outgoing links
* mention history

work with user pages  
potentially use the wikipedia package for smaller operations as it seems to be faster, maybe do some timed tests

Improve the counts by also checking the 5 revisions before and the 5 after, to ignore edit wars and the like.

Will need to handle the problem of redirects and page name changes in the future.

Will need to classify whether we found the subject through keywords or if EPFL was mentioned.

We assume mentions are only added for performance and simplicity reasons. It's very rare that content will be removed from pages.

Link pages with contributors

Update data over time instead of recomputing everything

Bot that changes names of EPFL or makes a suggestion on the talk page

orcid, unique identifier for papers

https://go.epfl.ch/wikiproject

correlate growth of epfl mentions and wikipedia in general (or science related pages, lets see what we can do)

In [None]:
[r["size"] for r in revs][-5:]

In [None]:
talkpage = pwb.Page(wiki_site, u"Talk:Martin Vetterli")
talkpage

In [None]:
list(talkpage.categories())

In [None]:
cats = list(page.categories())

In [None]:
cats[0].categoryinfo

In [None]:
# Function that checks that the category records are correct (if a page has been removed from a category, it will put it in the legacy records instead)
def sanitizeCategories():
    """Placeholder: not implemented yet, does nothing and returns None."""
    return None

In [None]:
epfl_pages[:100]

In [None]:
def extlink_data(page):
    """Count the sites and top-level domains of a page's external links.

    Links containing "web.archive.org" are reduced to what follows the fifth
    "/" (i.e. the archived URL) before being analysed.

    Args:
        page: object providing ``extlinks()`` (iterable of URL strings) and ``title()``.

    Returns:
        tuple ``(title, sites, tlds)`` where ``sites`` counts "domain.suffix"
        values and ``tlds`` counts the last label of each suffix, both as
        pandas ``value_counts`` Series (empty when the page has no links).
    """
    links = [link.split("/", 5)[-1] if "web.archive.org" in link else link for link in page.extlinks()]

    # Read the result fields by name rather than unpacking the result object,
    # and derive the extra columns per link so that a page without links needs
    # no exception handling.
    records = []
    for link in links:
        parts = tldextract.extract(link)
        records.append({
            "subdomain": parts.subdomain,
            "domain": parts.domain,
            "suffix": parts.suffix,
            "tld": parts.suffix.split(".")[-1],
            "site": parts.domain + "." + parts.suffix,
        })

    domains = pd.DataFrame(records, columns=["subdomain", "domain", "suffix", "tld", "site"])

    sites = domains["site"].value_counts()
    tlds = domains["tld"].value_counts()
    return page.title(), sites, tlds

In [None]:
titles, sites, tlds = extlink_data(page)
tlds

In [None]:
sites

In [None]:
temp = epfl_pages[:500]

series = np.array([extlink_data(t) for t in temp], dtype=object).transpose()

In [None]:
pd.concat([pd.DataFrame(s, name=t) for t, s in tuple(series[0:2])])

In [None]:
refs = pd.concat(series[1], axis=1, keys=series[0])
refs

In [None]:
ref_sum = refs.sum(axis=1)
ref_sum[ref_sum > 600]

In [None]:
ref_cnt = refs.count(axis=1)
ref_cnt

In [None]:
ax = plt.scatter(ref_sum, ref_cnt).axes
ax.set_title("Scatterplot of domain repartition and frequency")
ax.set_xlabel("Number of occurences")
ax.set_ylabel("Number of pages")
fig = plt.gcf()
fig.set_size_inches((9, 6))
x_vals = np.linspace(0, 300)
ax.plot(x_vals, x_vals)
ax.annotate("y = x", (320, 280))
for t in ["epfl.ch", "unil.ch", "bbc.co.uk", "google.com", "nytimes.com"]:
    ax.scatter(ref_sum[t], ref_cnt[t], color="orange")
    ax.annotate(t, (ref_sum[t]+50, ref_cnt[t]+5), color="orange", bbox=dict(facecolor='white', boxstyle="round,pad=0.3"))
ax.figure.savefig(os.path.join(im_path, "domains.png"))

In [None]:
refs_tld = pd.concat(series[2], axis=1, keys=series[0])
refs_tld

In [None]:
ref_tld_sum = refs_tld.sum(axis=1)
ref_tld_sum

In [None]:
ref_tld_cnt = refs_tld.count(axis=1)
ref_tld_cnt

In [None]:
ax = plt.scatter(ref_tld_sum, ref_tld_cnt).axes
ax.set_title("Scatterplot of top-level-domain repartition and frequency")
ax.set_xlabel("Number of occurences")
ax.set_ylabel("Number of pages")
fig = plt.gcf()
fig.set_size_inches((9, 6))
x_vals = np.linspace(0, 450)
ax.plot(x_vals, x_vals)
ax.annotate("y = x", (720, 400))
for t in ["ch", "org", "com", "gov"]:
    ax.scatter(ref_tld_sum[t], ref_tld_cnt[t], color="orange")
    ax.annotate(t, (ref_tld_sum[t]+400, ref_tld_cnt[t]+1), color="orange", bbox=dict(facecolor='white', boxstyle="round,pad=0.3"))
ax.figure.savefig(os.path.join(im_path, "tlds.png"))

In [None]:
import sys
temp = pd.concat(series[2], axis=1, keys=series[0])
sys.getsizeof(temp)

In [None]:
temp2 = temp.astype(pd.SparseDtype("int", np.nan))
sys.getsizeof(temp2)

In [None]:
pszs = [getPageSize(p) for p in epfl_pages[:10]]

In [None]:
df = pd.concat(pszs, axis=1)
print(sys.getsizeof(df))

print(sys.getsizeof(df.astype(pd.SparseDtype("int", np.nan))))

# Page sizes

In [None]:
# Need to make page to item and item to page functions
# Heavily bottlenecked by the `revisions` function unfortunately

# Gets given keys of all revisions after a given timestamp
def getRevisionsTags(page, ts, keys, content=False):
    """Yield ``(timestamp, [values])`` for every revision of `page` newer than `ts`.

    Revisions are requested newest first so that iteration can stop at the
    first revision that is not newer than `ts`. (Walking oldest first and
    stopping there would end before yielding anything as soon as one old
    revision exists.) Results are yielded oldest first.

    Args:
        page: object providing ``revisions(reverse=..., content=...)``; with
            ``reverse=False`` it is expected to iterate newest first.
        ts: lower bound (exclusive), or None to return every revision.
        keys: keys to read from each revision.
        content: forwarded to ``page.revisions``.
    """
    fresh = []

    for rev in page.revisions(reverse=False, content=content):
        stamp = rev["timestamp"]

        if ts is not None and stamp <= ts:
            # Everything from here on is older than the limit
            break

        fresh.append((stamp, [rev[key] for key in keys]))

    yield from reversed(fresh)

def updatePageSizes(pagecodes, rescan=False):
    """Update the pickled table of monthly page sizes and return it.

    Args:
        pagecodes: iterable of Wikidata item codes to update (not modified).
        rescan: when True, the codes already present in the stored table are
            updated as well.

    Returns:
        DataFrame with one column per page code and one row per month
        (labelled with the 1st of the following month), forward-filled.
        The same table is written to ``pickles/en_page_sizes.pkl``.
    """
    path = os.path.join("pickles", "en_page_sizes.pkl")

    # Only a missing file means "no data yet"; any other read error is raised
    # rather than silently overwriting the stored table.
    try:
        prev = pd.read_pickle(path)
    except FileNotFoundError:
        prev = pd.DataFrame()

    # Work on a copy so the caller's list is left untouched, and drop duplicates
    codes = list(pagecodes)
    if rescan:
        codes += list(prev.columns)
    codes = list(dict.fromkeys(codes))

    month_end = pd.offsets.MonthEnd()
    updated = {}

    for code in tqdm(codes):
        try:
            # Recover page from code
            p = pwb.Page(wiki_site, pwb.ItemPage(repo, code).sitelinks["enwiki"].ns_title())
        except Exception:
            # Best effort: skip codes that cannot be resolved to an English page
            continue

        # Set limit timestamp (or None if no data yet)
        # NOTE(review): the stored index is a month label, not the timestamp of
        # the last revision seen, so revisions made between the two may be
        # missed -- confirm whether this matters.
        ts = prev[code].last_valid_index() if code in prev.columns else None

        # Get values after that timestamp
        values = [(t, v[0]) for t, v in getRevisionsTags(p, ts, ["size"])]

        if not values:
            continue

        # Sample every month and shift by 1 day to get 1st of month
        se = pd.DataFrame(values, columns=[0, code]).set_index(0)[code]
        se = se.resample(month_end).last().ffill()
        se.index = se.index.shift(1, freq="D")

        # Combine with old data if it exists (Series with Series)
        if code in prev.columns:
            se = se.combine_first(prev[code])

        updated[code] = se

    # Updated columns replace their stored version instead of being duplicated
    untouched = prev.drop(columns=[c for c in updated if c in prev.columns])
    curr = pd.concat([untouched] + list(updated.values()), axis=1)
    curr = curr.ffill(axis=0)

    os.makedirs(os.path.dirname(path), exist_ok=True)
    curr.to_pickle(path)

    return curr

In [None]:
pcodes = [p.data_item().title() for p in epfl_pages[:300]]
pszs = updatePageSizes(pcodes)

In [None]:
pszs = pd.read_pickle("pickles/en_page_sizes.pkl")

In [None]:
pszs

In [None]:
total_size = pszs.sum(axis=1)
total_size.index.name = "Date"
total_size.name = "Select Pages Size"
total_size

In [None]:
total_size.plot()

# Page count

Here we look at how many of our pages exist at a given time

In [None]:
def pageCounts(df):
    """Count, for every row (date), how many columns (pages) have a value.

    Args:
        df: DataFrame with one column per page; missing values mean "no data".

    Returns:
        Series named "Select Pages Count" whose index is named "Date".
        The input frame is left untouched: the index is renamed on a copy
        rather than by assigning ``index.name`` in place.
    """
    return df.count(axis=1).rename("Select Pages Count").rename_axis("Date")

In [None]:
pcnts = pageCounts(pszs)
pcnts

In [None]:
pcnts.plot()

# Page size and page counts ratio

In [None]:
(total_size / pcnts).plot()

In [None]:
def loadWikistats(fn, name, exp=0):
    """Load the "total.total" column of a Wikistats CSV export as a dated Series.

    Args:
        fn: file name inside ``csv/wikistats``.
        name: name given to the returned Series.
        exp: the values are multiplied by ``10**exp``.

    Returns:
        Series named `name`, indexed by the parsed "month" column (index named "Date").
    """
    path = os.path.join("csv", "wikistats", fn)

    with open(path, "rb") as f:
        raw = pd.read_csv(f)

    dates = pd.to_datetime(raw["month"], format="%Y-%m-%dT%H:%M:%S.%fZ").rename("Date")
    totals = raw.set_index(dates)["total.total"].rename(name)

    return totals * 10**exp

In [None]:
wiki_size = loadWikistats("size_change_en.csv", name="English Wikipedia Size", exp=0)
wiki_size = wiki_size[17:].cumsum()
wiki_size

In [None]:
wiki_cnts = loadWikistats("pages_en.csv", name="English Wikipedia Count", exp=0)
wiki_cnts

In [None]:
wiki_ratio = pd.concat([wiki_size, wiki_cnts], axis=1).dropna()
wiki_ratio = wiki_ratio.iloc[:,0] / wiki_ratio.iloc[:,1]
wiki_ratio.name = "English Wikipedia Ratio"
wiki_ratio.plot()
wiki_ratio

In [None]:
pages_ratio = pd.concat([total_size, pcnts], axis=1).dropna()
pages_ratio = pages_ratio.iloc[:,0] / pages_ratio.iloc[:,1]
pages_ratio.name = "Select Pages Ratio"
pages_ratio.plot()
pages_ratio

In [None]:
ax = pd.concat([total_size], axis=1).plot(logy=False, figsize=(9, 6))
ax.set_title("Size in Bytes")
ax.figure.savefig(os.path.join(im_path, "size_comp_lin.png"))

In [None]:
ax = pd.concat([total_size, wiki_size], axis=1).plot(logy=True, figsize=(9, 6))
ax.set_title("Size in Bytes (log scale)")
ax.figure.savefig(os.path.join(im_path, "size_comp.png"))

In [None]:
ax = pd.concat([pcnts, wiki_cnts], axis=1).plot(logy=True, figsize=(9, 6))
ax.set_title("Page counts (log scale)")
ax.figure.savefig(os.path.join(im_path, "count_comp.png"))

In [None]:
ax = pd.concat([pages_ratio, wiki_ratio], axis=1).plot(logy=True, figsize=(9, 6))
ax.set_title("Size / Page Count Ratio (log scale)")
ax.figure.savefig(os.path.join(im_path, "ratio_comp.png"))

# Performance comparison

In [None]:
import time
start = time.time()
page.revisions()
end = time.time()
print(end-start)

In [None]:
print(epfl_pages[100].title())

In [None]:
# Can only run on one page at a time
start = time.time()
revs = next(iter(api.PropertyGenerator('revisions', site=wiki_site, parameters={
    'titles': 'Martin Vetterli',
    'rvprop': 'timestamp|size',
})))['revisions']
end = time.time()
print(end-start)

In [None]:
revs

In [None]:
# Parallelize queries to the API

In [None]:
mention_data = []
for p in epfl_pages[:100]:
    mention_data.append(getMentions(p, epfl_alts))

In [None]:
mention_data