In [1]:
import numpy as np
import pandas as pd

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.sql import *
from pyspark.sql.functions import col, lower, regexp_replace, split, size

### EDA + Processing

In [140]:
# df_path = "../csv_data/top-10-sample-output.csv"
df_path = "../csv_data/sample-output.csv"
df = spark.read.csv(df_path, inferSchema=True, header=True, multiLine=True, escape='"')
pdf = pd.read_csv(df_path, index_col=0)

In [143]:
"""
Columns filtering
    Useful: sha1 (as identifier),  timestamp, title, text
    Questionable: user, comment, ip, id (there are different articles with the same id), parentid, restrictions
    Not useful (no unique info): model, format, ns, contributor, revision, restrictions
""" 

print("All columns:", df.columns)
print("Unique values for..")
for column in ["format", "model", "ns", "contributor", "revision", "restrictions"]:
    print("\t", column, ":", pdf[column].unique())
    
useful_columns = ["sha1", "timestamp", "title", "text"]
print("Useful columns:", useful_columns)

clean_df = df[useful_columns]
clean_pdf = pdf[useful_columns]

All columns: ['_c0', 'comment', 'contributor', 'format', 'id', 'ip', 'model', 'ns', 'parentid', 'restrictions', 'revision', 'sha1', 'text', 'timestamp', 'title', 'username']
Unique values for..
	 format : ['text/x-wiki']
	 model : ['wikitext']
	 ns : [  0   4 100  12]
	 contributor : ['  ' ' ']
	 revision : ['         ' '          ' '        ']
	 restrictions : [nan 'move=:edit=' 'move=sysop' 'edit=autoconfirmed:move=autoconfirmed'
 'sysop' 'edit=sysop:move=sysop']
Useful columns: ['sha1', 'timestamp', 'title', 'text']


In [160]:
clean_df.printSchema()
clean_df.show(10)
print("Size of the DataFrame: {} records".format(clean_df.count()))

root
 |-- sha1: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)

+--------------------+-------------------+--------------------+--------------------+
|                sha1|          timestamp|               title|                text|
+--------------------+-------------------+--------------------+--------------------+
|42l0cvblwtb4nnupx...|2018-08-14 09:47:24| AccessibleComputing|#REDIRECT [[Compu...|
|2d0jpq2oi6jjc6hbb...|2019-06-16 03:28:20|           Anarchism|{{redirect2|Anarc...|
|iv7s0lr40b17x33tf...|2017-06-05 07:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|
|39r4w8qg62iexlysk...|2017-06-05 07:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|
|fncm9bh9l25bmvyzq...|2017-06-05 07:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|
|q8gdi8070w6yitd4h...|2017-06-05 07:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|
|miah0hk4ws6ctake8...|2017-06-05 00:42:11|AfghanistanTransp...|#REDIRECT [[T

In [256]:
"""
Headings counting
Syntaxis:
    ==Level 2==
    ===Level 3===
    ====Level 4====
    =====Level 5=====
    ======Level 6======
"""

def single_head_level_count(text, level):
    assert level in range(2,7)
    pattern = "=" * level
    pattern = pattern + "[a-zA-Z0-9.,!? ]+" + pattern
    return size(split(text, pattern=pattern))-1

def count_headings(df):
    return reduce(
        lambda df, level: df.withColumn("level{}".format(level),
                                        single_head_level_count(col("text"), level)),
        range(2,7), df)
    
df_features = count_headings(clean_df)
df_features.show(20)

+--------------------+-------------------+--------------------+--------------------+------+------+------+------+------+
|                sha1|          timestamp|               title|                text|level2|level3|level4|level5|level6|
+--------------------+-------------------+--------------------+--------------------+------+------+------+------+------+
|42l0cvblwtb4nnupx...|2018-08-14 09:47:24| AccessibleComputing|#REDIRECT [[Compu...|     0|     0|     0|     0|     0|
|2d0jpq2oi6jjc6hbb...|2019-06-16 03:28:20|           Anarchism|{{redirect2|Anarc...|    29|    16|     3|     0|     0|
|iv7s0lr40b17x33tf...|2017-06-05 07:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|     0|     0|     0|     0|     0|
|39r4w8qg62iexlysk...|2017-06-05 07:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|     0|     0|     0|     0|     0|
|fncm9bh9l25bmvyzq...|2017-06-05 07:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|     0|     0|     0|     0|     0|
|q8gdi8070w6yitd4h...|2017-06-05 07:19:4

### below is WIP (mostly trash)

In [141]:
"""
Wikipedia syntax
Font:
- ''italics''
- '''bold'''
- '''''both'''''
- <s>stroke</s>
- <u>underlined</u>
- <!--comments-->

Images:
    [[File: | thumb  | upright | right | alt= | caption ]]
    Example: [[File:Wiki.png|thumb|Caption]]

Internal Links:
    [[A]] -- internal reference to an article titled A
    [[A|B]] -- internal reference to an article titled A (written as B)
    [[A#C|B]] -- internal reference to a section C of an article titled A (written as B)
External Links:
    https://www.google.com -- simple link
    [https://www.google.com] -- link (reference)
    [https://www.google.com A] -- reference written as A
    <ref name="B">[https://www.google.com A]</ref> -- reference A written as B, can be referenced again like:
    <ref name="B" /> -- reference to the source B
    <ref>Lots of words</ref> -- reference without a link
    {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- external reference
    Example:
        {{sfnm|1a1=McLaughlin|1y=2007|1p=59|2a1=Flint|2y=2009|2p=27}} -- McLaughlin 2007, p. 59; Flint 2009, p. 27.
        {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- Craig 2005, p. 14; Sheehan 2003, p. 85.

{{reflist}} -- list of references
{{cn}} -- citation needed

[[Category:Category name]]
[[:Category:Category name]]
[[:File:File name]]
"""

In [None]:
"""
Cite Web:
    <ref>{{cite web
    |url= 
    |title= 
    |author= 
    |date= 
    |work= 
    |publisher= 
    |accessdate=
    }}</ref>

Cite Journal:
    <ref>{{cite journal
    |last1= 
    |first1=
    |last2=
    |first2=
    |year= 
    |title=
    |journal= 
    |volume= 
    |issue= 
    |pages= 
    |publisher= 
    |doi= 
    |url=
    |accessdate= }}</ref>
    
Cite Book (Short):
    <ref>{{cite book
    |last = 
    |first = 
    |authorlink = 
    |title = 
    |publisher = 
    |series =  
    |year =  
    |doi = 
    |isbn = 
    }}</ref>

Cite Book (Extended):
    <ref>{{cite book
    | last       = 
    | first      = 
    | authorlink = 
    | coauthors  = 
    | editor        = 
    | title         = 
    | trans_title   = 
    | url           = 
    | accessdate    = 
    | edition   = 
    | series    = 
    | volume    = 
    | date      = 
    | year      = 
    | publisher = 
    | location  = 
    | isbn      = 
    | doi       = 
    | page      = 
    | pages     = 
    | chapter   = 
    }}</ref>
    
"""

In [None]:
spark.stop()

### Clustering

In [11]:
# from pyspark.ml.clustering import BisectingKMeans

# # Loads data.
# dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

# # Trains a bisecting k-means model.
# bkm = BisectingKMeans().setK(2).setSeed(1)
# model = bkm.fit(dataset)

# # Evaluate clustering.
# cost = model.computeCost(dataset)
# print("Within Set Sum of Squared Errors = " + str(cost))

# # Shows the result.
# print("Cluster Centers: ")
# centers = model.clusterCenters()
# for center in centers:
#     print(center)

In [257]:
# import mwapi
# import mwtypes
# import requests
# from datetime import datetime as dt

# session = mwapi.Session("https://en.wikipedia.org")#, user_agent="diego@wikimedia.org -- Tools demo")

# LOCAL_TIMEDELTA = dt.now().astimezone().utcoffset().total_seconds()

# WEIGHTS = {'Stub': 1, 'Start': 2, 'C': 3, 'B': 4, 'GA': 5, 'FA': 6}

# def score2sum(score_doc):
#     weighted_sum = 0
#     for cl, proba in score_doc['probability'].items():
#         weighted_sum += WEIGHTS[cl] * proba
#     return weighted_sum

# def fetch_wp10_score(rev_id):
#     response = requests.get('https://ores.wikimedia.org/v3/scores/enwiki/{0}/wp10'.format(rev_id))
#     return response.json()['enwiki']['scores'][str(rev_id)]['wp10']['score']

# def get_revision(page_name, timestamp, timedelta=LOCAL_TIMEDELTA):
#     for response_doc in session.get(action='query', prop='revisions', titles=page_name,
#                                     rvprop=['ids', 'timestamp'], rvlimit=100, rvdir="newer", 
#                                     formatversion=2, continuation=True):
# #         print(response_doc)
#         rev_docs = response_doc['query']['pages'][0]['revisions']
#         rev_docs = pd.DataFrame(rev_docs)[['revid', 'timestamp']]
#         rev_docs['time'] = pd.to_datetime(rev_docs.timestamp, \
#                                           format='%Y-%m-%dT%H:%M:%SZ', \
#                                           errors='ignore') + pd.Timedelta(seconds=timedelta)
#         rev_id = rev_docs[rev_docs.time == timestamp]["revid"]
#         if len(rev_id) == 1:
#             rev_id = rev_id.values[0]
#             score = fetch_wp10_score(rev_id)
#             weighted_sum = score2sum(score)
#         #     return rev_id, score, weighted_sum
#             return weighted_sum
#         return None

# get_revision("AccessibleComputing", "2018-08-14 09:47:24")

# # get_revision("AccessibleComputing", "2018-08-14 09:47:24")
# # fetch_historical_scores("AccessibleComputing")


# def test(page_name):
#     session.get(action='query', prop='revisions', titles=page_name,
#                                     rvprop=['ids', 'timestamp'], rvlimit=100, rvdir="newer", 
#                                     formatversion=2, continuation=True)
#     return page_name

# def fetch_ores(df):
#     return df.withColumn("ores", test(col("title")))

# fetch_ores(clean_df).show(1)
# # count_headings(clean_df).show(20)