In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql import *
from pyspark.sql.functions import col, lower, regexp_replace, split, size

In [5]:
!ls ..

csv_data	     notebooks	requirements.txt  xml_to_csv.ipynb
load_sample_data.sh  readme.md	xml_data


### EDA + Processing

In [6]:
# df_path = "../csv_data/top-10-sample-output.csv"
df_path = "../csv_data/sample-output.csv"
df = spark.read.csv(df_path, inferSchema=True, header=True, multiLine=True, escape='"')
pdf = pd.read_csv(df_path, index_col=0)

In [9]:
df.head()

Row(_c0=0, comment='remove from category for seeking instructions on rcats', contributor='  ', format='text/x-wiki', id=23257138, ip=None, model='wikitext', ns=0, parentid=834079434, restrictions=None, revision='         ', sha1='42l0cvblwtb4nnupxm6wo000d27t6kf', text='#REDIRECT [[Computer accessibility]]\n\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}', timestamp=datetime.datetime(2018, 8, 14, 6, 47, 24), title='AccessibleComputing', username='Godsy')

In [10]:
"""
Columns filtering
    Useful: sha1 (as identifier),  timestamp, title, text
    Questionable: user, comment, ip, id (there are different articles with the same id), parentid, restrictions
    Not useful (no unique info): model, format, ns, contributor, revision, restrictions
""" 

print("All columns:", df.columns)
print("Unique values for..")
for column in ["format", "model", "ns", "contributor", "revision", "restrictions"]:
    print("\t", column, ":", pdf[column].unique())
    
useful_columns = ["sha1", "timestamp", "title", "text"]
print("Useful columns:", useful_columns)

clean_df = df[useful_columns]
clean_pdf = pdf[useful_columns]

All columns: ['_c0', 'comment', 'contributor', 'format', 'id', 'ip', 'model', 'ns', 'parentid', 'restrictions', 'revision', 'sha1', 'text', 'timestamp', 'title', 'username']
Unique values for..
	 format : ['text/x-wiki']
	 model : ['wikitext']
	 ns : [  0   4 100  12]
	 contributor : ['  ' ' ']
	 revision : ['         ' '          ' '        ']
	 restrictions : [nan 'move=:edit=' 'move=sysop' 'edit=autoconfirmed:move=autoconfirmed'
 'sysop' 'edit=sysop:move=sysop']
Useful columns: ['sha1', 'timestamp', 'title', 'text']


In [11]:
clean_df.show(10)
clean_df[["text"]].show(1, truncate=False)

+--------------------+-------------------+--------------------+--------------------+
|                sha1|          timestamp|               title|                text|
+--------------------+-------------------+--------------------+--------------------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|
|miah0hk4ws6ctake8...|2017-06-04 21:42:11|AfghanistanTransp...|#REDIRECT [[Trans...|
|j013t2shx5j3p2gq4...|2017-06-04 21:43:11| AfghanistanMilitary|#REDIRECT [[Afgha...|
|80xx3tzgvcdioufir...|2017-06-04 21:43:14|AfghanistanTransn...|#R

In [12]:
clean_df.printSchema()
print("Size of the DataFrame: {} records".format(clean_df.count()))

root
 |-- sha1: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)

Size of the DataFrame: 19821 records


In [28]:
clean_df.show()

+--------------------+-------------------+--------------------+--------------------+
|                sha1|          timestamp|               title|                text|
+--------------------+-------------------+--------------------+--------------------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|
|miah0hk4ws6ctake8...|2017-06-04 21:42:11|AfghanistanTransp...|#REDIRECT [[Trans...|
|j013t2shx5j3p2gq4...|2017-06-04 21:43:11| AfghanistanMilitary|#REDIRECT [[Afgha...|
|80xx3tzgvcdioufir...|2017-06-04 21:43:14|AfghanistanTransn...|#R

In [35]:
# list(pdf.head()['text'])

In [36]:
'''Internal Links:
    [[A]] -- internal reference to an article titled A
    [[A|B]] -- internal reference to an article titled A (written as 😎
    [[A#C|B]] -- internal reference to a section C of an article titled A (written as B)'''



def count_internal_links(df):
    pattern = "\[\[[a-zA-Z0-9.,!? ]+\]\]"
    pattern += "|\[\[[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\]\]"
    pattern += "|\[\[[a-zA-Z0-9.,!? ]+#[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\]\]"
    
    return df.withColumn("n_internal_links", size(split(col('text'), pattern=pattern))-1 )

count_internal_links(clean_df).show()

+--------------------+-------------------+--------------------+--------------------+----------------+
|                sha1|          timestamp|               title|                text|n_internal_links|
+--------------------+-------------------+--------------------+--------------------+----------------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|               1|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|             389|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|               1|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|               1|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|               1|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|               1|
|miah0hk4ws6ctake8...|2017-06-04 21:42:11|AfghanistanTransp...|#REDIRECT [[Trans..

In [77]:
'''
    [[File: | thumb  | upright | right | alt= | caption ]]
'''

def count_of_images(df):
    any_text = "[a-zA-Z0-9.,!? ]+ \] "
    pattern = "\[[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\]"
    return df.withColumn("count_images", size(split(col('text'), pattern=pattern))-1)

count_of_images(clean_df).show()


+--------------------+-------------------+--------------------+--------------------+------------+
|                sha1|          timestamp|               title|                text|count_images|
+--------------------+-------------------+--------------------+--------------------+------------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|           0|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|           0|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|           0|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|           0|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|           0|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|           0|
|miah0hk4ws6ctake8...|2017-06-04 21:42:11|AfghanistanTransp...|#REDIRECT [[Trans...|           0|
|j013t2shx5j3p2gq4..

In [98]:
result = count_of_images(clean_df)


In [105]:
from pyspark.sql import functions as f
result.select(f.sum('count_images')).show()

+-----------------+
|sum(count_images)|
+-----------------+
|                0|
+-----------------+



In [107]:
def words_counts(df):
    return df.withColumn('words_count', f.size(f.split(f.col('text'), ' ')))

words_counts(clean_df).show()

+--------------------+-------------------+--------------------+--------------------+-----------+
|                sha1|          timestamp|               title|                text|words_count|
+--------------------+-------------------+--------------------+--------------------+-----------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|          8|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|      10120|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|          8|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|          8|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|          8|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|          8|
|miah0hk4ws6ctake8...|2017-06-04 21:42:11|AfghanistanTransp...|#REDIRECT [[Trans...|         10|
|j013t2shx5j3p2gq4...|2017-06-

In [124]:
def unique_words_counts(df):

    return df.withColumn('words_count', f.size(f.split(f.col('text'), ' ')) )

unique_words_counts(clean_df).show()

+--------------------+-------------------+--------------------+--------------------+-----------+
|                sha1|          timestamp|               title|                text|words_count|
+--------------------+-------------------+--------------------+--------------------+-----------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|          8|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|      10120|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|          8|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|          8|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|          8|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|          8|
|miah0hk4ws6ctake8...|2017-06-04 21:42:11|AfghanistanTransp...|#REDIRECT [[Trans...|         10|
|j013t2shx5j3p2gq4...|2017-06-

In [97]:
# result.xagg({'*': 'count', 'Age': 'avg', 'Fare':'sum'}).show()

In [78]:
# import re
# pattern = r"\[[a-zA-Z0-9.,!? ]+\]"
# # result = re.match(pattern, '[[File: | thumb  | upright | right | alt= | caption ]]')
# result = re.match(pattern, '[File:]')
# print(result)

In [79]:
# # type(count_of_images(clean_df))
# from pyspark.sql import functions as F
# count_of_images(clean_df).select(F.sum('count_images')).collect()[0][0]

In [57]:
# type(clean_df)

In [29]:
def get_feature_from_text(df):
    return df.select(split(df.s, '[0-9]+').alias('text'))

# df = spark.createDataFrame([('ab12cd',)], ['s',])
get_feature_from_text(clean_df).show()

AttributeError: 'DataFrame' object has no attribute 's'

In [14]:
from functools import reduce

"""
Headings counting
Syntaxis:
    ==Level 2==
    ===Level 3===
    ====Level 4====
    =====Level 5=====
    ======Level 6======
"""

def single_head_level_count(text, level):
    assert level in range(2,7)
    pattern = "=" * level
    pattern = pattern + "[a-zA-Z0-9.,!? ]+" + pattern
    return size(split(text, pattern=pattern))-1

def count_headings(df):
    return reduce(
        lambda df, level: df.withColumn("level{}".format(level),
                                        single_head_level_count(col("text"), level)),
        range(2,7), df)
    
count_headings(clean_df).show(20)

+--------------------+-------------------+--------------------+--------------------+------+------+------+------+------+
|                sha1|          timestamp|               title|                text|level2|level3|level4|level5|level6|
+--------------------+-------------------+--------------------+--------------------+------+------+------+------+------+
|42l0cvblwtb4nnupx...|2018-08-14 06:47:24| AccessibleComputing|#REDIRECT [[Compu...|     0|     0|     0|     0|     0|
|2d0jpq2oi6jjc6hbb...|2019-06-16 00:28:20|           Anarchism|{{redirect2|Anarc...|    29|    16|     3|     0|     0|
|iv7s0lr40b17x33tf...|2017-06-05 04:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|     0|     0|     0|     0|     0|
|39r4w8qg62iexlysk...|2017-06-05 04:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|     0|     0|     0|     0|     0|
|fncm9bh9l25bmvyzq...|2017-06-05 04:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|     0|     0|     0|     0|     0|
|q8gdi8070w6yitd4h...|2017-06-05 04:19:4

### below is WIP

In [15]:
"""
Wikipedia syntax
Font:
- ''italics''
- '''bold'''
- '''''both'''''
- <s>stroke</s>
- <u>underlined</u>
- <!--comments-->

Images:
    [[File: | thumb  | upright | right | alt= | caption ]]
    Example: [[File:Wiki.png|thumb|Caption]]

Internal Links:
    [[A]] -- internal reference to an article titled A
    [[A|B]] -- internal reference to an article titled A (written as B)
    [[A#C|B]] -- internal reference to a section C of an article titled A (written as B)
External Links:
    https://www.google.com -- simple link
    [https://www.google.com] -- link (reference)
    [https://www.google.com A] -- reference written as A
    <ref name="B">[https://www.google.com A]</ref> -- reference A written as B, can be referenced again like:
    <ref name="B" /> -- reference to the source B
    <ref>Lots of words</ref> -- reference without a link
    {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- external reference
    Example:
        {{sfnm|1a1=McLaughlin|1y=2007|1p=59|2a1=Flint|2y=2009|2p=27}} -- McLaughlin 2007, p. 59; Flint 2009, p. 27.
        {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- Craig 2005, p. 14; Sheehan 2003, p. 85.

{{reflist}} -- list of references
{{cn}} -- citation needed

[[Category:Category name]]
[[:Category:Category name]]
[[:File:File name]]
"""

'\nWikipedia syntax\nFont:\n- \'\'italics\'\'\n- \'\'\'bold\'\'\'\n- \'\'\'\'\'both\'\'\'\'\'\n- <s>stroke</s>\n- <u>underlined</u>\n- <!--comments-->\n\nImages:\n    [[File: | thumb  | upright | right | alt= | caption ]]\n    Example: [[File:Wiki.png|thumb|Caption]]\n\nInternal Links:\n    [[A]] -- internal reference to an article titled A\n    [[A|B]] -- internal reference to an article titled A (written as B)\n    [[A#C|B]] -- internal reference to a section C of an article titled A (written as B)\nExternal Links:\n    https://www.google.com -- simple link\n    [https://www.google.com] -- link (reference)\n    [https://www.google.com A] -- reference written as A\n    <ref name="B">[https://www.google.com A]</ref> -- reference A written as B, can be referenced again like:\n    <ref name="B" /> -- reference to the source B\n    <ref>Lots of words</ref> -- reference without a link\n    {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- external reference\n    Example:\n     

In [16]:
"""
Cite Web:
    <ref>{{cite web
    |url= 
    |title= 
    |author= 
    |date= 
    |work= 
    |publisher= 
    |accessdate=
    }}</ref>

Cite Journal:
    <ref>{{cite journal
    |last1= 
    |first1=
    |last2=
    |first2=
    |year= 
    |title=
    |journal= 
    |volume= 
    |issue= 
    |pages= 
    |publisher= 
    |doi= 
    |url=
    |accessdate= }}</ref>
    
Cite Book (Short):
    <ref>{{cite book
    |last = 
    |first = 
    |authorlink = 
    |title = 
    |publisher = 
    |series =  
    |year =  
    |doi = 
    |isbn = 
    }}</ref>

Cite Book (Extended):
    <ref>{{cite book
    | last       = 
    | first      = 
    | authorlink = 
    | coauthors  = 
    | editor        = 
    | title         = 
    | trans_title   = 
    | url           = 
    | accessdate    = 
    | edition   = 
    | series    = 
    | volume    = 
    | date      = 
    | year      = 
    | publisher = 
    | location  = 
    | isbn      = 
    | doi       = 
    | page      = 
    | pages     = 
    | chapter   = 
    }}</ref>
    
"""

'\nCite Web:\n    <ref>{{cite web\n    |url= \n    |title= \n    |author= \n    |date= \n    |work= \n    |publisher= \n    |accessdate=\n    }}</ref>\n\nCite Journal:\n    <ref>{{cite journal\n    |last1= \n    |first1=\n    |last2=\n    |first2=\n    |year= \n    |title=\n    |journal= \n    |volume= \n    |issue= \n    |pages= \n    |publisher= \n    |doi= \n    |url=\n    |accessdate= }}</ref>\n    \nCite Book (Short):\n    <ref>{{cite book\n    |last = \n    |first = \n    |authorlink = \n    |title = \n    |publisher = \n    |series =  \n    |year =  \n    |doi = \n    |isbn = \n    }}</ref>\n\nCite Book (Extended):\n    <ref>{{cite book\n    | last       = \n    | first      = \n    | authorlink = \n    | coauthors  = \n    | editor        = \n    | title         = \n    | trans_title   = \n    | url           = \n    | accessdate    = \n    | edition   = \n    | series    = \n    | volume    = \n    | date      = \n    | year      = \n    | publisher = \n    | location  = \n   

In [None]:
"""
We need to remove:
    {{Outdent|...}}
    {{convert|...}}
"""

In [19]:
source_df

DataFrame[name: string, eye_color: string]

In [18]:

source_df = spark.createDataFrame(
    [
        ("Jose", "BLUE"),
        ("lI", "BrOwN")
    ],
    ["name", "eye_color"]
)

actual_df = (reduce(
    lambda df, col_name: df.withColumn(col_name, lower(col(col_name))),
    source_df.columns,
    source_df
))

print(source_df.show())
print(actual_df.show())

+----+---------+
|name|eye_color|
+----+---------+
|Jose|     BLUE|
|  lI|    BrOwN|
+----+---------+

None
+----+---------+
|name|eye_color|
+----+---------+
|jose|     blue|
|  li|    brown|
+----+---------+

None


In [21]:
df

DataFrame[_c0: int, comment: string, contributor: string, format: string, id: int, ip: string, model: string, ns: int, parentid: int, restrictions: string, revision: string, sha1: string, text: string, timestamp: timestamp, title: string, username: string]

In [20]:
def get_feature_from_text(df):
    return df.select(split(df.s, '[0-9]+').alias('s'))

# df = spark.createDataFrame([('ab12cd',)], ['s',])
get_feature_from_text(df).show()
# df.show()

AttributeError: 'DataFrame' object has no attribute 's'

In [22]:
def clean_text(c):
#     c = lower(c)
#     c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
#     c = split(c, "\\n+")
    c = split(c, "====")
    return size(c)

# clean_df.select(clean_text(col("text")).alias("num_paragraphs")).show(4, truncate=False)
# 
# clean_df.printSchema()
# clean_df.select("text").map()#clean_text).show(10)
# clean_df.withColumn("test", clean_df.text[0]).select("test").show(5)

In [23]:
def get_features_from_text(text):
    textFile = spark.read.text(text)
#     features = pd.Series()
#     features["redirect"] = text[:9] == "#REDIRECT"
#     return features
    return textFile

print(list(pdf.columns))


# for row in df.iterrows():
for i in range(1,5):
    row = pdf.loc[i]
    print("\n\nARTICLE", i, "|", row["title"])
#     print("> Comment:", row["comment"])
#     print("> User:", row["username"])
#     print(row["ns"])
    print("> Text:\n", "\n".join(row["text"].split('\n')[:5]))
#     print("> Features:\n", get_features_from_text(row["text"]))

['comment', 'contributor', 'format', 'id', 'ip', 'model', 'ns', 'parentid', 'restrictions', 'revision', 'sha1', 'text', 'timestamp', 'title', 'username']


ARTICLE 1 | Anarchism
> Text:
 {{redirect2|Anarchist|Anarchists|other uses|Anarchists (disambiguation)}}
{{pp-move-indef}}{{short description|Political philosophy that advocates self-governed societies}}
{{use dmy dates|date=July 2018}}
{{use British English|date=January 2014}}
{{anarchism sidebar}}


ARTICLE 2 | AfghanistanHistory
> Text:
 #REDIRECT [[History of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}


ARTICLE 3 | AfghanistanGeography
> Text:
 #REDIRECT [[Geography of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}


ARTICLE 4 | AfghanistanPeople
> Text:
 #REDIRECT [[Demographics of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}


### Clustering

In [11]:
# from pyspark.ml.clustering import BisectingKMeans

# # Loads data.
# dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

# # Trains a bisecting k-means model.
# bkm = BisectingKMeans().setK(2).setSeed(1)
# model = bkm.fit(dataset)

# # Evaluate clustering.
# cost = model.computeCost(dataset)
# print("Within Set Sum of Squared Errors = " + str(cost))

# # Shows the result.
# print("Cluster Centers: ")
# centers = model.clusterCenters()
# for center in centers:
#     print(center)

In [None]:
spark.stop()