In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql import *
from pyspark.sql.functions import col, lower, regexp_replace, split, size

### EDA + Processing

In [140]:
# df_path = "../csv_data/top-10-sample-output.csv"
df_path = "../csv_data/sample-output.csv"
df = spark.read.csv(df_path, inferSchema=True, header=True, multiLine=True, escape='"')
pdf = pd.read_csv(df_path, index_col=0)

In [143]:
"""
Columns filtering
    Useful: sha1 (as identifier),  timestamp, title, text
    Questionable: user, comment, ip, id (there are different articles with the same id), parentid, restrictions
    Not useful (no unique info): model, format, ns, contributor, revision, restrictions
""" 

print("All columns:", df.columns)
print("Unique values for..")
for column in ["format", "model", "ns", "contributor", "revision", "restrictions"]:
    print("\t", column, ":", pdf[column].unique())
    
useful_columns = ["sha1", "timestamp", "title", "text"]
print("Useful columns:", useful_columns)

clean_df = df[useful_columns]
clean_pdf = pdf[useful_columns]

All columns: ['_c0', 'comment', 'contributor', 'format', 'id', 'ip', 'model', 'ns', 'parentid', 'restrictions', 'revision', 'sha1', 'text', 'timestamp', 'title', 'username']
Unique values for..
	 format : ['text/x-wiki']
	 model : ['wikitext']
	 ns : [  0   4 100  12]
	 contributor : ['  ' ' ']
	 revision : ['         ' '          ' '        ']
	 restrictions : [nan 'move=:edit=' 'move=sysop' 'edit=autoconfirmed:move=autoconfirmed'
 'sysop' 'edit=sysop:move=sysop']
Useful columns: ['sha1', 'timestamp', 'title', 'text']


In [18]:
clean_df.show(10)
clean_df[["text"]].show(1, truncate=False)

+--------------------+-------------------+--------------------+--------------------+
|                sha1|          timestamp|               title|                text|
+--------------------+-------------------+--------------------+--------------------+
|42l0cvblwtb4nnupx...|2018-08-14 09:47:24| AccessibleComputing|#REDIRECT [[Compu...|
|2d0jpq2oi6jjc6hbb...|2019-06-16 03:28:20|           Anarchism|{{redirect2|Anarc...|
|iv7s0lr40b17x33tf...|2017-06-05 07:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|
|39r4w8qg62iexlysk...|2017-06-05 07:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|
|fncm9bh9l25bmvyzq...|2017-06-05 07:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|
|q8gdi8070w6yitd4h...|2017-06-05 07:19:45|AfghanistanCommun...|#REDIRECT [[Commu...|
|miah0hk4ws6ctake8...|2017-06-05 00:42:11|AfghanistanTransp...|#REDIRECT [[Trans...|
|j013t2shx5j3p2gq4...|2017-06-05 00:43:11| AfghanistanMilitary|#REDIRECT [[Afgha...|
|80xx3tzgvcdioufir...|2017-06-05 00:43:14|AfghanistanTransn...|#R

In [279]:
clean_df.printSchema()
print("Size of the DataFrame: {} records".format(clean_df.count()))

root
 |-- sha1: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)

Size of the DataFrame: 19821 records


In [147]:
"""
Headings counting
Syntaxis:
    ==Level 2==
    ===Level 3===
    ====Level 4====
    =====Level 5=====
    ======Level 6======
"""

def single_head_level_count(text, level):
    assert level in range(2,7)
    pattern = "=" * level
    pattern = pattern + "[a-zA-Z0-9.,!? ]+" + pattern
    return size(split(text, pattern=pattern))-1

def count_headings(df):
    return reduce(
        lambda df, level: df.withColumn("level{}".format(level),
                                        single_head_level_count(col("text"), level)),
        range(2,7), df)
    
count_headings(clean_df).show(20)

+--------------------+-------------------+--------------------+--------------------+------+------+------+------+------+
|                sha1|          timestamp|               title|                text|level2|level3|level4|level5|level6|
+--------------------+-------------------+--------------------+--------------------+------+------+------+------+------+
|42l0cvblwtb4nnupx...|2018-08-14 09:47:24| AccessibleComputing|#REDIRECT [[Compu...|     0|     0|     0|     0|     0|
|2d0jpq2oi6jjc6hbb...|2019-06-16 03:28:20|           Anarchism|{{redirect2|Anarc...|    29|    16|     3|     0|     0|
|iv7s0lr40b17x33tf...|2017-06-05 07:18:18|  AfghanistanHistory|#REDIRECT [[Histo...|     0|     0|     0|     0|     0|
|39r4w8qg62iexlysk...|2017-06-05 07:18:23|AfghanistanGeography|#REDIRECT [[Geogr...|     0|     0|     0|     0|     0|
|fncm9bh9l25bmvyzq...|2017-06-05 07:19:42|   AfghanistanPeople|#REDIRECT [[Demog...|     0|     0|     0|     0|     0|
|q8gdi8070w6yitd4h...|2017-06-05 07:19:4

### below is WIP

In [141]:
"""
Wikipedia syntax
Font:
- ''italics''
- '''bold'''
- '''''both'''''
- <s>stroke</s>
- <u>underlined</u>
- <!--comments-->

Images:
    [[File: | thumb  | upright | right | alt= | caption ]]
    Example: [[File:Wiki.png|thumb|Caption]]

Internal Links:
    [[A]] -- internal reference to an article titled A
    [[A|B]] -- internal reference to an article titled A (written as B)
    [[A#C|B]] -- internal reference to a section C of an article titled A (written as B)
External Links:
    https://www.google.com -- simple link
    [https://www.google.com] -- link (reference)
    [https://www.google.com A] -- reference written as A
    <ref name="B">[https://www.google.com A]</ref> -- reference A written as B, can be referenced again like:
    <ref name="B" /> -- reference to the source B
    <ref>Lots of words</ref> -- reference without a link
    {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- external reference
    Example:
        {{sfnm|1a1=McLaughlin|1y=2007|1p=59|2a1=Flint|2y=2009|2p=27}} -- McLaughlin 2007, p. 59; Flint 2009, p. 27.
        {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- Craig 2005, p. 14; Sheehan 2003, p. 85.

{{reflist}} -- list of references
{{cn}} -- citation needed

[[Category:Category name]]
[[:Category:Category name]]
[[:File:File name]]
"""

In [None]:
"""
Cite Web:
    <ref>{{cite web
    |url= 
    |title= 
    |author= 
    |date= 
    |work= 
    |publisher= 
    |accessdate=
    }}</ref>

Cite Journal:
    <ref>{{cite journal
    |last1= 
    |first1=
    |last2=
    |first2=
    |year= 
    |title=
    |journal= 
    |volume= 
    |issue= 
    |pages= 
    |publisher= 
    |doi= 
    |url=
    |accessdate= }}</ref>
    
Cite Book (Short):
    <ref>{{cite book
    |last = 
    |first = 
    |authorlink = 
    |title = 
    |publisher = 
    |series =  
    |year =  
    |doi = 
    |isbn = 
    }}</ref>

Cite Book (Extended):
    <ref>{{cite book
    | last       = 
    | first      = 
    | authorlink = 
    | coauthors  = 
    | editor        = 
    | title         = 
    | trans_title   = 
    | url           = 
    | accessdate    = 
    | edition   = 
    | series    = 
    | volume    = 
    | date      = 
    | year      = 
    | publisher = 
    | location  = 
    | isbn      = 
    | doi       = 
    | page      = 
    | pages     = 
    | chapter   = 
    }}</ref>
    
"""

In [None]:
"""
We need to remove:
    {{Outdent|...}}
    {{convert|...}}
"""

In [127]:
source_df

DataFrame[name: string, eye_color: string]

In [117]:
from functools import reduce
source_df = spark.createDataFrame(
    [
        ("Jose", "BLUE"),
        ("lI", "BrOwN")
    ],
    ["name", "eye_color"]
)

actual_df = (reduce(
    lambda df, col_name: df.withColumn(col_name, lower(col(col_name))),
    source_df.columns,
    source_df
))

print(source_df.show())
print(actual_df.show())

+----+---------+
|name|eye_color|
+----+---------+
|Jose|     BLUE|
|  lI|    BrOwN|
+----+---------+

None
+----+---------+
|name|eye_color|
+----+---------+
|jose|     blue|
|  li|    brown|
+----+---------+

None


In [14]:
def get_feature_from_text(df):
    return df.select(split(df.s, '[0-9]+').alias('s'))

# df = spark.createDataFrame([('ab12cd',)], ['s',])
get_feature_from_text(df).show()
# df.show()

+--------+
|       s|
+--------+
|[ab, cd]|
+--------+



In [381]:
def clean_text(c):
#     c = lower(c)
#     c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
#     c = split(c, "\\n+")
    c = split(c, "====")
    return size(c)

# clean_df.select(clean_text(col("text")).alias("num_paragraphs")).show(4, truncate=False)
# 
# clean_df.printSchema()
# clean_df.select("text").map()#clean_text).show(10)
# clean_df.withColumn("test", clean_df.text[0]).select("test").show(5)

+--------------+
|num_paragraphs|
+--------------+
|1             |
|13            |
|1             |
|1             |
+--------------+
only showing top 4 rows



In [235]:
def get_features_from_text(text):
    textFile = spark.read.text(text)
#     features = pd.Series()
#     features["redirect"] = text[:9] == "#REDIRECT"
#     return features
    return textFile

print(list(pdf.columns))


# for row in df.iterrows():
for i in range(1,5):
    row = pdf.loc[i]
    print("\n\nARTICLE", i, "|", row["title"])
#     print("> Comment:", row["comment"])
#     print("> User:", row["username"])
#     print(row["ns"])
    print("> Text:\n", "\n".join(row["text"].split('\n')[:5]))
#     print("> Features:\n", get_features_from_text(row["text"]))

['comment', 'contributor', 'format', 'id', 'ip', 'model', 'ns', 'parentid', 'restrictions', 'revision', 'sha1', 'text', 'timestamp', 'title', 'username']


ARTICLE 1 | Anarchism
> Text:
 {{redirect2|Anarchist|Anarchists|other uses|Anarchists (disambiguation)}}
{{pp-move-indef}}{{short description|Political philosophy that advocates self-governed societies}}
{{use dmy dates|date=July 2018}}
{{use British English|date=January 2014}}
{{anarchism sidebar}}


ARTICLE 2 | AfghanistanHistory
> Text:
 #REDIRECT [[History of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}


ARTICLE 3 | AfghanistanGeography
> Text:
 #REDIRECT [[Geography of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}


ARTICLE 4 | AfghanistanPeople
> Text:
 #REDIRECT [[Demographics of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}


### Clustering

In [11]:
# from pyspark.ml.clustering import BisectingKMeans

# # Loads data.
# dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

# # Trains a bisecting k-means model.
# bkm = BisectingKMeans().setK(2).setSeed(1)
# model = bkm.fit(dataset)

# # Evaluate clustering.
# cost = model.computeCost(dataset)
# print("Within Set Sum of Squared Errors = " + str(cost))

# # Shows the result.
# print("Cluster Centers: ")
# centers = model.clusterCenters()
# for center in centers:
#     print(center)

In [None]:
spark.stop()