In [3]:
import os
import numpy as np

import findspark
# Let findspark locate Spark by itself first; if that fails, fall back to an
# explicit local installation path.
try:
    findspark.init()
except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must propagate
    PYSPARK_PATH = '../spark/spark-2.4.3-bin-hadoop2.7/' # change path to yours
    findspark.init(PYSPARK_PATH)
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.sql import *
from pyspark.sql.functions import col, lower, regexp_replace, split, size, UserDefinedFunction
from pyspark.sql.types import StringType, IntegerType
from functools import reduce
import re

In [5]:
# Input CSV (the commented-out line is an alternative input file).
df_path = "csv_data/top-10-sample-output-ores.csv"
# df_path = "csv_data/sample-output-ores.csv"

# Output path: the input path without its last four characters (".csv"),
# followed by "_features.csv".
df_out_path = "{}_features.csv".format(df_path[:-4])

# Read the CSV with a header row, inferred column types, multi-line records
# enabled and '"' as the escape character.
df = spark.read.csv(
    df_path,
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)

In [6]:
"""
Columns filtering
    Useful: sha1 (as identifier),  timestamp, title, text
    Questionable: username, comment, ip, id (there are different articles with the same id), parentid
    Not useful (no unique info): model, format, ns, contributor, revision, restrictions
""" 

# List every column, then print the distinct values of the columns that are
# suspected to carry no information.
print("All columns:", df.columns)
print("Unique values for..")
for column in ["format", "model", "ns", "contributor", "revision", "restrictions"]:
    print("\t", column, ":", [row[0] for row in df.select(column).distinct().collect()])

# Quality labels mapped to ordinal weights (Stub is the lowest, FA the highest).
ores_weights = {'Stub': 1, 'Start': 2, 'C': 3, 'B': 4, 'GA': 5, 'FA': 6}
ores_scores = list(ores_weights)

# Columns kept for feature extraction: identifiers / text plus one column per label.
useful_columns = ["sha1", "timestamp", "title", "text", *ores_scores]
print("Useful columns:", useful_columns)

All columns: ['_c0', 'Unnamed: 0.1', 'comment', 'contributor', 'format', 'id', 'ip', 'model', 'ns', 'parentid', 'restrictions', 'revision', 'sha1', 'text', 'timestamp', 'title', 'username', 'revid', 'B', 'C', 'FA', 'GA', 'Start', 'Stub']
Unique values for..
	 format : ['text/x-wiki']
	 model : ['wikitext']
	 ns : [0]
	 contributor : ['  ']
	 revision : ['         ', '          ']
	 restrictions : [None]
Useful columns: ['sha1', 'timestamp', 'title', 'text', 'Stub', 'Start', 'C', 'B', 'GA', 'FA']


In [7]:
# Project the raw frame onto the useful columns, then show its schema and row count.
clean_df = df.select(*useful_columns)
clean_df.printSchema()
print("Size of the DataFrame: {} records".format(clean_df.count()))

root
 |-- sha1: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- Stub: double (nullable = true)
 |-- Start: double (nullable = true)
 |-- C: double (nullable = true)
 |-- B: double (nullable = true)
 |-- GA: double (nullable = true)
 |-- FA: double (nullable = true)

Size of the DataFrame: 10 records


In [8]:
# Working frame: the feature cells below each add their columns to df_features,
# starting from the column-filtered clean_df.
df_features = clean_df

In [9]:
def words_counts(df):
    """Add an ``n_words`` column with the number of words in ``text``.

    A word is a maximal run of non-whitespace characters. Splitting on a
    single space (the previous behaviour) treated words separated by newlines
    or tabs as one word and counted an empty token for every repeated space.

    Args:
        df: Spark DataFrame with a string ``text`` column.

    Returns:
        ``df`` with an additional integer ``n_words`` column.
    """
    # Splitting on the words themselves yields (number of words + 1) pieces;
    # this is the same "size(split(...)) - 1" match-counting idiom the other
    # feature functions in this notebook use.
    return df.withColumn('n_words', size(split(col('text'), r'\S+')) - 1)

df_features = words_counts(df_features)

In [10]:
"""
Headings counting
Syntax:
    ==Level 2==
    ===Level 3===
    ====Level 4====
    =====Level 5=====
    ======Level 6======
"""

def single_head_level_count(text, level):
    """Build a column expression counting headings of exactly ``level``.

    A level-N heading is a title wrapped in N "=" characters on each side,
    e.g. ``==Level 2==``.

    Args:
        text: Spark column holding the article wikitext.
        level: heading level, an int from 2 to 6 inclusive.

    Returns:
        Column expression equal to the number of level-``level`` headings.

    Raises:
        AssertionError: if ``level`` is outside 2..6.
    """
    assert level in range(2,7)
    marker = "=" * level
    # The lookbehind / lookahead require the "=" run to be exactly `level`
    # characters long. Without them "==X==" also matches inside "===X===", so
    # every deeper heading was counted again at each shallower level.
    # The title is any run of characters other than "=" and newline, so
    # headings containing parentheses, dashes, apostrophes or non-ASCII
    # letters are counted too.
    pattern = r"(?<!=)" + marker + r"[^=\n]+" + marker + r"(?!=)"
    # Splitting on the matches leaves (matches + 1) pieces.
    return size(split(text, pattern=pattern)) - 1

def count_headings(df):
    """Add one heading-count column per heading level.

    For every level from 2 to 6 a column named ``level<N>`` is appended,
    computed by ``single_head_level_count`` on the ``text`` column.

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` extended with the columns ``level2`` .. ``level6``.
    """
    with_levels = df
    for level in range(2, 7):
        column_name = "level{}".format(level)
        with_levels = with_levels.withColumn(
            column_name, single_head_level_count(col("text"), level))
    return with_levels

df_features = count_headings(df_features)

In [166]:
"""
Citation counting
Syntax:
    {{cite {book}(.*?)}}
    {{cite {journal}(.*?)}}
"""

def citation_counter(citation_source):
    """Return a function counting ``{{cite <citation_source> ...}}`` templates.

    Args:
        citation_source: citation kind that follows "cite", e.g. "book" or
            "journal". It is matched literally and case-insensitively.

    Returns:
        A function ``text -> int`` giving the number of matching citation
        templates in ``text``. It returns ``None`` when ``text`` is ``None``
        so that a null input yields a null result instead of raising.
    """
    # The pattern is built by concatenation rather than an f-string: inside an
    # f-string "{{" and "}}" collapse to single braces, so the double braces
    # of the template syntax were never actually part of the pattern.
    # re.escape keeps the source name literal; DOTALL lets ".*?" cross
    # newlines because citation templates are often written over many lines.
    pattern = re.compile(
        r"\{\{\s*cite\s+" + re.escape(citation_source) + r".*?\}\}",
        re.IGNORECASE | re.DOTALL,
    )

    def _count_citations(text):
        if text is None:
            return None
        return sum(1 for _ in pattern.finditer(text))
    return _count_citations

# Wrap the two citation counters as Spark UDFs with an integer return type.
book_citations_count = UserDefinedFunction(citation_counter("book"), IntegerType())
journal_citations_count = UserDefinedFunction(citation_counter("journal"), IntegerType())

# Add one count column per citation kind, both computed from the "text" column.
df_features = (
    df_features
    .withColumn("book_citations", book_citations_count("text"))
    .withColumn("journal_citations", journal_citations_count("text"))
)

In [167]:
'''Internal Links:
    [[A]] -- internal reference to an article titled A
    [[A|B]] -- internal reference to an article titled A (written as B)
    [[A#C|B]] -- internal reference to a section C of an article titled A (written as B)'''

def count_internal_links(df):
    """Add an ``n_internal_links`` column counting wiki links in ``text``.

    Three link forms are recognised: ``[[A]]``, ``[[A|B]]`` and ``[[A#C|B]]``.
    Every part may only consist of ASCII letters, digits, spaces and the
    characters ``. , ! ?``.

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` with an additional ``n_internal_links`` column.
    """
    part = r"[a-zA-Z0-9.,!? ]+"
    link_forms = [
        r"\[\[" + part + r"\]\]",                                # [[A]]
        r"\[\[" + part + r"\|" + part + r"\]\]",                 # [[A|B]]
        r"\[\[" + part + "#" + part + r"\|" + part + r"\]\]",    # [[A#C|B]]
    ]
    pattern = "|".join(link_forms)

    # Splitting on the matches leaves (matches + 1) pieces.
    n_links = size(split(col('text'), pattern=pattern)) - 1
    return df.withColumn("n_internal_links", n_links)

df_features = count_internal_links(df_features)

In [168]:
'''External Links:
    https://www.google.com -- simple link
    [https://www.google.com] -- link (reference)
    [https://www.google.com A] -- reference written as A
    <ref name="B">[https://www.google.com A]</ref> -- reference A written as B, can be referenced again like:
    <ref name="B" /> -- reference to the source B
    <ref>Lots of words</ref> -- reference without a link
    {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- external reference
    Example:
        {{sfnm|1a1=McLaughlin|1y=2007|1p=59|2a1=Flint|2y=2009|2p=27}} -- McLaughlin 2007, p. 59; Flint 2009, p. 27.
        {{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} -- Craig 2005, p. 14; Sheehan 2003, p. 85.'''

def count_external_links(df):
    """Add an ``n_external_links`` column counting external links in ``text``.

    Recognised forms:
        * a bare URL: ``https://www.google.com``
        * a bracketed URL: ``[https://www.google.com]``
        * a bracketed URL with a label: ``[https://www.google.com A]``
        * a named reference wrapping a bracketed URL:
          ``<ref name="B">[https://www.google.com]``

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` with an additional ``n_external_links`` column.
    """
    # Shared sub-patterns; raw strings keep the regex escapes intact.
    url = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    label = r'[a-zA-Z0-9.,!? ]+'

    # The "<ref name=...>" form used to be appended without a leading "|", so
    # it was glued onto the end of the labelled-link alternative instead of
    # being an alternative of its own.
    link_forms = [
        url,
        r'\[' + url + r'\]',
        r'\[' + url + r' ' + label + r'\]',
        r'<ref name="' + label + r'">\[' + url + r'\]',
    ]
    pattern = '|'.join(link_forms)

    # template of the external reference
    #template = '\{\{sfnm\|1a1=[a-zA-Z]+\|1y=[0-9]+\|1p=[0-9]+\|2a1=[a-zA-Z]+\|2y=[0-9]+\|2p=[0-9]+\}\}'

    # <ref name="B" /> - this form use information from other reference, so we didn't count it again
    # <ref>Lots of words</ref> - reference without a link will be in the other feature

    # Splitting on the matches leaves (matches + 1) pieces.
    return df.withColumn("n_external_links",
                         size(split(col('text'), pattern=pattern))-1)

df_features = count_external_links(df_features)

In [169]:
"""
Paragraphs
"""

def count_paragraphs(df):
    """Add an ``n_paragraphs`` column derived from blank-line separated blocks.

    Stand-alone markup blocks (a ``{{...}}`` template, a ``[[...]]`` link or a
    ``=...=`` heading surrounded by blank lines) are removed first; the rest
    is split on blank lines and the column is the number of pieces minus one.

    NOTE(review): the removal deletes the surrounding blank lines as well, so
    the blocks before and after a removed element are joined, and a text
    without any blank line yields 0 - confirm this is the intended count.

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` with an additional ``n_paragraphs`` column.
    """
    # Basic wikipedia syntax that sits alone between two blank lines.
    pattern_filtering = '\n\n\{\{.*\}\}\n\n|\n\n\[\[.*\]\]\n\n|\n\n={1,7}.*={1,7}\n\n'
    # Block separator: two consecutive newlines.
    pattern_splitting = '\n\n'

    text_without_markup = regexp_replace(col('text'), pattern_filtering, '')
    blocks = split(text_without_markup, pattern_splitting)
    return df.withColumn('n_paragraphs', size(blocks) - 1)

df_features = count_paragraphs(df_features)

In [170]:
'''<ref>Lots of words</ref> -- reference without a link
{{cn}} -- citation needed'''

def count_unreferenced(df):
    """Add an ``n_unreferenced`` column counting unsupported statements.

    Counted are ``{{cn}}`` (citation needed) markers and ``<ref>...</ref>``
    references whose content consists only of ASCII letters, digits, spaces
    and the characters ``. , ! ?`` (i.e. no link).

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` with an additional ``n_unreferenced`` column.
    """
    citation_needed = r'\{\{cn\}\}'
    reference_without_link = r'<ref>[a-zA-Z0-9.,!? ]+</ref>'
    pattern = citation_needed + '|' + reference_without_link

    # Splitting on the matches leaves (matches + 1) pieces.
    n_matches = size(split(col('text'), pattern)) - 1
    return df.withColumn('n_unreferenced', n_matches)

df_features = count_unreferenced(df_features)

In [171]:
'''[[Category:Category name]]
[[:Category:Category name]]
[[:File:File name]]'''

def count_categories(df):
    """Add an ``n_categories`` column counting category links in ``text``.

    Matches ``[[Category:Name]]`` and ``[[:Category:Name]]`` where the name
    consists of ASCII letters, digits, spaces and the characters
    ``. , - ! ? ( )``.

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` with an additional ``n_categories`` column.
    """
    category_name = r'[a-zA-Z0-9.,\-!?\(\) ]+'
    pattern = r'\[\[:?Category:' + category_name + r'\]\]'

    # Splitting on the matches leaves (matches + 1) pieces.
    n_matches = size(split(col('text'), pattern)) - 1
    return df.withColumn('n_categories', n_matches)

df_features = count_categories(df_features)

In [172]:
'''
    [[File: | thumb  | upright | right | alt= | caption ]]
'''

def count_of_images(df):
    """Add an ``n_images`` column counting embedded images in ``text``.

    An image is embedded with ``[[File:name | option | ... | caption]]``
    (``Image:`` is accepted as well, and the prefix is matched
    case-insensitively). Only the opening ``[[File:`` is matched, so the
    number and content of the options do not matter.

    The previous pattern required single brackets and exactly eight
    "|"-separated fields made of letters, digits, spaces and ``. , ! ?``; it
    could not match the ``[[File:...]]`` syntax because neither ":" nor "="
    was allowed in a field.

    Links to a file page, written ``[[:File:name]]``, are not counted because
    of their leading colon.

    Args:
        df: Spark DataFrame with a ``text`` column.

    Returns:
        ``df`` with an additional ``n_images`` column.
    """
    pattern = r"(?i)\[\[\s*(?:File|Image)\s*:"
    # Splitting on the matches leaves (matches + 1) pieces.
    return df.withColumn("n_images", size(split(col('text'), pattern=pattern))-1)

df_features = count_of_images(df_features)

In [20]:
# Final column order: title, the quality-label columns, then every extracted feature.
features_names = [
    'title',
    'Stub', 'Start', 'C', 'B', 'GA', 'FA',
    'n_words',
    'level2', 'level3', 'level4', 'level5', 'level6',
    'book_citations', 'journal_citations',
    'n_internal_links', 'n_external_links',
    'n_paragraphs', 'n_unreferenced', 'n_categories', 'n_images',
]

# Keep only those columns; everything except the title is cast to double.
df_features = df_features.select([
    df_features[name] if name == 'title' else df_features[name].cast('double')
    for name in features_names
])

In [21]:
# Print the schema of the selected feature columns to check names and types after the cast.
df_features.printSchema()

root
 |-- title: string (nullable = true)
 |-- Stub: double (nullable = true)
 |-- Start: double (nullable = true)
 |-- C: double (nullable = true)
 |-- B: double (nullable = true)
 |-- GA: double (nullable = true)
 |-- FA: double (nullable = true)
 |-- n_words: double (nullable = false)
 |-- level2: double (nullable = false)
 |-- level3: double (nullable = false)
 |-- level4: double (nullable = false)
 |-- level5: double (nullable = false)
 |-- level6: double (nullable = false)
 |-- book_citations: double (nullable = true)
 |-- journal_citations: double (nullable = true)
 |-- n_internal_links: double (nullable = false)
 |-- n_external_links: double (nullable = false)
 |-- n_paragraphs: double (nullable = false)
 |-- n_unreferenced: double (nullable = false)
 |-- n_categories: double (nullable = false)
 |-- n_images: double (nullable = false)



In [23]:
# Drop every row that has a null in any of the selected feature columns,
# applying one not-null filter per column.
for feature in features_names:
    not_null = df_features[feature].isNotNull()
    df_features = df_features.where(not_null)

In [24]:
# Convert the Spark feature frame to a pandas DataFrame and write it to df_out_path.
# NOTE(review): to_csv is called without index=False, so the pandas row index is
# written as an extra unnamed first column - confirm downstream readers expect it.
df_features.toPandas().to_csv(df_out_path)