In [94]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to the project's helper .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Train

In [95]:
import findspark
try:
    # First let findspark locate the Spark installation on its own.
    findspark.init()
except Exception:
    # Fall back to an explicit local Spark install when automatic discovery fails.
    # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently swallowed.
    PYSPARK_PATH = '../spark/spark-2.4.3-bin-hadoop2.7/' # change path to yours
    findspark.init(PYSPARK_PATH)
# pyspark is imported only after findspark.init() has run, so these imports
# must stay below the try/except block.
import pyspark
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import UserDefinedFunction

In [96]:
import os
from glob import glob
import numpy as np
import pandas as pd

import sys
# Pipeline stage directories that hold the project's helper modules.
stages = [
    "1_data_collection",
    "2_feature_engineering",
    "3_modeling",
    "4_evaluation",
]
# Prepend every stage directory to the import search path. The reversed slice
# reproduces the effect of inserting each stage at position 0 in turn: the last
# stage ends up first on sys.path.
sys.path[:0] = stages[::-1]

from xml_to_csv import process_dumps
from functions import filter_columns, extract_features

## Configure for sample data

In [46]:
# Root directory for the sample data and the Wikipedia dump date (YYYYMMDD).
DATA_DIR = "sample_data"
DATE = "20190701"
# Downloaded XML dumps and the CSV files derived from them.
XML_DIR = os.path.join(DATA_DIR, "xml")
CSV_DIR = os.path.join(DATA_DIR, "csv")

DUMP_BASE_URL = "https://dumps.wikimedia.org/enwiki/{}".format(DATE)

# Dump file names are derived from DATE so they cannot drift from DUMP_BASE_URL.
# NOTE(review): the page-range suffix (p7697599p7744799) presumably belongs to
# this specific dump -- confirm it against the dump index when changing DATE.
dump_names = ["enwiki-{}-pages-articles-multistream14.xml-p7697599p7744799".format(DATE)]
dump_ext = ".bz2"

## Collect data

In [52]:
!rm $DATA_DIR/xml/* 2> null
for dump_name in dump_names:
    print("Loading {}...".format(dump_name))
    !wget -P $DATA_DIR/xml/ $DUMP_BASE_URL/$dump_name$dump_ext 2> /dev/null
    !bzip2 -d $DATA_DIR/xml/$dump_name$dump_ext 2> /dev/null

Loading:
	> enwiki-20190701-pages-articles-multistream14.xml-p7697599p7744799...


In [53]:
print('Parsing XML + Fetching ORES...')
process_dumps(XML_DIR, CSV_DIR, jupyter=True)
!rm $DATA_DIR/xml/* 2> null
print('Collected wiki dump(s) with ORES in {}/csv'.format(DATA_DIR))

Parsing XML + Fetching ORES...
XML Files found: enwiki-20190701-pages-articles-multistream14.xml-p7697599p7744799
Processing enwiki-20190701-pages-articles-multistream14.xml-p7697599p7744799


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

Alert: Exception fetching ORES score for revisions. Ex: 'score'
Alert: Exception fetching ORES score for revisions. Ex: 'score'
Wiki dump(s) with ORES collected in sample_data/csv


## Extract features

In [120]:
# Raw CSV files for this dump date; sorted so the read order is stable across runs.
csv_raw = sorted(glob(os.path.join(CSV_DIR, "enwiki-{}-pages-articles-multistream*_raw.csv".format(DATE))))
csv_feature = os.path.join(CSV_DIR, "enwiki-{}-features.csv".format(DATE))

# Fail early with a clear message instead of handing Spark an empty path list.
if not csv_raw:
    raise FileNotFoundError(
        "No raw CSV files for dump {} found in {}; run the data collection cells first".format(DATE, CSV_DIR)
    )

# multiLine and escape='"' are needed when fields hold embedded newlines or
# doubled quotes -- presumably the case for the raw article text; confirm.
df = spark.read.csv(csv_raw, inferSchema=True, header=True, multiLine=True, escape='"')

df_features = extract_features(df)

# The count reported here is for the raw DataFrame, not the extracted features.
print("Size of the DataFrame: {} records".format(df.count()))
df_features.printSchema()

# Collect the features to the driver and write them as a single CSV
# (the pandas index is written as the first column).
df_features.toPandas().to_csv(csv_feature)

Size of the DataFrame: 12484 records
root
 |-- Stub: double (nullable = true)
 |-- Start: double (nullable = true)
 |-- C: double (nullable = true)
 |-- B: double (nullable = true)
 |-- GA: double (nullable = true)
 |-- FA: double (nullable = true)
 |-- n_words: double (nullable = false)
 |-- n_internal_links: double (nullable = false)
 |-- n_external_links: double (nullable = false)
 |-- level2: double (nullable = false)
 |-- level3: double (nullable = false)
 |-- level4: double (nullable = false)
 |-- level5: double (nullable = false)
 |-- level6: double (nullable = false)
 |-- book_citations: double (nullable = false)
 |-- journal_citations: double (nullable = false)
 |-- n_paragraphs: double (nullable = false)
 |-- n_unreferenced: double (nullable = false)
 |-- n_categories: double (nullable = false)
 |-- n_images: double (nullable = false)



# Test