# GOALS

* Decide on a viewcount threshold below which questions are dropped (too few views for users who can vote to have seen them)?
* Get feature columns working
* Build an LDA model

In [1]:
## sanity check: run a shell command and show its captured stdout in the notebook
import subprocess

cmd = ['echo', '"Welcome to my PySpark analysis of some StackExchange Data"']
## wait for the command to finish and keep its raw (bytes) stdout
completed = subprocess.run(cmd, stdout=subprocess.PIPE)
output = completed.stdout
print(output)

b'"Welcome to my PySpark analysis of some StackExchange Data"\n'


In [2]:
import time
import numpy as np
import pandas as pd
from datetime import datetime

# Load PySpark and Data Structures

In [3]:
%run -i '1-load-pyspark-and-structs.py'

The Spark UI is available at: http://192.168.0.26:4040/ and the defaultParallelism is 4
The data_array is: ['english', 'math', 'rus_stackoverflow', 'stackoverflow', 'superuser']


# Load Data

In [4]:
%%time
print(datetime.now().time())
#%run -i '2-load-datasets.py'

16:08:13.119420
CPU times: user 199 µs, sys: 77 µs, total: 276 µs
Wall time: 232 µs


In [5]:
def show_spark_df(df, n=5):
    """
    Display the first rows of a Spark DataFrame as a pandas table.

    Parameters
    ----------
    df : object exposing ``head(n)`` and ``columns`` (a Spark DataFrame here)
    n : int, number of leading rows to show (default 5)

    Returns
    -------
    None; the table is rendered through ``display``.
    """
    first_rows = df.head(n)
    preview = pd.DataFrame(first_rows, columns=df.columns)
    display(preview)

# Clean Data

In [6]:
%%time
print(datetime.now().time())
#%run -i '3-clean-datasets.py'

16:08:13.147381
CPU times: user 415 µs, sys: 114 µs, total: 529 µs
Wall time: 498 µs


# EDA (optional)

In [7]:
%%time
print(datetime.now().time())
#%run -i '4-eda.py'

#NB TO DO: Find threshold to delete low views to make sure users that can vote have seen the question

'''
vc_thresh_data = {}

## finding means of viewcounts across fora

for i in data_array:
    vc_thresh_data[i] = datasets[i].select("viewcount").rdd.flatMap(lambda x: x).mean()

vc_thresh_data'''

16:08:13.157070
CPU times: user 186 µs, sys: 68 µs, total: 254 µs
Wall time: 211 µs


# Define Ravi Target Variable and Export Clean Data

In [8]:
%%time
print(f'\033[94m{datetime.now().time()}\033[0m')
#%run -i '5-define-target.py'

[94m16:08:13.172947[0m
CPU times: user 427 µs, sys: 166 µs, total: 593 µs
Wall time: 590 µs


# Load Clean Data

In [9]:
%%time

## load clean data
for i in data_array:
    datasets[i] = (
        spark
        .read
        .load(f'clean-data/{i}.parquet')
)

CPU times: user 5.54 ms, sys: 2.9 ms, total: 8.44 ms
Wall time: 7.74 s


In [None]:
## FIXME: orphaned fragment. It has no DataFrame receiver and its parenthesis was
## never closed, so as live code this cell is a SyntaxError that stops
## "Restart & Run All". Kept as a comment until the intended expression
## (presumably <some_df>.sort(col("count").desc())) is restored.
# .sort(col("count").desc())

# Create Results Dictionary

In [10]:
RESULTS = {}

# Silly Mean and Median Models

In [11]:
from pyspark.sql.functions import array, lit, struct

## Relative rank error tolerated by approxQuantile when estimating the median.
## A loose tolerance such as 0.25 lets the returned "median" rank anywhere between
## the 25th and 75th percentiles; 0 would be exact but can be very expensive.
MEDIAN_RELATIVE_ERROR = 0.01

## create median and mean dictionaries (keyed by dataset name)
y_ravi_medians = {}
y_ravi_means = {}

## compute the constant median and mean of the target used by the baseline models
for i in data_array:
    y_ravi_medians[i] = datasets[i].approxQuantile('y_ravi', [0.5], MEDIAN_RELATIVE_ERROR)[0]
    y_ravi_means[i] = datasets[i].select('y_ravi').rdd.flatMap(lambda x: x).mean()

In [12]:
## import rmse evaluator
from pyspark.ml.evaluation import RegressionEvaluator

## baseline RMSE of the constant-median model, keyed by dataset name
baselines = {}

for i in data_array:

    ## start the clock for this dataset
    started_at = time.time()

    ## evaluator comparing the target with the constant median column
    evaluator = RegressionEvaluator(predictionCol="median_pred", labelCol="y_ravi", metricName="rmse")

    ## "train" the silly median model: attach the constant median as a prediction column
    median_column = lit(y_ravi_medians[i])
    datasets[i] = datasets[i].withColumn('median_pred', median_column)

    ## score the silly median model
    median_rmse = evaluator.evaluate(datasets[i])
    baselines[i] = median_rmse

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m median\033[0m model is {baselines[i]}")

    ## elapsed seconds, rounded to a whole number
    elapsed = round(time.time() - started_at, 0)

    ## (re)initialise this dataset's RESULTS entry as [rmse, 0, seconds]
    RESULTS[i] = {'silly_median': [median_rmse, 0, elapsed]}

The root-mean-square error of [94menglish's[0m[92m median[0m model is 0.01143449130368263
The root-mean-square error of [94mmath's[0m[92m median[0m model is 0.02066314611782247
The root-mean-square error of [94mrus_stackoverflow's[0m[92m median[0m model is 0.01892184255451773
The root-mean-square error of [94mstackoverflow's[0m[92m median[0m model is 0.014222320246117
The root-mean-square error of [94msuperuser's[0m[92m median[0m model is 0.010638377607808124


In [13]:
for i in data_array:

    ## start the clock for this dataset
    started_at = time.time()

    ## evaluator comparing the target with the constant mean column
    evaluator = RegressionEvaluator(predictionCol="mean_pred", labelCol="y_ravi", metricName="rmse")

    ## "train" the silly mean model: attach the constant mean as a prediction column
    mean_column = lit(y_ravi_means[i])
    datasets[i] = datasets[i].withColumn('mean_pred', mean_column)

    ## score the silly mean model
    rmse = evaluator.evaluate(datasets[i])

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m mean\033[0m model is {rmse}")

    ## percentage improvement over the median baseline (positive = lower RMSE)
    ratio_to_baseline = rmse/baselines[i]
    imprvt = round((ratio_to_baseline - 1)*-100, 3)

    ## elapsed seconds, rounded to a whole number
    elapsed = round(time.time() - started_at, 0)

    ## add [rmse, improvement %, seconds] to this dataset's RESULTS entry
    RESULTS[i]['silly_mean'] = [rmse, imprvt, elapsed]

The root-mean-square error of [94menglish's[0m[92m mean[0m model is 0.010981530956766525
The root-mean-square error of [94mmath's[0m[92m mean[0m model is 0.019399287092347818
The root-mean-square error of [94mrus_stackoverflow's[0m[92m mean[0m model is 0.017635677029879747
The root-mean-square error of [94mstackoverflow's[0m[92m mean[0m model is 0.014056075967462648
The root-mean-square error of [94msuperuser's[0m[92m mean[0m model is 0.010293770490763297


In [14]:
# improvement using rmse
pd.DataFrame.from_dict(RESULTS).T

Unnamed: 0,silly_mean,silly_median
english,"[0.010981530956766525, 3.961, 1.0]","[0.01143449130368263, 0, 2.0]"
math,"[0.019399287092347818, 6.116, 1.0]","[0.02066314611782247, 0, 1.0]"
rus_stackoverflow,"[0.017635677029879747, 6.797, 1.0]","[0.01892184255451773, 0, 1.0]"
stackoverflow,"[0.014056075967462648, 1.169, 1.0]","[0.014222320246117, 0, 0.0]"
superuser,"[0.010293770490763297, 3.239, 1.0]","[0.010638377607808124, 0, 1.0]"


In [15]:
'''
Interesting that some models have a vast improvement just using a constant mean prediction compare to median.
'''

'\nInteresting that some models have a vast improvement just using a constant mean prediction compare to median.\n'

# Viewcount Model

In [16]:
from pyspark.ml.pipeline import Pipeline

from pyspark.ml.feature import VectorAssembler

## because you can't just pass a column without vectorising it
to_vectors = VectorAssembler(inputCols=["viewcount"], outputCol="features")

processing_pipeline = Pipeline(stages=[to_vectors])

In [17]:
%%time

## linear regression on just viewcount
from pyspark.ml.regression import LinearRegression
    
lr = LinearRegression(#maxIter=100, # this doesn't change anything
                      #regParam=0.3, # using regularisation parameter here useless since there is one feature
                      #elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol="y_ravi",
                      predictionCol="viewcount_pred")

## make final pipeline
final_pipeline = Pipeline(stages=[to_vectors, lr])

CPU times: user 3.18 ms, sys: 1.99 ms, total: 5.16 ms
Wall time: 50.1 ms


In [18]:
%%time

## rmse evaluator for the viewcount regression's prediction column
evaluator = RegressionEvaluator(predictionCol="viewcount_pred", labelCol="y_ravi", metricName="rmse")

for i in data_array:

    ## start the clock for this dataset
    started_at = time.time()

    ## fit the pipeline and score it on the same dataset it was fitted on
    fitted_pipeline = final_pipeline.fit(datasets[i])
    scored = fitted_pipeline.transform(datasets[i])
    rmse = evaluator.evaluate(scored)

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m viewcount\033[0m model is {rmse}")

    ## percentage improvement over the median baseline (positive = lower RMSE)
    ratio_to_baseline = rmse/baselines[i]
    imprvt = round((ratio_to_baseline - 1)*-100, 3)

    ## elapsed seconds, rounded to a whole number
    elapsed = round(time.time() - started_at, 0)

    ## add [rmse, improvement %, seconds] to this dataset's RESULTS entry
    RESULTS[i]['viewcount'] = [rmse, imprvt, elapsed]

The root-mean-square error of [94menglish's[0m[92m viewcount[0m model is 0.010953713840166027
The root-mean-square error of [94mmath's[0m[92m viewcount[0m model is 0.019351893913727568
The root-mean-square error of [94mrus_stackoverflow's[0m[92m viewcount[0m model is 0.01759095636614909
The root-mean-square error of [94mstackoverflow's[0m[92m viewcount[0m model is 0.014054933648709552
The root-mean-square error of [94msuperuser's[0m[92m viewcount[0m model is 0.010283246941089102
CPU times: user 233 ms, sys: 67.3 ms, total: 300 ms
Wall time: 49.6 s


In [19]:
# improvement using rmse
pd.DataFrame.from_dict(RESULTS).T

Unnamed: 0,silly_mean,silly_median,viewcount
english,"[0.010981530956766525, 3.961, 1.0]","[0.01143449130368263, 0, 2.0]","[0.010953713840166027, 4.205, 7.0]"
math,"[0.019399287092347818, 6.116, 1.0]","[0.02066314611782247, 0, 1.0]","[0.019351893913727568, 6.346, 17.0]"
rus_stackoverflow,"[0.017635677029879747, 6.797, 1.0]","[0.01892184255451773, 0, 1.0]","[0.01759095636614909, 7.034, 9.0]"
stackoverflow,"[0.014056075967462648, 1.169, 1.0]","[0.014222320246117, 0, 0.0]","[0.014054933648709552, 1.177, 9.0]"
superuser,"[0.010293770490763297, 3.239, 1.0]","[0.010638377607808124, 0, 1.0]","[0.010283246941089102, 3.338, 8.0]"


In [22]:
show_spark_df(datasets['stackoverflow'])

Unnamed: 0,title,viewcount,score,clean_date,clean_body,y_ravi,median_pred,mean_pred
0,Rails: updating a record using ujs,624,0,,I have a simple app that is using Ajax with a...,0.0,0.0,0.002168
1,trying to access a deep value with bracket not...,114,0,,I'm trying to record the position of an objec...,0.0,0.0,0.002168
2,Is there a collection for storing discrete int...,108,6,,"I need to store discrete ranges in a set, joi...",0.055556,0.0,0.002168
3,SignalR - Detect disconnected client in no time,1312,1,,I am working on an application that needs to ...,0.000762,0.0,0.002168
4,Conditional Statement logged in user,36,0,,I want to create a conditional statement ...,0.0,0.0,0.002168


# Begin Feature Engineering

In [None]:
## free unreachable Python objects before the feature-engineering stage
import gc

## gc.collect() runs a full collection and returns the number of unreachable objects found
collected = gc.collect()
report = "Garbage collector: collected %d objects." % collected
print(report)

In [None]:
target = "y_ravi"
indep_text_variables = ["title", "clean_body"]
#.drop('age').collect()

In [None]:
## import elements from natural language toolkit
import nltk
#nltk.download('all') # uncomment after first run as admin check
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
lmtzr = WordNetLemmatizer()

def get_tokens(line):
    """
    Turn a raw text string into a list of cleaned, lemmatised word tokens.

    Each token from ``word_tokenize`` is lower-cased and stripped of punctuation,
    then kept only if it is purely alphabetic and not a stopword; survivors are
    lemmatised and single-letter results are dropped.

    Parameters
    ----------
    line : str, the text to parse

    Returns
    -------
    list of str, the cleaned tokens in their original order
    """
    cleaned = []
    for raw_token in word_tokenize(line):
        # lower-case first, then remove punctuation characters
        candidate = raw_token.lower().translate(table)
        # skip anything that is not alphabetic, and skip stopwords
        if not candidate.isalpha() or candidate in stop_words:
            continue
        # lemmatise, see https://en.wikipedia.org/wiki/Lemmatisation
        # (open question from the author: lemmatise or stem?)
        lemma = lmtzr.lemmatize(candidate)
        # drop single letters
        if len(lemma) != 1:
            cleaned.append(lemma)
    return cleaned

In [None]:
import nltk

from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

## custom transformer for nltk tokenisation

class NLTKWordPunctTokenizer(Transformer, HasInputCol, HasOutputCol):
    """
    Custom Transformer that tokenises a string column with NLTK's
    ``wordpunct_tokenize`` and drops stopwords.

    Params
    ------
    inputCol : name of the string column to tokenise
    outputCol : name of the array-of-strings column to add
    stopwords : collection of lower-case tokens to discard (default: empty set)
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(NLTKWordPunctTokenizer, self).__init__()
        # declare the extra param and its default before any value is assigned
        self.stopwords = Param(self, "stopwords", "tokens (lower-case) to drop from the output")
        self._setDefault(stopwords=set())
        # keyword_only leaves the keyword arguments passed by the caller in _input_kwargs
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        """Set any of inputCol, outputCol and stopwords; returns self."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setStopwords(self, value):
        """Replace the stopword collection; returns self for chaining."""
        self._paramMap[self.stopwords] = value
        return self

    def getStopwords(self):
        """Return the stopword collection, or the default if none was set."""
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        """
        Add ``outputCol`` holding the tokens of ``inputCol`` minus stopwords.

        Null input cells produce an empty token list instead of failing the job.
        """
        # an explicit stopwords=None is stored as None; treat it as "no stopwords"
        stopwords = self.getStopwords() or set()

        def f(s):
            # null cells reach the UDF as None, which wordpunct_tokenize cannot handle
            if s is None:
                return []
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            # compare in lower case but keep the token's original casing
            return [t for t in tokens if t.lower() not in stopwords]

        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [None]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd

## custom transformer to convert sparse pyspark.ml vectors into pyspark.mllib vectors
class VectorMLliber(Transformer):
    """
    A custom Transformer which converts a column of sparse pyspark.ml vectors into
    pyspark.mllib sparse vectors.

    Parameters
    ----------
    inputCol : str, optional
        Name of the vector column to convert. When omitted, the first column of
        each row is used.

    Notes
    -----
    ``_transform`` returns an RDD of mllib vectors, not a DataFrame, so the
    result cannot be consumed by a later DataFrame-based stage.
    """

    def __init__(self, inputCol=None):
        super(VectorMLliber, self).__init__()
        # remember which column to convert; None falls back to the first column
        self._input_col = inputCol

    def _transform(self, df: DataFrame):
        # local import: Vectors is not imported by any cell visible in this notebook
        from pyspark.mllib.linalg import Vectors

        # resolve the row key here so the mapped function does not capture `self`
        key = 0 if self._input_col is None else self._input_col

        def as_mllib_vector(v):
            # rebuild the vector from its size, indices and values (sparse input only)
            return Vectors.sparse(v.size, v.indices, v.values)

        return df.rdd.map(lambda r: as_mllib_vector(r[key]))

In [None]:
'''def as_mllib_vector(v):
    return Vectors.sparse(v.size, v.indices, v.values)

features = {}
feature_vec_list = {}
for i in data_array:
    features[i] = word_feat_list[i].select("features")
    feature_vec_list[i] = features[i].rdd.map(lambda r: as_mllib_vector(r[0]))
    feature_vec_list[i].cache()
'''

In [None]:
## English stopwords shared by both tokenizers (built once rather than per tokenizer)
english_stopwords = set(nltk.corpus.stopwords.words('english'))

## tokenise the question body and title into word lists without stopwords
nltk_tokenizer_body = NLTKWordPunctTokenizer(
    inputCol="clean_body", outputCol="body_words",
    stopwords=english_stopwords)

nltk_tokenizer_title = NLTKWordPunctTokenizer(
    inputCol="title", outputCol="title_words",
    stopwords=english_stopwords)

from pyspark.ml.feature import CountVectorizer, VectorAssembler

## bag-of-words counts; minDF=2 drops terms appearing in fewer than two documents
cnt_vectrizr_body = CountVectorizer(inputCol="body_words", outputCol="body_features", minDF=2)

cnt_vectrizr_title = CountVectorizer(inputCol="title_words", outputCol="title_features", minDF=2)

## ml -> mllib vector converters (defined here but not part of processing_pipeline below)
VectorMLliber_body = VectorMLliber(inputCol="body_features")

## the title converter must read the title count vectors ("title_features");
## no column named "body_title" is ever produced
VectorMLliber_title = VectorMLliber(inputCol="title_features")



processing_pipeline = Pipeline(stages=[
    nltk_tokenizer_body,
    nltk_tokenizer_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title
])

In [None]:
%%time

## check that pipeline is working (WHICH IT IS NOT)

data_processed = processing_pipeline.fit(datasets['english']).transform(datasets['english'])

show_spark_df(data_processed)

#data_processed.head(2)[0].features.values

In [None]:
## merge the body and title count vectors into the single "features" column
processing_ensembler = VectorAssembler(inputCols=["body_features", "title_features"],
                                       outputCol="features")

## Build the pipeline from its individual stages instead of wrapping the previous
## `processing_pipeline` in itself: the self-referential form nests one more
## assembler each time this cell is re-run, and that extra assembler is expected to
## fail because the "features" output column already exists.
processing_pipeline = Pipeline(stages=[
    nltk_tokenizer_body,
    nltk_tokenizer_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title,
    processing_ensembler
])

In [None]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=100,
                      regParam=0.3,
                      elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol="y_ravi",
                      predictionCol="prediction")

# fit linear regression pipeline
pipeline = Pipeline(stages=[processing_pipeline, lr])
trained_pipeline = pipeline.fit(datasets['english'])
trained_pipeline

In [None]:
show_spark_df(trained_pipeline.transform(datasets['english']))

In [None]:
## Persist the text columns together with their predictions. Overwrite any output
## left by a previous run: the default save mode raises when the path already
## exists, which would make this cell fail on every re-run.
(trained_pipeline
 .transform(datasets['english'])
 .select(
    indep_text_variables + ["prediction"]
 )
 .write
 .mode("overwrite")
 .parquet("linreg_prediction.parquet")
)

In [None]:
linreg_predictions = spark.read.parquet("linreg_prediction.parquet")

In [None]:
linreg_predictions.toPandas().head()

In [None]:
linreg_predictions.select("prediction").describe().toPandas()

# Save pipelines

In [None]:
## serialise a pipeline object to 'pipeline.joblib' with joblib, then read it back
## NOTE(review): `estimator_pipeline` is not defined in any cell visible in this
## notebook (the fitted Spark pipeline above is named `trained_pipeline`) — confirm
## which object is meant.
## TODO confirm: check that joblib can actually pickle the intended pipeline object.
from joblib import dump, load
dump(estimator_pipeline, 'pipeline.joblib') 

reloaded = load("pipeline.joblib")

Now we can predict directly!

reloaded.predict(X)[:10]

# Convert notebook to python file

In [None]:
!jupyter nbconvert --to script 0-master-notebook-pipelines.ipynb