# GOALS

* Decide on a viewcount threshold below which low-view questions are dropped?
* Get feature columns working
* Build an LDA model

In [1]:
## smoke test: run a console command and show the raw bytes it wrote to stdout
import subprocess

cmd = ['echo', '"Welcome to my PySpark analysis of some StackExchange Data"']
# run() waits for the child process to finish; .stdout holds the captured bytes
completed = subprocess.run(cmd, stdout=subprocess.PIPE)
output = completed.stdout
print(output)

b'"Welcome to my PySpark analysis of some StackExchange Data"\n'


In [2]:
import time
import numpy as np
import pandas as pd
from datetime import datetime

# Load PySpark and Data Structures

In [3]:
%run -i '1-load-pyspark-and-structs.py'

The Spark UI is available at: http://192.168.0.26:4040/ and the defaultParallelism is 4
The data_array is: ['english', 'math', 'rus_stackoverflow', 'stackoverflow', 'superuser']


# Load Data

In [4]:
%%time
print(datetime.now().time())
#%run -i '2-load-datasets.py'

12:25:20.264624
CPU times: user 280 µs, sys: 91 µs, total: 371 µs
Wall time: 317 µs


In [5]:
def show_spark_df(df, n=5):
    '''
    Render the first rows of a Spark DataFrame as a pandas table.

    df: object exposing .head(n) (list of rows) and .columns (column names)
    n: number of leading rows to show (default 5)

    Returns nothing; the table is shown through the notebook's display().
    '''
    head_rows = df.head(n)
    preview = pd.DataFrame(head_rows, columns=df.columns)
    display(preview)

# Clean Data

In [6]:
%%time
print(datetime.now().time())
#%run -i '3-clean-datasets.py'

12:25:20.295044
CPU times: user 328 µs, sys: 92 µs, total: 420 µs
Wall time: 359 µs


# EDA (optional)

In [7]:
%%time
print(datetime.now().time())
#%run -i '4-eda.py'

#NB TO DO: Find threshold to delete low views to make sure users that can vote have seen the question

'''
vc_thresh_data = {}

## finding means of viewcounts across fora

for i in data_array:
    vc_thresh_data[i] = datasets[i].select("viewcount").rdd.flatMap(lambda x: x).mean()

vc_thresh_data'''

12:25:20.304171
CPU times: user 222 µs, sys: 72 µs, total: 294 µs
Wall time: 252 µs


# Define Ravi Target Variable and Export Clean Data

In [8]:
%%time
print(f'\033[94m{datetime.now().time()}\033[0m')
#%run -i '5-define-target-export-data.py'

[94m12:25:20.316421[0m
CPU times: user 369 µs, sys: 185 µs, total: 554 µs
Wall time: 456 µs


# Load Clean Data

In [9]:
%%time

## load clean data: one saved dataset per forum, replacing the entry in `datasets`
for i in data_array:
    # no format is passed to load(), so the Spark session's default data source is used
    clean_path = f'clean-data/{i}.parquet'
    datasets[i] = spark.read.load(clean_path)

CPU times: user 3.34 ms, sys: 2.1 ms, total: 5.43 ms
Wall time: 4.3 s


# Train/Test Split

In [10]:
%%time

## sorting entries according to date column: MIGHT NOT NEED TO DO THIS

'''for i in data_array:
        datasets[i] = datasets[i].sort(col('clean_date')'''
        
## create train and test dictionaries
train = {}
test = {}

from pyspark.sql.functions import lit

## date used to split data: rows strictly before it go to train, all others to test
SPLIT_DATE = '2018-01-01'

## assign train and test sets, based on after certain date
for i in data_array:
    clean_date = datasets[i]['clean_date']
    train[i] = datasets[i].filter(clean_date < lit(SPLIT_DATE))
    test[i] = datasets[i].filter(clean_date >= lit(SPLIT_DATE))

    ## report the test/train size ratio as a sanity check on the split
    numer = test[i].count()
    denom = train[i].count()
    # an empty train split would otherwise abort the whole loop with ZeroDivisionError
    frac = numer / denom if denom else float('nan')
    print(f'{i}: {frac}, from {numer} and {denom}')

english: 0.20050307879670837, from 17616 and 87859
math: 0.22927435077324415, from 198276 and 864798
rus_stackoverflow: 0.4540735776836319, from 79956 and 176086
stackoverflow: 0.20690967926144474, from 60872 and 294196
superuser: 0.12467468017854172, from 44216 and 354651
CPU times: user 27 ms, sys: 7.45 ms, total: 34.4 ms
Wall time: 14 s


In [11]:
'''
interesting to see how skewed rus_stackoverflow posts are to more posts in recent years
'''

'\ninteresting to see how skewed rus_stackoverflow posts are to more posts in recent years\n'

# Create Results Dictionary

In [20]:
RESULTS = {}

# Silly Mean and Median Models

In [21]:
from pyspark.sql.functions import array, avg, lit, struct

## create mean dictionaries
y_ravi_means = {}

## calculate the mean of each forum, using ONLY train set
for i in data_array:
    # aggregate inside Spark SQL instead of passing every row through a Python RDD;
    # agg() yields a single row whose first field is the average
    # (SQL avg skips NULL targets and gives None when there are no rows)
    y_ravi_means[i] = train[i].agg(avg('y_ravi')).first()[0]

In [22]:
## import rmse evaluator
from pyspark.ml.evaluation import RegressionEvaluator

## baseline RMSE per forum, filled in by the loop below
baselines = {}

## modelling
for i in data_array:

    ## start the clock for this forum
    t0 = time.time()

    ## the "model" is a constant: every test row is predicted as the train-set mean
    mean_column = lit(y_ravi_means[i])
    test[i] = test[i].withColumn('mean_pred', mean_column)

    ## score the constant prediction against the true target, ONLY on test set
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="y_ravi",
        predictionCol="mean_pred",
    )
    test_rmse = evaluator.evaluate(test[i])
    baselines[i] = round(test_rmse, 5)

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m mean\033[0m model is {baselines[i]}")

    ## elapsed wall time, rounded to whole seconds
    timet = round(time.time() - t0, 0)

    ## (re)create this forum's RESULTS entry: baseline rmse, 0, elapsed seconds
    RESULTS[i] = {'silly_mean': [baselines[i], 0, timet]}

The root-mean-square error of [94menglish's[0m[92m mean[0m model is 0.02099
The root-mean-square error of [94mmath's[0m[92m mean[0m model is 0.02726
The root-mean-square error of [94mrus_stackoverflow's[0m[92m mean[0m model is 0.02396
The root-mean-square error of [94mstackoverflow's[0m[92m mean[0m model is 0.02202
The root-mean-square error of [94msuperuser's[0m[92m mean[0m model is 0.02028


In [23]:
def show_save_results(results, filename='final-results.csv'):
    '''
    Display the modelling results as a table and write the same table to CSV.

    results: dict handed to pandas.DataFrame.from_dict; the resulting frame
        is transposed before it is shown and saved
    filename: path of the CSV file to write (default 'final-results.csv')
    '''
    results_table = pd.DataFrame.from_dict(results).T
    display(results_table)
    results_table.to_csv(filename)

In [24]:
show_save_results(RESULTS)

Unnamed: 0,silly_mean
english,"[0.02099, 0, 0.0]"
math,"[0.02726, 0, 1.0]"
rus_stackoverflow,"[0.02396, 0, 1.0]"
stackoverflow,"[0.02202, 0, 1.0]"
superuser,"[0.02028, 0, 1.0]"


# Viewcount Model

In [27]:
%%time
#3min 56s

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## the regressor reads one vector column, so wrap the single viewcount column in a vector
to_vectors = VectorAssembler(inputCols=["viewcount"], outputCol="features")

## create processing pipeline
processing_pipeline = Pipeline(stages=[to_vectors])

## linear regression on just viewcount
lr = LinearRegression(#maxIter=100, # this doesn't change anything
                      #regParam=0.3, # using regularisation parameter here useless since there is one feature
                      #elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol="y_ravi",
                      predictionCol="viewcount_pred")

## make final pipeline: vectorise, then regress
final_pipeline = Pipeline(stages=[to_vectors, lr])

## set up grid for parameter tuning:
#.addGrid(lr.regParam, [1e-3, 1.])
#.addGrid(lr.elasticNetParam, [1e-3, 1.])
## nothing is added to the grid yet, so cross-validation has no alternatives to compare
paramGrid = ParamGridBuilder().build()

## set up rmse evaluator on the viewcount prediction column
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="y_ravi",
    predictionCol="viewcount_pred",
)

## set up cross validation for parameter tuning
crossval = CrossValidator(
    estimator=final_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

## modelling: fit on each forum's train split, score on its test split
for i in data_array:

    ## initial variable for timing
    t0 = time.time()

    ## cross-validated fit on train, then RMSE of the fitted model's test predictions
    rmse = round( evaluator.evaluate(crossval.fit(train[i]).transform(test[i])), 6)

    # CODE TO USE LATER
    # cvModel = final_pipeline.fit(datasets[i])
    # cvModel.avgMetrics
    # list(zip(cvModel.avgMetrics, paramGrid))
    # cvModel.bestModel.stages
    # cvModel.bestModel.stages[1].extractParamMap()

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m viewcount\033[0m model is {rmse}")

    ## calculate percentage improvement over the mean baseline (positive = lower rmse);
    ## adding 0.0 turns a negative zero into 0.0 so ties are not reported as "-0.0"
    imprvt = round((1 - rmse/baselines[i])*100, 3) + 0.0

    ## record time taken
    timet = round( time.time() - t0, 0 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i]['viewcount'] = [rmse, imprvt, timet]

## record results
show_save_results(RESULTS, 'test-set-results.csv')

The root-mean-square error of [94menglish's[0m[92m viewcount[0m model is 0.02095
The root-mean-square error of [94mmath's[0m[92m viewcount[0m model is 0.02723
The root-mean-square error of [94mrus_stackoverflow's[0m[92m viewcount[0m model is 0.02391
The root-mean-square error of [94mstackoverflow's[0m[92m viewcount[0m model is 0.02202
The root-mean-square error of [94msuperuser's[0m[92m viewcount[0m model is 0.02027


Unnamed: 0,silly_mean,viewcount
english,"[0.02099, 0, 0.0]","[0.02095, 0.191, 14.0]"
math,"[0.02726, 0, 1.0]","[0.02723, 0.11, 89.0]"
rus_stackoverflow,"[0.02396, 0, 1.0]","[0.02391, 0.209, 28.0]"
stackoverflow,"[0.02202, 0, 1.0]","[0.02202, -0.0, 47.0]"
superuser,"[0.02028, 0, 1.0]","[0.02027, 0.049, 38.0]"


CPU times: user 1.16 s, sys: 318 ms, total: 1.48 s
Wall time: 3min 36s


In [None]:
'''
Interesting that there are different improvements of viewcount over mean-only prediciton
'''

# Begin Feature Engineering

In [None]:
## run a garbage-collection pass and report the count returned by gc.collect()
import gc

collected = gc.collect()
print(f"Garbage collector: collected {collected:d} objects.")

In [None]:
## label column name (the same name is passed as labelCol to the regressors in this notebook)
target = "y_ravi"
## free-text columns; they are also the columns selected when predictions are exported
indep_text_variables = ["title", "clean_body"]
#.drop('age').collect()

In [None]:
## import elements from natural language toolkit
import nltk
#nltk.download('all') # uncomment after first run as admin check
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
lmtzr = WordNetLemmatizer()

def get_tokens(line):
    '''
    Turn one piece of raw text into a list of cleaned, lemmatised words.

    Steps, applied to each token in order: tokenise, lower-case, strip
    punctuation characters, keep purely alphabetic tokens, drop stop words,
    lemmatise, and finally drop anything that is a single character long.

    Relies on the module-level word_tokenize, table, stop_words and lmtzr.
    '''
    cleaned = []
    for token in word_tokenize(line):
        # lower-case, then delete punctuation characters using the translation table
        candidate = token.lower().translate(table)
        # skip tokens that are empty or contain non-letters once stripped
        if not candidate.isalpha():
            continue
        # skip stop words
        if candidate in stop_words:
            continue
        # lemmatise, see https://en.wikipedia.org/wiki/Lemmatisation
        # (open question left by the author: lemmatise or stem?)
        lemma = lmtzr.lemmatize(candidate)
        # single letters are removed only after lemmatisation
        if len(lemma) != 1:
            cleaned.append(lemma)
    return cleaned

In [None]:
import nltk

from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

## custom transformer for nltk tokenisation

class NLTKWordPunctTokenizer(Transformer, HasInputCol, HasOutputCol):
    """
    Transformer that tokenises a string column with NLTK's wordpunct_tokenize
    and drops every token whose lower-cased form is in the stop-word set.

    Params:
        inputCol: name of the string column to tokenise
        outputCol: name of the array<string> column to add
        stopwords: collection of lower-case tokens to drop (default: empty set)
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(NLTKWordPunctTokenizer, self).__init__()
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        # keyword_only stores only the keyword arguments the caller actually passed
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        """Set any of inputCol / outputCol / stopwords; returns self."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setStopwords(self, value):
        """Set the stop-word collection; returns self."""
        # go through _set like setParams does, instead of writing _paramMap directly
        return self._set(stopwords=value)

    def getStopwords(self):
        """Return the stop-word collection, or the default if none was set."""
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        """Return `dataset` with the token-array output column added."""
        # an explicit stopwords=None is stored as None; treat it as "no stop words"
        stopwords = self.getStopwords() or set()

        def f(s):
            # NULL cells reach a Python UDF as None; tokenising None would raise
            if s is None:
                return []
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            return [t for t in tokens if t.lower() not in stopwords]

        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [None]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd

from pyspark.ml.param.shared import HasInputCol

## custom transformer to convert a column of pyspark.ml vectors into pyspark.mllib vectors
class VectorMLliber(Transformer, HasInputCol):
    """
    A custom Transformer which converts a column of pyspark.ml vectors to pyspark.mllib vectors.

    inputCol: name of the vector column to convert. When it is not given, the
        first column of the DataFrame is converted (the previous code indexed
        each row with r[0]).
    """

    def __init__(self, inputCol=None):
        super(VectorMLliber, self).__init__()
        # keep the requested column; previously the argument was silently dropped
        if inputCol is not None:
            self._set(inputCol=inputCol)

    def _transform(self, df: DataFrame) -> DataFrame:
        # imported here so the cell does not depend on an import made elsewhere
        from pyspark.mllib.util import MLUtils

        col_name = self.getInputCol() if self.isSet(self.inputCol) else df.columns[0]
        # returns a DataFrame (as the annotation promises) whose `col_name` column
        # holds mllib vectors; the earlier body returned an RDD built with the
        # undefined helper as_mllib_vector
        return MLUtils.convertVectorColumnsFromML(df, col_name)

In [None]:
'''def as_mllib_vector(v):
    return Vectors.sparse(v.size, v.indices, v.values)

features = {}
feature_vec_list = {}
for i in data_array:
    features[i] = word_feat_list[i].select("features")
    feature_vec_list[i] = features[i].rdd.map(lambda r: as_mllib_vector(r[0]))
    feature_vec_list[i].cache()
'''

In [None]:
## build the English stop-word set once and share it between both tokenizers
english_stopwords = set(nltk.corpus.stopwords.words('english'))

nltk_tokenizer_body = NLTKWordPunctTokenizer(
    inputCol="clean_body", outputCol="body_words",
    stopwords=english_stopwords)

nltk_tokenizer_title = NLTKWordPunctTokenizer(
    inputCol="title", outputCol="title_words",
    stopwords=english_stopwords)

from pyspark.ml.feature import CountVectorizer, VectorAssembler

## bag-of-words counts; minDF=2 keeps only terms that occur in at least two rows
cnt_vectrizr_body = CountVectorizer(inputCol="body_words", outputCol="body_features", minDF=2)

cnt_vectrizr_title = CountVectorizer(inputCol="title_words", outputCol="title_features", minDF=2)

## ml -> mllib vector converters (not part of the pipeline below)
VectorMLliber_body = VectorMLliber(inputCol="body_features")

# the title vectorizer writes "title_features"; "body_title" is not a column produced anywhere
VectorMLliber_title = VectorMLliber(inputCol="title_features")



## tokenise both text columns, then count-vectorise each token column
processing_pipeline = Pipeline(stages=[
    nltk_tokenizer_body,
    nltk_tokenizer_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title
])

In [None]:
%%time

## check that pipeline is working (WHICH IT IS NOT)
## fits processing_pipeline on the english dataset and transforms that same dataset

data_processed = processing_pipeline.fit(datasets['english']).transform(datasets['english'])

## preview the first rows of the transformed data
show_spark_df(data_processed)

#data_processed.head(2)[0].features.values

In [None]:
## merge the body and title count vectors into the single "features" column
processing_ensembler = VectorAssembler(inputCols=["body_features", "title_features"],
                                       outputCol="features")

## rebuild the pipeline from its individual stages rather than wrapping the previous
## `processing_pipeline` in itself: the wrapped form nests one more assembler on every
## re-run of this cell, and the second assembler cannot write "features" again
processing_pipeline = Pipeline(stages=[
    nltk_tokenizer_body,
    nltk_tokenizer_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title,
    processing_ensembler,
])

In [None]:
from pyspark.ml.regression import LinearRegression

## elastic-net linear regression reading the "features" column
lr = LinearRegression(
    featuresCol="features",
    labelCol="y_ravi",
    predictionCol="prediction",
    maxIter=100,
    regParam=0.3,
    elasticNetParam=0.8,
)

# chain feature processing and the regressor, then fit the whole chain
pipeline = Pipeline(stages=[processing_pipeline, lr])
# NOTE(review): this fits on the full english dataset rather than train['english'] — confirm that is intended
trained_pipeline = pipeline.fit(datasets['english'])
trained_pipeline

In [None]:
show_spark_df(trained_pipeline.transform(datasets['english']))

In [None]:
## persist the text columns together with the model's predictions;
## overwrite mode lets the cell be re-run when the output path already exists
(trained_pipeline
 .transform(datasets['english'])
 .select(
    indep_text_variables + ["prediction"]
 )
 .write
 .mode("overwrite")
 .parquet("linreg_prediction.parquet")
)

In [None]:
linreg_predictions = spark.read.parquet("linreg_prediction.parquet")

In [None]:
linreg_predictions.toPandas().head()

In [None]:
linreg_predictions.select("prediction").describe().toPandas()

# Save pipelines

In [None]:
from joblib import dump, load
# NOTE(review): `estimator_pipeline` is not defined anywhere in the visible notebook — confirm
# which object is meant to be saved (the fitted pipeline above is named `trained_pipeline`).
# NOTE(review): presumably a fitted Spark pipeline should be persisted with Spark ML's own
# save/load rather than joblib — verify before relying on this cell.
dump(estimator_pipeline, 'pipeline.joblib') 

# read the object back from the file written above
reloaded = load("pipeline.joblib")

Now we can predict directly!

reloaded.predict(X)[:10]

# Convert notebook to python file

In [None]:
!jupyter nbconvert --to script 0-master-notebook-pipelines.ipynb