# GOALS

* Decide on a viewcount threshold below which questions are dropped (too few views for voting users to have seen them)
* Get feature columns working
* Build an LDA model

In [1]:
## smoke test: capture a shell command's stdout (as bytes) and print it in the notebook
import subprocess

cmd = ['echo', '"Welcome to my PySpark analysis of some StackExchange Data"']
# run() waits for the process to finish and exposes the captured pipe as .stdout
completed = subprocess.run(cmd, stdout=subprocess.PIPE)
output = completed.stdout
print(output)

b'"Welcome to my PySpark analysis of some StackExchange Data"\n'


In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

# Load PySpark

In [3]:
%run -i '1-load-pyspark.py'

The Spark UI is available at: http://192.168.0.26:4040/ and the defaultParallelism is 4


# Load Data

In [4]:
%%time
print(datetime.now().time())
%run -i '2-load-datasets.py'

12:34:38.746175
------------------------
english
------------------------
root
 |-- _Body: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)
 |-- _Score: long (nullable = true)

+--------------------+--------------------+----------+------+
|               _Body|              _Title|_ViewCount|_Score|
+--------------------+--------------------+----------+------+
|<p>How do I know ...|What is the diffe...|     18413|    35|
|<p>When you want ...|Should I use a se...|    106724|    52|
|<blockquote>
  <p...|What does Maugham...|      1131|    11|
+--------------------+--------------------+----------+------+
only showing top 3 rows

------------------------
math
------------------------
root
 |-- Body: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- __index_level_0__: long (nullable = true)

+--------------------+--------------------+--------

In [5]:
def show_spark_df(df, n=5):
    """
    Show the first ``n`` rows of ``df`` as a pandas table.

    The rows returned by ``df.head(n)`` are wrapped in a pandas DataFrame
    labelled with ``df.columns`` and handed to ``display``.
    Nothing is returned.
    """
    preview_rows = df.head(n)
    preview = pd.DataFrame(preview_rows, columns=df.columns)
    display(preview)
    
show_spark_df(datasets['english'])

Unnamed: 0,_Body,_Title,_ViewCount,_Score
0,<p>How do I know when to use <em>lay</em> and ...,"What is the difference between ""lay"" and ""lie""?",18413,35
1,<p>When you want to connect two closely relate...,Should I use a semicolon or a dash to connect ...,106724,52
2,<blockquote>\n <p><strong>Possible Duplicate:...,"What does Maugham mean by ""his spaghetti were""?",1131,11
3,<p>How do you say it correctly?</p>\n\n<ul>\n<...,"""Adult and children stories"" or ""Adults and ch...",959,2
4,"<p>""Proven"" and ""proved"" both seem to mean the...","What is the difference between ""proven"" and ""p...",52711,50


# Clean Data

In [6]:
%%time
print(datetime.now().time())
%run -i '3-clean-datasets.py'

12:34:49.519166

[1m checking columns are the right types and names [0m

----- english -----
root
 |-- body: string (nullable = true)
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- math -----
root
 |-- body: string (nullable = true)
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- rus_stackoverflow -----
root
 |-- body: string (nullable = true)
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- stackoverflow -----
root
 |-- body: string (nullable = true)
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- superuser -----
root
 |-- 

# EDA (optional)

In [7]:
%%time
print(datetime.now().time())
#%run -i '4-eda.py'

#NB TO DO: Find threshold to delete low views to make sure users that can vote have seen the question

## mean viewcount per forum, as a first input for choosing the low-view cut-off
vc_thresh_data = {}

for i in data_array:
    # Aggregate on the DataFrame itself instead of .rdd.flatMap(...).mean():
    # the average is the same, but rows no longer pass one by one through a
    # Python lambda, and null viewcounts are ignored by avg rather than raising.
    vc_thresh_data[i] = datasets[i].agg({"viewcount": "avg"}).first()[0]

vc_thresh_data

12:35:46.018074
CPU times: user 45.8 ms, sys: 14 ms, total: 59.8 ms
Wall time: 17.8 s


# Define Ravi Target Variable

In [8]:
%%time
print(datetime.now().time())
%run -i '5-define-target.py'

12:36:03.794926

The average value of [1menglish[0m y_ravi is 0.0037778


The average value of [1mmath[0m y_ravi is 0.0121854


The average value of [1mrus_stackoverflow[0m y_ravi is 0.006857


The average value of [1mstackoverflow[0m y_ravi is 0.002148


The average value of [1msuperuser[0m y_ravi is 0.0027184

CPU times: user 189 ms, sys: 56.7 ms, total: 246 ms
Wall time: 43.7 s


In [9]:
best_worst_qs['superuser']

Unnamed: 0,body,title,viewcount,score,clean_body,y_ravi
0,<p>I'm trying to install flux on my Lubuntu 15...,Fail to install python-pexpect and python-gobj...,4,2,I'm trying to install flux on my Lubuntu 15.1...,0.5
0,<p>I am trying to connect mysql client to gcp ...,Error while Connecting mysql client to GCP Clo...,7,-3,I am trying to connect mysql client to gcp cl...,-0.428571
1,<p>is such app available on andriod that get w...,Win10 Alerts/Notification/Errors Pushed notifi...,7,-3,is such app available on andriod that get win...,-0.428571


# Create Results Dictionary

In [25]:
RESULTS = {}

# Silly Mean and Median Models

In [26]:
from pyspark.sql.functions import array, lit, struct

## per-forum constants used as predictions by the "silly" baseline models
y_ravi_medians = {}
y_ravi_means = {}

for i in data_array:
    # NOTE(review): relativeError=0.25 allows the returned "median" to sit
    # anywhere between roughly the 25th and 75th percentile; tighten it
    # (e.g. 0.01) if the median baseline needs to be accurate.
    y_ravi_medians[i] = datasets[i].approxQuantile('y_ravi', [0.5], 0.25)[0]

    # DataFrame-side average instead of .rdd.flatMap(...).mean(): same value,
    # without passing every row through a Python lambda, and nulls are ignored.
    y_ravi_means[i] = datasets[i].agg({'y_ravi': 'avg'}).first()[0]

In [34]:
## import rmse evaluator
from pyspark.ml.evaluation import RegressionEvaluator

## RMSE of the constant-median prediction, kept per forum as the reference score
baselines = {}

## one evaluator serves every forum: its label/prediction columns never change
evaluator = RegressionEvaluator(metricName="rmse", labelCol="y_ravi", predictionCol="median_pred")

for forum in data_array:

    ## "train" the silly median model: every row predicts the forum's median
    median_col = lit(y_ravi_medians[forum])
    datasets[forum] = datasets[forum].withColumn('median_pred', median_col)

    ## score the constant prediction
    forum_rmse = evaluator.evaluate(datasets[forum])
    baselines[forum] = forum_rmse

    print("The root-mean-square error of " + "\033[1m"+forum+"\033[0m"+ "'s median model is " + str(forum_rmse))

    ## the median model is the baseline itself, so its improvement is 0
    RESULTS[forum] = {'silly_median': [forum_rmse, 0]}

The root-mean-square error of [1menglish[0m's median model is 0.011407298175493769
The root-mean-square error of [1mmath[0m's median model is 0.020663146117843787
The root-mean-square error of [1mrus_stackoverflow[0m's median model is 0.018388275325985484
The root-mean-square error of [1mstackoverflow[0m's median model is 0.01428160958495254
The root-mean-square error of [1msuperuser[0m's median model is 0.010638377607797608


In [35]:
for i in data_array:

    ## train silly mean model: every row predicts the forum's mean y_ravi
    datasets[i] = datasets[i].withColumn('mean_pred', lit(y_ravi_means[i]))

    ## evaluate silly mean model
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="y_ravi", predictionCol="mean_pred")
    rmse = evaluator.evaluate(datasets[i])

    print("The root-mean-square error of " + "\033[1m"+i+"\033[0m"+ "'s mean model is " + str(rmse))

    ## percentage improvement over the median baseline (positive = lower RMSE)
    imprvt = (rmse/baselines[i] - 1)*-100

    ## add to the forum's existing entry in RESULTS; assigning a fresh dict here
    ## would throw away the 'silly_median' result stored by the previous cell
    RESULTS.setdefault(i, {})['silly_mean'] = [rmse, imprvt]

The root-mean-square error of [1menglish[0m's mean model is 0.010981530956769198
The root-mean-square error of [1mmath[0m's mean model is 0.019399287092401008
The root-mean-square error of [1mrus_stackoverflow[0m's mean model is 0.017635677029884285
The root-mean-square error of [1mstackoverflow[0m's mean model is 0.014119152415741939
The root-mean-square error of [1msuperuser[0m's mean model is 0.010293770490768595


In [14]:
pd.DataFrame.from_dict(RESULTS).T

Unnamed: 0,0,1
silly_median,0.011407,0.0
silly_mean,0.011033,3.285437


# Viewcount Model

In [None]:
from pyspark.ml.pipeline import Pipeline

from pyspark.ml.feature import VectorAssembler

to_vectors = VectorAssembler(inputCols=["viewcount"], outputCol="features")

processing_pipeline = Pipeline(stages=[to_vectors])

## linear regression on just viewcount

from pyspark.ml.regression import LinearRegression

In [None]:
# Linear regression of y_ravi on the "features" column; predictions are written
# to "viewcount_pred".
# NOTE(review): the y_ravi means printed earlier are all below 0.02, so
# regParam=0.3 with elasticNetParam=0.8 looks strong enough to shrink the
# coefficient to zero — confirm these values were chosen deliberately.
lr = LinearRegression(maxIter=100,
                      regParam=0.3,
                      elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol="y_ravi",
                      predictionCol="viewcount_pred")

In [None]:
## make final pipeline



final_pipeline = Pipeline(stages=[processing_pipeline, lr])

In [None]:
trained_pipeline = final_pipeline.fit(datasets['english'])

show_spark_df(trained_pipeline.transform(datasets['english']))

In [None]:
## evaluate the viewcount model on the english forum
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="y_ravi", predictionCol="viewcount_pred")

# Score the pipeline's output: datasets['english'] itself has no
# 'viewcount_pred' column, only the transformed frame does.
viewcount_predictions = trained_pipeline.transform(datasets['english'])
rmse = evaluator.evaluate(viewcount_predictions)

print("Root-mean-square error = " + str(rmse))

# percentage improvement over english's median baseline (positive = lower RMSE);
# the per-forum scores live in the `baselines` dict
imprvt = (rmse/baselines['english'] - 1)*-100

# store under the forum, matching the RESULTS[forum][model] layout of the silly models
RESULTS.setdefault('english', {})['viewcount'] = [rmse, imprvt]

# Create Pipelines for Modeling

In [None]:
## garbage collector to speed up computation
import gc
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

In [None]:
from pyspark.ml.pipeline import Pipeline

In [None]:
target = "y_ravi"
indep_text_variables = ["title", "clean_body"]
#.drop('age').collect()

In [None]:
## import elements from natural language toolkit
import nltk
#nltk.download('all') # uncomment after first run as admin check
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
lmtzr = WordNetLemmatizer()

def get_tokens(line):
    '''
    Turn a line of text into a list of cleaned, lemmatised word tokens.

    Each token from word_tokenize is lower-cased and stripped of punctuation;
    it is dropped if the remainder is not purely alphabetic or is a stopword.
    Surviving tokens are lemmatised (see
    https://en.wikipedia.org/wiki/Lemmatisation) and single-letter results
    are discarded.
    '''
    lemmas = []
    for raw_token in word_tokenize(line):
        # lower-case first, then remove punctuation characters
        cleaned = raw_token.lower().translate(table)
        # keep alphabetic, non-stopword tokens only
        if not cleaned.isalpha() or cleaned in stop_words:
            continue
        # open question from the original author: lemmatise or stem?
        lemma = lmtzr.lemmatize(cleaned)
        # the length check comes after lemmatising, so it applies to the lemma
        if len(lemma) != 1:
            lemmas.append(lemma)
    return lemmas

In [None]:
import nltk

from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

## custom transformer for nltk tokenisation

class NLTKWordPunctTokenizer(Transformer, HasInputCol, HasOutputCol):
    """
    Transformer that tokenises a string column with NLTK's
    ``wordpunct_tokenize`` and drops stopwords.

    Params:
        inputCol:  name of the string column to tokenise.
        outputCol: name of the array-of-strings column that is added.
        stopwords: collection of tokens to discard; each token is lower-cased
                   before the membership test, so entries should be lower case.
                   Defaults to an empty set.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(NLTKWordPunctTokenizer, self).__init__()
        # the stopwords Param is declared on the instance, then given a default
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        # keyword_only records the keyword arguments this call received
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        """Set any of inputCol / outputCol / stopwords; returns self."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setStopwords(self, value):
        """Replace the stopword collection; returns self."""
        self._paramMap[self.stopwords] = value
        return self

    def getStopwords(self):
        """Return the stopword collection, or its default when unset."""
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        """Return ``dataset`` with the token column ``outputCol`` added."""
        # an explicit stopwords=None would otherwise break the `in` test below
        stopwords = self.getStopwords() or set()

        def f(s):
            # null cells arrive as None; emit an empty token list instead of
            # letting wordpunct_tokenize raise and fail the whole job
            if s is None:
                return []
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            return [t for t in tokens if t.lower() not in stopwords]

        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [None]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd

## custom transformer to convert pyspark.ml vector columns into pyspark.mllib vectors
class VectorMLliber(Transformer):
    """
    A custom Transformer which converts pyspark.ml vector columns to pyspark.mllib vectors.

    inputCol: name of the vector column to convert. When None, the column
              selection is left to MLUtils.convertVectorColumnsFromML.

    The previous implementation discarded inputCol, called the undefined names
    as_mllib_vector / Vectors, and returned an RDD although _transform is
    declared to return a DataFrame.
    """

    def __init__(self, inputCol=None):
        super(VectorMLliber, self).__init__()
        # remembered for _transform
        self._input_col = inputCol

    def _transform(self, df: DataFrame) -> DataFrame:
        # local import: nothing else in this cell brings pyspark.mllib into scope
        from pyspark.mllib.util import MLUtils

        cols = [] if self._input_col is None else [self._input_col]
        # returns a DataFrame, so the stage can be followed by other stages
        return MLUtils.convertVectorColumnsFromML(df, *cols)

In [None]:
'''def as_mllib_vector(v):
    return Vectors.sparse(v.size, v.indices, v.values)

features = {}
feature_vec_list = {}
for i in data_array:
    features[i] = word_feat_list[i].select("features")
    feature_vec_list[i] = features[i].rdd.map(lambda r: as_mllib_vector(r[0]))
    feature_vec_list[i].cache()
'''

In [None]:
## both tokenisers use the same English stopword set, so build it once
english_stopwords = set(nltk.corpus.stopwords.words('english'))

nltk_tokenizer_body = NLTKWordPunctTokenizer(
    inputCol="clean_body", outputCol="body_words",
    stopwords=english_stopwords)

nltk_tokenizer_title = NLTKWordPunctTokenizer(
    inputCol="title", outputCol="title_words",
    stopwords=english_stopwords)

from pyspark.ml.feature import CountVectorizer, VectorAssembler

## bag-of-words counts; minDF=2 is passed to both vectorisers
cnt_vectrizr_body = CountVectorizer(inputCol="body_words", outputCol="body_features", minDF=2)

cnt_vectrizr_title = CountVectorizer(inputCol="title_words", outputCol="title_features", minDF=2)

## ml -> mllib converters (defined here but not added to processing_pipeline below)
VectorMLliber_body = VectorMLliber(inputCol="body_features")

## was inputCol="body_title", which is not a column any stage produces;
## the title vectoriser writes to "title_features"
VectorMLliber_title = VectorMLliber(inputCol="title_features")

processing_pipeline = Pipeline(stages=[
    nltk_tokenizer_body,
    nltk_tokenizer_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title
])

In [None]:
%%time

## check that pipeline is working (WHICH IT IS NOT)

data_processed = processing_pipeline.fit(datasets['english']).transform(datasets['english'])

show_spark_df(data_processed)

#data_processed.head(2)[0].features.values

In [None]:
# Assemble the body and title feature columns into a single "features" column.
processing_ensembler = VectorAssembler(inputCols=["body_features", "title_features"], 
                                         outputCol="features")  

# NOTE(review): this rebinds processing_pipeline to a pipeline that wraps its own
# previous value, so running this cell a second time nests the assembler stage
# twice. Re-run the cell that first builds processing_pipeline before re-running
# this one.
processing_pipeline = Pipeline(stages=[processing_pipeline, processing_ensembler])

In [None]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=100,
                      regParam=0.3,
                      elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol="y_ravi",
                      predictionCol="prediction")

# fit linear regression pipeline
pipeline = Pipeline(stages=[processing_pipeline, lr])
trained_pipeline = pipeline.fit(datasets['english'])
trained_pipeline

In [None]:
show_spark_df(trained_pipeline.transform(datasets['english']))

In [None]:
## persist the english text columns together with their predictions;
## mode("overwrite") lets the cell be re-run, whereas the default save mode
## fails once linreg_prediction.parquet already exists
(trained_pipeline
 .transform(datasets['english'])
 .select(
    indep_text_variables + ["prediction"]
 )
 .write
 .mode("overwrite")
 .parquet("linreg_prediction.parquet")
)

In [None]:
linreg_predictions = spark.read.parquet("linreg_prediction.parquet")

In [None]:
linreg_predictions.toPandas().head()

In [None]:
linreg_predictions.select("prediction").describe().toPandas()

# Save pipelines

In [None]:
from joblib import dump, load
# NOTE(review): estimator_pipeline is not defined anywhere in the visible notebook
# (the fitted object above is named trained_pipeline) — confirm which object is meant.
# NOTE(review): joblib serialises Python objects by pickling; verify that this works
# for a Spark ML pipeline at all, otherwise use the pipeline's own write()/save()
# persistence instead.
dump(estimator_pipeline, 'pipeline.joblib') 

reloaded = load("pipeline.joblib")

Now we can predict directly!

reloaded.predict(X)[:10]

# Convert notebook to python file

In [None]:
!jupyter nbconvert --to script 0-master-notebook-pipelines.ipynb