# GOALS

* Decide on viewcount threshold to eliminate views?
* Get feature columns working
* Build an LDA model

In [273]:
## testing printing output from console
import subprocess
cmd = [ 'echo', '"Welcome to my PySpark analysis of some StackExchange Data"' ]
output = subprocess.Popen( cmd, stdout=subprocess.PIPE ).communicate()[0]
print(output)

b'"Welcome to my PySpark analysis of some StackExchange Data"\n'


In [274]:
import time
import numpy as np
import pandas as pd
from datetime import datetime

# Load PySpark and Data Structures

In [275]:
%run -i '1-load-pyspark.py'

## array of dataset names to loop through in analysis
data_array = [
    "english",
    "rus_stackoverflow",
    "superuser",
    "stackoverflow",
    "math"
]

# overwrite to be last one
data_array = [
    "buddhism",
    "economics",
    "fitness",
    "health",
    "interpersonal"
]

datasets = {}   

The Spark UI is available at: http://192.168.0.26:4040/ and the defaultParallelism is 4


# Load Data

In [276]:
%%time
print(datetime.now().time())
#%run -i '2-load-datasets.py'

09:56:59.351105
CPU times: user 179 µs, sys: 125 µs, total: 304 µs
Wall time: 276 µs


In [277]:
def show_spark_df(df, n=5):
    '''
    function to better print spark df entries
    '''
    display(pd.DataFrame(df.head(n), columns=df.columns))

# Clean Data

In [278]:
%%time
print(datetime.now().time())
#%run -i '3-clean-datasets.py'

09:57:02.483494
CPU times: user 348 µs, sys: 117 µs, total: 465 µs
Wall time: 405 µs


# Define Target

In [282]:
%%time
print(datetime.now().time())
#%run -i '4a-define-target.py'

09:58:00.901639
The average value of [94mbuddhism[0m y_ravi is 0.0133488
The average value of [94meconomics[0m y_ravi is 0.0210069
The average value of [94mfitness[0m y_ravi is 0.0093298
The average value of [94mhealth[0m y_ravi is 0.0348693
The average value of [94minterpersonal[0m y_ravi is 0.0057534
CPU times: user 228 ms, sys: 52.3 ms, total: 280 ms
Wall time: 2.88 s


# EDA

In [285]:
%%time
print(datetime.now().time())
%run -i '4b-final-eda.py'

10:00:26.481748
Buddhism:
2014-06-17 19:22:29.860000
2019-03-02 22:25:47.057000

Economics:
2014-11-18 20:59:30.327000
2019-03-03 02:44:17.160000

Fitness:
2011-03-01 19:49:22.470000
2019-03-03 02:49:38.280000

Health:
2015-03-31 19:00:01.793000
2019-03-02 12:35:20.623000

Interpersonal:
2017-06-27 17:23:39.670000
2019-03-02 23:34:48.340000

----- buddhism -----
+-------+-----------------+
|summary|        viewcount|
+-------+-----------------+
|  count|             5588|
|   mean|672.9930207587688|
| stddev|2198.592343349931|
|    min|               10|
|    max|            53367|
+-------+-----------------+

----- economics -----
+-------+------------------+
|summary|         viewcount|
+-------+------------------+
|  count|              7380|
|   mean| 562.9578590785908|
| stddev|2369.9321410432945|
|    min|                 2|
|    max|             68856|
+-------+------------------+

----- fitness -----
+-------+------------------+
|summary|         viewcount|
+-------+-----------

Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "


CPU times: user 10.5 s, sys: 1.24 s, total: 11.8 s
Wall time: 22.3 s


# Export Clean Data with Target

In [8]:
%%time
print(datetime.now().time())
#%run -i '5-export-data.py'

11:43:31.259739
CPU times: user 208 µs, sys: 118 µs, total: 326 µs
Wall time: 340 µs


# Load Cleaned Data

In [280]:
%%time

## load clean data
for i in data_array:
    datasets[i] = (
        spark
        .read
        .load(f'clean-data/{i}.parquet')
)

CPU times: user 2.69 ms, sys: 2.17 ms, total: 4.86 ms
Wall time: 968 ms


# Train/Test Split Manually

In [10]:
%%time
print(datetime.now().time())
%run -i '6-train-test-split.py'

11:43:34.997472
economics: 0.19688614985403827, from 1214 and 6166
buddhism: 0.20017182130584193, from 932 and 4656
fitness: 0.20499851234751562, from 1378 and 6722
health: 0.18846909805634418, from 863 and 4579
interpersonal: 0.2189300411522634, from 532 and 2430
CPU times: user 34 ms, sys: 9.78 ms, total: 43.8 ms
Wall time: 6.66 s


In [11]:
'''
interesting to see how skewed rus_stackoverflow posts are to more posts in recent years
'''

'\ninteresting to see how skewed rus_stackoverflow posts are to more posts in recent years\n'

# Create Results Dictionary

In [12]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

In [13]:
def show_save_results(results, filename='final-results.csv'):
    '''
    function to print and export modelling results
    '''
    display(pd.DataFrame.from_dict(results).T)
    print(pd.DataFrame.from_dict(results).T.to_latex())
    pd.DataFrame.from_dict(results).T.to_csv(filename)

# Silly Mean Model

In [20]:
from pyspark.sql.functions import array, lit, struct

## create mean dictionaries
y_ravi_tr_means = {}

## calculate the mean of each forum, using ONLY training set
for i in data_array:
    y_ravi_tr_means[i] = train[i].select('y_ravi').rdd.flatMap(lambda x: x).mean()

## import rmse evaluator
from pyspark.ml.evaluation import RegressionEvaluator

## create dictionaries for training and testing (baseline) rmse 
base = {}
tr_rmse = {}

## modelling
for i in data_array:

    ## initial variable for timing
    t0 = time.time()
    
    ## train silly mean model by assigning training set mean for training and testing predictions
    train[i] = train[i].withColumn('mean_pred', lit(y_ravi_tr_means[i]))
    test[i] = test[i].withColumn('mean_pred', lit(y_ravi_tr_means[i]))

    ## evaluate silly mean model, on both training and testing set
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="y_ravi", predictionCol="mean_pred")
    tr_rmse[i] = round( evaluator.evaluate(train[i]), 6)
    base[i] = round( evaluator.evaluate(test[i]), 6)

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m mean\033[0m model is {base[i]}")

    ## record time taken
    timet = round( time.time() - t0, 2 )
    
    ## store as dictionary inside RESULTS dictionary, initiating dataset name entries first
    RESULTS[i.title()]['0silly_mean.0tr_rmse'] = tr_rmse[i]
    RESULTS[i.title()]['0silly_mean.1te_rmse'] = base[i]
    RESULTS[i.title()]['0silly_mean.2timet'] = timet
    
## record results
show_save_results(RESULTS)

# Viewcount Model

In [None]:
%%time
print(datetime.now().time())
#3min 56s

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'y_ravi'
numic_variables = ['viewcount']
datet_variables = ['clean_date']

## numerical columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = StandardScaler(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])

## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[numic_pipeline, process_assembler])

########################
##### CHOOSE MODEL #####
########################

## linear regression on just viewcount
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100, # this doesn't change anything
                      #regParam=0.3, # using regularisation parameter here useless since there is one feature
                      #elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol=target,
                      predictionCol="viewcount_pred")

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
#.addGrid(lr.regParam, [1e-3, 1.])
#.addGrid(lr.elasticNetParam, [1e-3, 1.])
paramGrid = ParamGridBuilder() \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="y_ravi", predictionCol="viewcount_pred")

## set up cross validation for parameter tuning
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fitting on train and predicting on train/test
    tr_rmse = round( evaluator.evaluate(crossval.fit(train[i]).transform(train[i])), 6)
    rmse = round( evaluator.evaluate(crossval.fit(train[i]).transform(test[i])), 6)
        
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m viewcount\033[0m model is {rmse}")

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 3 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['1viewcount.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['1viewcount.1rmse'] = rmse
    RESULTS[i.title()]['1viewcount.2imprv'] = impr
    RESULTS[i.title()]['1viewcount.3timet'] = timet
    
## record results
show_save_results(RESULTS)

In [None]:
'''
Interesting that there are different improvements of viewcount over mean-only prediciton
'''

# Begin Feature Engineering

In [None]:
## garbage collector to speed up computation
import gc
collected = gc.collect()
print('Garbage collector: collected %d objects.' % collected)

In [15]:
#######################
###### TOKENISER ######
#######################

import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

## custom transformer for nltk tokenisation
class NLTKWordPunctTokeniser(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(NLTKWordPunctTokeniser, self).__init__()
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setStopwords(self, value):
        self._paramMap[self.stopwords] = value
        return self

    def getStopwords(self):
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        stopwords = self.getStopwords()

        def f(s):
            # get tokens with separate punctuation
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            # sort out russian stopwords
            #if ('я' in tokens):
            #    tokens = [t for t in tokens if t not in nltk.corpus.stopwords.words('russian')]
            # remove english stopwords
            tokens = [t for t in tokens if t not in stopwords]
            # remove single letters
            #tokens = [t for t in tokens if not len(t)==1] # this removes single punctuation as well
            # stemming the words
            tokens = [ps.stem(t) for t in tokens]
            # convert to lower case
            return [t.lower() for t in tokens]
    
        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [26]:
%%time
print(datetime.now().time())
# ENGLISH
# 8min 56s for no CV and no GRIDSEARCH
# 17min 10s for 3-CV and no GRIDSEARCH
# SMALL DATASETS
# 36min 55s for no CV and no GRIDSEARCH
# 12min 35s for 3-CV and no GRIDSEARCH
# 8min 32s for 2-CV and no GRIDSEARCH

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'y_ravi'
textt_variables = ['title', 'clean_body']
datet_variables = ['clean_date']

## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')

## textual columns
# tokenising text cols with custom transformer
nltk_tokeniser_body = NLTKWordPunctTokeniser(
    inputCol='clean_body', outputCol='body_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

nltk_tokeniser_title = NLTKWordPunctTokeniser(
    inputCol='title', outputCol='titl_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

# count occurence of tokens, i.e. create dfm
cnt_vectrizr_body = CountVectorizer(inputCol='body_words', outputCol='body_feats', minDF=2)
cnt_vectrizr_title = CountVectorizer(inputCol='titl_words', outputCol='titl_feats', minDF=2)

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['body_feats', 'titl_feats'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[  #inputCols=['datet_data']
    nltk_tokeniser_body, 
    nltk_tokeniser_title,
    #token_counter_body,
    #token_counter_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title,
    process_assembler
])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(maxIter=100,
                      regParam=1,
                      elasticNetParam=0.001,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='tokens_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1e-3, 1.]) \
    .addGrid(lr.regParam, [1e-3, 1.]) \
    .build()


## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol='y_ravi', predictionCol='tokens_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 6)
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 6)
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m tokens\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 3 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2tokens.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2tokens.1rmse'] = rmse
    RESULTS[i.title()]['2tokens.2imprv'] = impr
    RESULTS[i.title()]['2tokens.3timet'] = timet
    RESULTS[i.title()]['2tokens.4elastic'] = ela_param
    RESULTS[i.title()]['2tokens.5regular'] = reg_param
    
## record results
show_save_results(RESULTS, 'token-full-results-3-cv.csv')

12:38:12.540618
The root-mean-square error of [94meconomics's[0m[92m tokens[0m model is 0.037769
The root-mean-square error of [94mbuddhism's[0m[92m tokens[0m model is 0.01574
The root-mean-square error of [94mfitness's[0m[92m tokens[0m model is 0.020767
The root-mean-square error of [94mhealth's[0m[92m tokens[0m model is 0.040057
The root-mean-square error of [94minterpersonal's[0m[92m tokens[0m model is 0.008395


Unnamed: 0,2tokens.0tr_rmse,2tokens.1rmse,2tokens.2imprv,2tokens.3timet,2tokens.4elastic,2tokens.5regular
Economics,0.026446,0.037769,0.061,606.66,0.001,1.0
Buddhism,0.014281,0.01574,0.057,385.5,1.0,0.001
Fitness,0.012305,0.020767,-0.0,571.22,0.001,1.0
Health,0.03503,0.040057,0.274,374.23,0.001,1.0
Interpersonal,0.004935,0.008395,-0.0,458.33,1.0,1.0


\begin{tabular}{lrrrrrr}
\toprule
{} &  2tokens.0tr\_rmse &  2tokens.1rmse &  2tokens.2imprv &  2tokens.3timet &  2tokens.4elastic &  2tokens.5regular \\
\midrule
Economics     &          0.026446 &       0.037769 &           0.061 &          606.66 &             0.001 &             1.000 \\
Buddhism      &          0.014281 &       0.015740 &           0.057 &          385.50 &             1.000 &             0.001 \\
Fitness       &          0.012305 &       0.020767 &          -0.000 &          571.22 &             0.001 &             1.000 \\
Health        &          0.035030 &       0.040057 &           0.274 &          374.23 &             0.001 &             1.000 \\
Interpersonal &          0.004935 &       0.008395 &          -0.000 &          458.33 &             1.000 &             1.000 \\
\bottomrule
\end{tabular}

CPU times: user 12.7 s, sys: 3.52 s, total: 16.2 s
Wall time: 39min 56s


In [28]:
## check predictions aren't constant
models['interpersonal'].transform(test['interpersonal']).select('tokens_pred').take(10)

[Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045),
 Row(tokens_pred=0.005270118290217045)]

In [None]:
"""why the heck do fitness and interpersonal have constant predictions - it's not the parameters or the size of the data"""

In [None]:
## have a look at CV models params
list(zip(models['health'].avgMetrics, paramGrid))

In [None]:
## extract best parameters
for i in data_array:
    # elasticnet
    ela_key = list(models[i].bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = models[i].bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(models[i].bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = models[i].bestModel.stages[-1].extractParamMap()[reg_key]
    print(i)
    print(f'elastic net: {ela_param}, reg: {reg_param}')

# Trying to get sentence and token count to work

In [None]:
df_ = spark.createDataFrame(datasets['economics'].take(5))

In [None]:
show_spark_df(df_)

In [None]:
word_feat_list = tester4.transform( tester3.fit(tester.transform(df_)).transform(tester.transform(df_)) )

In [None]:
word_feat_list

In [None]:
word_feat_list.select('features').collect()

In [None]:
show_spark_df(word_feat_list)

In [None]:
from pyspark.mllib.linalg import Vectors
def as_mllib_vector(v):
    return Vectors.sparse(v.size, v.indices, v.values)

In [None]:
word_feat_list.select("features").rdd.map(lambda r: r[0][0]).collect()

In [None]:
word_feat_list.select("features").rdd.map(lambda r: r[0]).map(lambda r: (r,1)).reduceByKey(lambda a,b: a+b).collect()

In [None]:
tester = NLTKWordPunctTokeniser(
    inputCol='title', outputCol='title_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

tester2 = NLTKWordPunctTokeniser(
    inputCol='clean_body', outputCol='body_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

In [None]:
tester3 = CountVectorizer(inputCol="title_words", outputCol="features")

In [None]:
tester4 = NLTKCounter(inputCol='features', outputCol='final_features')

In [None]:
########################
###### SENTENISER ######
########################

import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

## custom transformer for nltk tokenisation
class NLTKSenteniser(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(NLTKWordPunctTokenizer, self).__init__()
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setStopwords(self, value):
        self._paramMap[self.stopwords] = value
        return self

    def getStopwords(self):
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        stopwords = self.getStopwords()

        def f(s):
            # get tokens with separate punctuation
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            # sort out russian stopwords
            if ('я' in tokens):
                tokens = [t for t in tokens if t not in nltk.corpus.stopwords.words('russian')]
            # remove english stopwords
            tokens = [t for t in tokens if t not in stopwords]
            # remove single letters
            tokens = [t for t in tokens if not len(t)==1]
            # stemming the words
            tokens = [ps.stem(t) for t in tokens]
            # convert to lower case
            return [t.lower() for t in tokens]
    
        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [None]:
#######################
####### COUNTER #######
#######################

import nltk
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

## custom transformer for nltk tokenisation
class NLTKCounter(Transformer, HasInputCol, HasOutputCol):
    '''
    Takes in cnt_vectrizr to count number of tokens per question
    '''

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(NLTKCounter, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):

        def f(s):
            return 1
    
        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [None]:
a

In [None]:
## import elements from natural language toolkit
import nltk
#nltk.download('all') # uncomment after first run as admin check
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
lmtzr = WordNetLemmatizer()

def get_tokens(line):
    '''
    Function to parse text features
    '''
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuations from each word
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stopwords
    words = [w for w in words if not w in stop_words]
    # lemmatizing the words, see https://en.wikipedia.org/wiki/Lemmatisation
    '''lemmatise or stem???'''
    words = [lmtzr.lemmatize(w) for w in words]
    # remove single letters
    words = [word for word in words if not len(word)==1]
    return (words)

In [None]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd

## custom transformer to spread sparse vectors into individual columns
class VectorMLliber(Transformer):
    """
    A custom Transformer which converts a column of pyspark.ml vectors to multiple pyspark.mllib vectors.
    """

    def __init__(self, inputCol=None):
        super(VectorMLliber, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        
        def f(v):
            return Vectors.sparse(v.size, v.indices, v.values)
        
        df = df.rdd.map(lambda r: as_mllib_vector(r[0]))
        return df

In [None]:
# ???????????
VectorMLliber_body = VectorMLliber(inputCol='body_features')
VectorMLliber_title = VectorMLliber(inputCol='titl_features')

In [None]:
'''def as_mllib_vector(v):
    return Vectors.sparse(v.size, v.indices, v.values)

features = {}
feature_vec_list = {}
for i in data_array:
    features[i] = word_feat_list[i].select("features")
    feature_vec_list[i] = features[i].rdd.map(lambda r: as_mllib_vector(r[0]))
    feature_vec_list[i].cache()
'''

# Save predictions

In [None]:
(trained_pipeline
 .transform(datasets['english'])
 .select(
    indep_text_variables + ["prediction"]
 )
 .write
 .parquet("linreg_prediction.parquet")
)

In [None]:
linreg_predictions = spark.read.parquet("linreg_prediction.parquet")

In [None]:
linreg_predictions.toPandas().head()

In [None]:
linreg_predictions.select("prediction").describe().toPandas()

# Save pipelines

In [None]:
from joblib import dump, load
dump(estimator_pipeline, 'pipeline.joblib') 

reloaded = load("pipeline.joblib")

#Now we can predict directly!

reloaded.predict(X)[:10]

In [None]:
## save models DOESN'T WORK BECAUSE: 'NLTKWordPunctTokenizer' object has no attribute '_to_java'
for i in data_array:
    param_dict[i].save(f'{i}-pipeline') 

# Convert notebook to python file

In [None]:
!jupyter nbconvert --to script 0-master-notebook-pipelines.ipynb