# GOALS

* Finish

In [1]:
## testing printing output from console
import subprocess
cmd = [ 'echo', '"Welcome to my PySpark analysis of some StackExchange Data"' ]
output = subprocess.Popen( cmd, stdout=subprocess.PIPE ).communicate()[0]
print(output)

b'"Welcome to my PySpark analysis of some StackExchange Data"\n'


# Load Libraries

In [2]:
import gc #garbage collection
import time
import numpy as np
import pandas as pd
from datetime import datetime

# Load PySpark

In [3]:
%run -i '1-load-pyspark.py'

The Spark UI, version 2.4.3, is available at: http://192.168.0.26:4040/ and the defaultParallelism is 4


# Load easyFunctions and Transformers

In [4]:
## easy functions
%run -i 'load_parquet_data.py'
%run -i 'show_save_results.py'
%run -i 'show_spark_df.py'

## pipeline transformers
%run -i 'nltkWordPunctTokeniser.py'
%run -i 'nltkSenteniser.py'

# Load Initial or Clean Data

In [5]:
%%time
print(datetime.now().time())
data_array, datasets = load_parquet_data(kind='initial', size='small', printSchema=False)

17:23:49.786908
CPU times: user 5.26 ms, sys: 2.84 ms, total: 8.1 ms
Wall time: 3.55 s


In [6]:
s = 0
for i in data_array:
    s = s + datasets[i].count()
    print(f'{i}: {datasets[i].count()}')
s

buddhism: 5588
economics: 7380
fitness: 8100
health: 5442
interpersonal: 2962


29472

# Clean Data

In [7]:
%%time
print(datetime.now().time())
%run -i '2-clean-datasets.py'

17:23:56.071338

[1m checking columns are the right types and names [0m

----- buddhism -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- economics -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- fitness -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- health -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- i

# Feature Engineering

In [None]:
%%time
print(datetime.now().time())
%run -i '3-feat-engineering.py'

17:23:59.367758


# EDA

In [10]:
%%time
print(datetime.now().time())
#%run -i '5-final-eda.py'

13:59:50.511577
CPU times: user 227 µs, sys: 183 µs, total: 410 µs
Wall time: 280 µs


# Export Clean Data

In [11]:
%%time
print(datetime.now().time())
#%run -i '4-export-data.py'

13:59:58.482341
CPU times: user 216 µs, sys: 55 µs, total: 271 µs
Wall time: 232 µs


In [12]:
for i in data_array:
    s = i.title() + ' & ' + str(round( datasets[i].stat.corr("score", "viewcount"), 2 )) + ' \\\\'
    print(s)

KeyError: 'buddhism'

# Train/Test Splits

In [14]:
## garbage collector to speed up computation
collected = gc.collect()
print(f'Garbage collector: collected {collected} objects.')

Garbage collector: collected 35 objects.


In [15]:
%%time
print(datetime.now().time())
#%run -i '6a-time-train-test-split-80.py'    
#%run -i '6b-rand-train-test-split-80.py'
#%run -i '6c-time-train-test-split-60.py'
%run -i '6d-rand-train-test-split-60.py'

## check standard deviations of variables
for i in data_array:
    s = i.title() + ' & ' + str(round( pd.to_numeric(train[i].describe('score').select('score').toPandas().iloc[2][0]), 2 )) + ' & ' + \
    str(round( pd.to_numeric(test[i].describe('score').select('score').toPandas().iloc[2][0]), 2 )) + ' \\\\'
    print(s)

11:49:36.161006
Buddhism & 3.6 & 3.63 \\
Economics & 3.62 & 3.07 \\
Fitness & 5.38 & 5.12 \\
Health & 4.11 & 4.07 \\
Interpersonal & 24.95 & 22.78 \\
CPU times: user 102 ms, sys: 31.1 ms, total: 133 ms
Wall time: 4.25 s


In [16]:
## garbage collector to speed up computation
collected = gc.collect()
print(f'Garbage collector: collected {collected} objects.')

Garbage collector: collected 275 objects.


In [17]:
'''
interesting to see how skewed rus_stackoverflow posts are to more posts in recent years
'''

'\ninteresting to see how skewed rus_stackoverflow posts are to more posts in recent years\n'

# Create Results Dictionary

In [21]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

In [19]:
def show_save_results(results, filename='final-results.csv'):
    '''
    function to print and export modelling results
    '''
    display(pd.DataFrame.from_dict(results).T)
    print(pd.DataFrame.from_dict(results).T.to_latex())
    pd.DataFrame.from_dict(results).T.to_csv(filename)

# Silly Mean Model

In [20]:
from pyspark.sql.functions import array, lit, struct

## choose target variable
target = 'score'

## create mean dictionaries
y_ravi_tr_means = {}

## calculate the mean of each forum, using ONLY training set
for i in data_array:
    y_ravi_tr_means[i] = train[i].select(target).rdd.flatMap(lambda x: x).mean()

## import rmse evaluator
from pyspark.ml.evaluation import RegressionEvaluator

## create dictionaries for training and testing (baseline) rmse 
base = {}
tr_rmse = {}

## modelling
for i in data_array:

    ## initial variable for timing
    t0 = time.time()
    
    ## train silly mean model by assigning training set mean for training and testing predictions
    train[i] = train[i].withColumn('mean_pred', lit(y_ravi_tr_means[i]))
    test[i] = test[i].withColumn('mean_pred', lit(y_ravi_tr_means[i]))

    ## evaluate silly mean model, on both training and testing set
    evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='mean_pred')
    tr_rmse[i] = round( evaluator.evaluate(train[i]), 2)
    base[i] = round( evaluator.evaluate(test[i]), 2)

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m mean\033[0m model is {base[i]}")

    ## record time taken
    timet = round( time.time() - t0, 2 )
    
    ## store as dictionary inside RESULTS dictionary, initiating dataset name entries first
    RESULTS[i.title()]['0silly_mean.0tr_rmse'] = tr_rmse[i]
    RESULTS[i.title()]['0silly_mean.1rmse'] = base[i]
    RESULTS[i.title()]['0silly_mean.2timet'] = timet
    
## record results
#show_save_results(RESULTS)

The root-mean-square error of [94mbuddhism's[0m[92m mean[0m model is 3.63
The root-mean-square error of [94meconomics's[0m[92m mean[0m model is 3.07
The root-mean-square error of [94mfitness's[0m[92m mean[0m model is 5.12
The root-mean-square error of [94mhealth's[0m[92m mean[0m model is 4.08
The root-mean-square error of [94minterpersonal's[0m[92m mean[0m model is 22.77


# Viewcount Model

In [None]:
%%time
print(datetime.now().time())
#3min 56s

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
numic_variables = ['viewcount']
datet_variables = ['clean_date']

## numerical columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = Q(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])

'''## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')'''

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[numic_pipeline, process_assembler])

########################
##### CHOOSE MODEL #####
########################

## linear regression on just viewcount
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100, # this doesn't change anything
                      #regParam=0.3, # using regularisation parameter here useless since there is one feature
                      #elasticNetParam=0.8,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='viewcount_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
#.addGrid(lr.regParam, [1e-3, 1.])
#.addGrid(lr.elasticNetParam, [1e-3, 1.])
paramGrid = ParamGridBuilder() \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='viewcount_pred')

## set up cross validation for parameter tuning
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    
    ## fitting on train and predicting on train/test
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
        
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m viewcount\033[0m model is {rmse}")

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['1viewcount.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['1viewcount.1rmse'] = rmse
    RESULTS[i.title()]['1viewcount.2imprv'] = impr
    RESULTS[i.title()]['1viewcount.3timet'] = timet
    
## record results
show_save_results(RESULTS)

In [None]:
'''
Interesting that there are different improvements of viewcount over mean-only prediciton
'''

In [None]:
## garbage collector to speed up computation
collected = gc.collect()
print(f'Garbage collector: collected {collected} objects.')

## Count Model

In [None]:
%%time
print(datetime.now().time())
# ENGLISH
# 8min 56s for no CV and no GRIDSEARCH
# 17min 10s for 3-CV and no GRIDSEARCH
# SMALL DATASETS
# 36min 55s for no CV and no GRIDSEARCH
# 12min 35s for 3-CV and no GRIDSEARCH
# 8min 32s for 2-CV and no GRIDSEARCH

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
numic_variables = ['body_word_cnt', 'titl_word_cnt', 'body_char_cnt', 
                   'titl_char_cnt', 'body_sent_cnt', 'titl_sent_cnt']
datet_variables = ['clean_date']

'''## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')'''

## numerical columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = StandardScaler(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[numic_pipeline, process_assembler])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(maxIter=100,
                      regParam=1,
                      elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='counts_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1e-3, 1.]) \
    .addGrid(lr.regParam, [1e-3, 1.]) \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='counts_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m counts\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2counts.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2counts.1rmse'] = rmse
    RESULTS[i.title()]['2counts.2imprv'] = impr
    RESULTS[i.title()]['2counts.3timet'] = timet
    RESULTS[i.title()]['2counts.4elastic'] = ela_param
    RESULTS[i.title()]['2counts.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

## Text Model

In [22]:
%%time
print(datetime.now().time())
# ENGLISH
# 8min 56s for no CV and no GRIDSEARCH
# 17min 10s for 3-CV and no GRIDSEARCH
# SMALL DATASETS
# 36min 55s for no CV and no GRIDSEARCH
# 12min 35s for 3-CV and no GRIDSEARCH
# 8min 32s for 2-CV and no GRIDSEARCH

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
textt_variables = ['title', 'clean_body']
datet_variables = ['clean_date']

## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')

## textual columns
# tokenising text cols with custom transformer
nltk_tokeniser_body = NLTKWordPunctTokeniser(
    inputCol='clean_body', outputCol='body_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

nltk_tokeniser_title = NLTKWordPunctTokeniser(
    inputCol='title', outputCol='titl_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

# count occurence of tokens, i.e. create dfm
cnt_vectrizr_body = CountVectorizer(inputCol='body_words', outputCol='body_raw_feats', minDF=2)
cnt_vectrizr_title = CountVectorizer(inputCol='titl_words', outputCol='titl_raw_feats', minDF=2)

# create IDF dfm
idf_body = IDF(inputCol="body_raw_feats", outputCol="body_feats")
idf_title = IDF(inputCol="titl_raw_feats", outputCol="titl_feats")

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['body_feats', 'titl_feats'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[  #inputCols=['datet_data']
    nltk_tokeniser_body, 
    nltk_tokeniser_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title,
    idf_body,
    idf_title,
    process_assembler
])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(maxIter=100,
                      regParam=1,
                      elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='tokens_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
#    .addGrid(lr.elasticNetParam, [1e-3, 1.])
#    .addGrid(lr.regParam, [1e-3, 1.])
paramGrid = ParamGridBuilder() \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='tokens_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m tokens\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2tokens.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2tokens.1rmse'] = rmse
    RESULTS[i.title()]['2tokens.2imprv'] = impr
    RESULTS[i.title()]['2tokens.3timet'] = timet
    RESULTS[i.title()]['2tokens.4elastic'] = ela_param
    RESULTS[i.title()]['2tokens.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

11:50:02.961988
The root-mean-square error of [94mbuddhism's[0m[92m tokens[0m model is 3.63
The root-mean-square error of [94meconomics's[0m[92m tokens[0m model is 3.07
The root-mean-square error of [94mfitness's[0m[92m tokens[0m model is 5.12
The root-mean-square error of [94mhealth's[0m[92m tokens[0m model is 4.08
The root-mean-square error of [94minterpersonal's[0m[92m tokens[0m model is 23.15


Unnamed: 0,2tokens.0tr_rmse,2tokens.1rmse,2tokens.2imprv,2tokens.3timet,2tokens.4elastic,2tokens.5regular
Buddhism,3.6,3.63,-0.0,118.92,1.0,1.0
Economics,3.61,3.07,-0.0,158.11,1.0,1.0
Fitness,5.38,5.12,-0.0,169.01,1.0,1.0
Health,4.11,4.08,-0.0,87.71,1.0,1.0
Interpersonal,18.3,23.15,-1.67,128.64,1.0,1.0


\begin{tabular}{lrrrrrr}
\toprule
{} &  2tokens.0tr\_rmse &  2tokens.1rmse &  2tokens.2imprv &  2tokens.3timet &  2tokens.4elastic &  2tokens.5regular \\
\midrule
Buddhism      &              3.60 &           3.63 &           -0.00 &          118.92 &               1.0 &               1.0 \\
Economics     &              3.61 &           3.07 &           -0.00 &          158.11 &               1.0 &               1.0 \\
Fitness       &              5.38 &           5.12 &           -0.00 &          169.01 &               1.0 &               1.0 \\
Health        &              4.11 &           4.08 &           -0.00 &           87.71 &               1.0 &               1.0 \\
Interpersonal &             18.30 &          23.15 &           -1.67 &          128.64 &               1.0 &               1.0 \\
\bottomrule
\end{tabular}

CPU times: user 3.17 s, sys: 849 ms, total: 4.02 s
Wall time: 11min 2s


In [None]:
## check predictions aren't constant
models['health'].transform(test['health']).select('tokens_pred').take(10)

In [None]:
"""why the heck does everything besides interpersonal have constant predictions - it's not the parameters or the size of the data"""
"""it's the size of the datasets"""

In [None]:
## have a look at CV models params
list(zip(models['health'].avgMetrics, paramGrid))

In [None]:
## extract best parameters
for i in data_array:
    # elasticnet
    ela_key = list(models[i].bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = models[i].bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(models[i].bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = models[i].bestModel.stages[-1].extractParamMap()[reg_key]
    print(i)
    print(f'elastic net: {ela_param}, reg: {reg_param}')

# LDA Model

In [22]:
%%time
print(datetime.now().time())
# ENGLISH
# 8min 56s for no CV and no GRIDSEARCH
# 17min 10s for 3-CV and no GRIDSEARCH
# SMALL DATASETS
# 36min 55s for no CV and no GRIDSEARCH
# 12min 35s for 3-CV and no GRIDSEARCH
# 8min 32s for 2-CV and no GRIDSEARCH

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
textt_variables = ['title', 'clean_body']
datet_variables = ['clean_date']

## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')

## textual columns
# tokenising text cols with custom transformer
nltk_tokeniser_body = NLTKWordPunctTokeniser(
    inputCol='clean_body', outputCol='body_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

nltk_tokeniser_title = NLTKWordPunctTokeniser(
    inputCol='title', outputCol='titl_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

# count occurence of tokens, i.e. create dfm
cnt_vectrizr_body = CountVectorizer(inputCol='body_words', outputCol='body_raw_feats', minDF=2)
cnt_vectrizr_title = CountVectorizer(inputCol='titl_words', outputCol='titl_raw_feats', minDF=2)

# create IDF dfm
idf_body = IDF(inputCol="body_raw_feats", outputCol="body_feats")
idf_title = IDF(inputCol="titl_raw_feats", outputCol="titl_feats")

# get topic distributions from LDA model
from pyspark.ml.clustering import LDA
lda_body = LDA(k=10, maxIter=5, inputCol='body_feats', outputCol='final_body')
lda_title = LDA(k=10, maxIter=5, inputCol='titl_feats', outputCol='final_titl')

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['final_body', 'final_titl'], #inputCols=['datet_data']
                                    outputCol='features') 

process_pipeline = Pipeline(stages=[  #inputCols=['datet_data']
    nltk_tokeniser_body, 
    nltk_tokeniser_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title,
    idf_body,
    idf_title,
    lda_body,
    lda_title,
    process_assembler
])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(maxIter=100,
                      regParam=1,
                      elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='tokens_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
#    .addGrid(lr.elasticNetParam, [1e-3, 1.])
#    .addGrid(lr.regParam, [1e-3, 1.])
paramGrid = ParamGridBuilder() \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='tokens_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m tokens\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2tokens.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2tokens.1rmse'] = rmse
    RESULTS[i.title()]['2tokens.2imprv'] = impr
    RESULTS[i.title()]['2tokens.3timet'] = timet
    RESULTS[i.title()]['2tokens.4elastic'] = ela_param
    RESULTS[i.title()]['2tokens.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

11:50:02.961988
The root-mean-square error of [94mbuddhism's[0m[92m tokens[0m model is 3.63
The root-mean-square error of [94meconomics's[0m[92m tokens[0m model is 3.07
The root-mean-square error of [94mfitness's[0m[92m tokens[0m model is 5.12
The root-mean-square error of [94mhealth's[0m[92m tokens[0m model is 4.08
The root-mean-square error of [94minterpersonal's[0m[92m tokens[0m model is 23.15


Unnamed: 0,2tokens.0tr_rmse,2tokens.1rmse,2tokens.2imprv,2tokens.3timet,2tokens.4elastic,2tokens.5regular
Buddhism,3.6,3.63,-0.0,118.92,1.0,1.0
Economics,3.61,3.07,-0.0,158.11,1.0,1.0
Fitness,5.38,5.12,-0.0,169.01,1.0,1.0
Health,4.11,4.08,-0.0,87.71,1.0,1.0
Interpersonal,18.3,23.15,-1.67,128.64,1.0,1.0


\begin{tabular}{lrrrrrr}
\toprule
{} &  2tokens.0tr\_rmse &  2tokens.1rmse &  2tokens.2imprv &  2tokens.3timet &  2tokens.4elastic &  2tokens.5regular \\
\midrule
Buddhism      &              3.60 &           3.63 &           -0.00 &          118.92 &               1.0 &               1.0 \\
Economics     &              3.61 &           3.07 &           -0.00 &          158.11 &               1.0 &               1.0 \\
Fitness       &              5.38 &           5.12 &           -0.00 &          169.01 &               1.0 &               1.0 \\
Health        &              4.11 &           4.08 &           -0.00 &           87.71 &               1.0 &               1.0 \\
Interpersonal &             18.30 &          23.15 &           -1.67 &          128.64 &               1.0 &               1.0 \\
\bottomrule
\end{tabular}

CPU times: user 3.17 s, sys: 849 ms, total: 4.02 s
Wall time: 11min 2s


In [29]:
lda_body = LDA(k=10, maxIter=5, inputCol='body_feats', outputCol='final_body')

TypeError: __init__() got an unexpected keyword argument 'inputCol'

In [15]:
nltk_tokeniser_body = NLTKWordPunctTokeniser(
    inputCol='clean_body', outputCol='body_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

from pyspark.ml.feature import CountVectorizer, IDF, StandardScaler, VectorAssembler, VectorSlicer

cnt_vectrizr_body = CountVectorizer(inputCol='body_words', outputCol='features', minDF=2)

mat1 = nltk_tokeniser_body.transform(datasets['interpersonal'])

# TF
mat2 = cnt_vectrizr_body.fit(mat1).transform(mat1)

In [27]:
%%time
from pyspark.ml.clustering import LDA

lda = LDA(k=10, maxIter=5)

temp = lda.fit(mat2).transform(mat2)

CPU times: user 22.8 ms, sys: 8.67 ms, total: 31.4 ms
Wall time: 1min 14s


In [28]:
show_spark_df(temp)

Unnamed: 0,title,viewcount,score,clean_date,clean_body,body_word_cnt,titl_word_cnt,body_sent_cnt,titl_sent_cnt,body_char_cnt,titl_char_cnt,body_words,features,topicDistribution
0,How to deal with avoiding someone I don't like,6922,38,2017-06-27 17:23:39.670,"Currently, as part of my work, I come into pr...",208,7,11,1,1736,46,"[current, ,, part, work, ,, i, come, proxim, l...","(18.0, 4.0, 23.0, 9.0, 6.0, 1.0, 3.0, 2.0, 1.0...","[0.00044454322049143266, 0.0004370769271664956..."
1,How to handle accidentally bumping into a drun...,1792,15,2017-06-27 17:25:17.937,"Yesterday when I left my office, I bumped int...",57,8,5,1,421,69,"[yesterday, i, left, offic, ,, i, bump, drunk,...","(9.0, 4.0, 7.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0016031470816361172, 0.001576217661595304, ..."
2,How can I get someone to say their first name ...,8415,105,2017-06-27 17:29:06.940,I often face the problem of forgetting the na...,32,13,3,1,256,85,"[i, often, face, problem, forget, name, person...","(4.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.002820006528161007, 0.002772676504581645, 0..."
3,Joining an existing group of friends,1941,20,2017-06-27 17:30:02.927,Many times we try to blend in with some frien...,108,4,10,1,966,36,"[mani, time, tri, blend, friend, ', friend, .,...","(6.0, 8.0, 9.0, 3.0, 0.0, 0.0, 0.0, 2.0, 1.0, ...","[0.0008526431202516887, 0.0008383184900333576,..."
4,"How to react to an excuse if it wasn't ""Ok""?",1545,0,2017-06-27 17:33:09.793,"I was walking down the street, when a rushing...",50,7,4,1,411,44,"[i, walk, street, ,, rush, older, man, barg, ....","(4.0, 2.0, 5.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...","[0.001823467879329865, 0.001792854197665454, 0..."


In [26]:
lda

DistributedLDAModel_91f80a60c3e9

In [27]:
process = VectorAssembler(inputCols=['body_raw_feats'], #inputCols=['datet_data']
                                    outputCol='features') 

In [29]:
%%time

lda_model = lda.transform(process.transform(mat2))

AttributeError: 'NoneType' object has no attribute 'transform'

In [61]:
lda_model.topicsMatrix()

DenseMatrix(9624, 10, [28.1371, 14.3826, 143.8924, 21.5715, 38.1571, 36.4679, 9.8236, 45.3612, ..., 1.7004, 0.9735, 0.8829, 0.8919, 1.9658, 0.7925, 3.4809, 0.9884], 0)

In [64]:
print("Learned topics (as distributions over vocab of " + str(lda_model.vocabSize())
      + " words):")

Learned topics (as distributions over vocab of 9624 words):


In [66]:
lda_model.describeTopics(5).select("termIndices").rdd.map(lambda r: r[0]).collect()

[[731, 2, 236, 172, 70],
 [160, 2347, 1897, 674, 427],
 [7, 2, 9, 10, 20],
 [2, 9, 295, 82, 12],
 [247, 45, 2, 775, 1300],
 [28, 244, 242, 147, 52],
 [2, 18, 181, 8, 62],
 [323, 144, 39, 16, 5],
 [16, 18, 97, 38, 291],
 [46, 36, 371, 51, 1321]]

In [78]:
lda_model.transform(mat3)

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: title, viewcount, score, clean_date, clean_body, body_word_cnt, titl_word_cnt, body_sent_cnt, titl_sent_cnt, body_char_cnt, titl_char_cnt, body_words, body_raw_feats, body_feats'

In [70]:
topics

DenseMatrix(9624, 10, [28.1371, 14.3826, 143.8924, 21.5715, 38.1571, 36.4679, 9.8236, 45.3612, ..., 1.7004, 0.9735, 0.8829, 0.8919, 1.9658, 0.7925, 3.4809, 0.9884], 0)

In [77]:
lda_model.topicDistributions

AttributeError: 'LocalLDAModel' object has no attribute 'topicDistributions'

In [None]:
lda_model_list = {}
for i in data_array:
    lda_model_list[i] = lda.fit(mat3[i])

In [None]:
num_topics = 10
max_iterations = 100
lda_model = LDA.train(mat3[['index','features']].map(list), k=num_topics, maxIterations=max_iterations)

In [None]:
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

In [None]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd

## custom transformer to spread sparse vectors into individual columns
class VectorMLliber(Transformer):
    """
    A custom Transformer which converts a column of pyspark.ml vectors to multiple pyspark.mllib vectors.
    """

    def __init__(self, inputCol=None):
        super(VectorMLliber, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        
        def f(v):
            return Vectors.sparse(v.size, v.indices, v.values)
        
        df = df.rdd.map(lambda r: as_mllib_vector(r[0]))
        return df

In [None]:
# ???????????
VectorMLliber_body = VectorMLliber(inputCol='body_features')
VectorMLliber_title = VectorMLliber(inputCol='titl_features')

In [None]:
'''def as_mllib_vector(v):
    return Vectors.sparse(v.size, v.indices, v.values)

features = {}
feature_vec_list = {}
for i in data_array:
    features[i] = word_feat_list[i].select("features")
    feature_vec_list[i] = features[i].rdd.map(lambda r: as_mllib_vector(r[0]))
    feature_vec_list[i].cache()
'''

# Save predictions

In [None]:
(trained_pipeline
 .transform(datasets['english'])
 .select(
    indep_text_variables + ["prediction"]
 )
 .write
 .parquet("linreg_prediction.parquet")
)

In [None]:
linreg_predictions = spark.read.parquet("linreg_prediction.parquet")

In [None]:
linreg_predictions.toPandas().head()

In [None]:
linreg_predictions.select("prediction").describe().toPandas()

# Save pipelines

In [None]:
from joblib import dump, load
dump(estimator_pipeline, 'pipeline.joblib') 

reloaded = load("pipeline.joblib")

#Now we can predict directly!

reloaded.predict(X)[:10]

In [None]:
## save models DOESN'T WORK BECAUSE: 'NLTKWordPunctTokenizer' object has no attribute '_to_java'
for i in data_array:
    param_dict[i].save(f'{i}-pipeline') 

# Convert notebook to python file

In [None]:
!jupyter nbconvert --to script 0-master-notebook-pipelines.ipynb