In [35]:
## testing printing output from console
import subprocess

cmd = [ 'echo', '"Welcome to my PySpark analysis of some StackExchange Data"' ]
output = subprocess.Popen( cmd, stdout=subprocess.PIPE ).communicate()[0]
print(output)

b'"Welcome to my PySpark analysis of some StackExchange Data"\n'


# Load Libraries

In [36]:
import gc #garbage collection
import time
import numpy as np
import pandas as pd
from datetime import datetime

# Load PySpark

In [37]:
%run -i '1-load-pyspark.py'

The Spark UI, version 2.4.3, is available at: http://192.168.0.26:4040/ and the defaultParallelism is 4


# Load easyFunctions and Transformers

In [38]:
## easy functions
%run -i 'load_parquet_data.py'
%run -i 'convert_csv_to_pyarrow_parquet.py'
%run -i 'export_parquet_data.py'
%run -i 'count_total_questions.py'
%run -i 'show_save_results.py'
%run -i 'show_spark_df.py'
%run -i 'show_date_range.py'
%run -i 'trim_betw_dates.py'

## pipeline transformers
%run -i 'nltkWordPunctTokeniser.py'
%run -i 'nltkSenteniser.py'

# Choose Datasets

In [39]:
## these are the forum labels that will appear in graphs/tables etc.
data_array = [
    #'Buddhism',
    #'Economics',
    #'Fitness',
    #'Health',
    #'Interpersonal'
    #'Stackoverflow',
    #'Superuser',
    'Math',
    'Stats',
    'English',
]

# Download XML Data

In [40]:
## call download script - EDIT to feed in user data_array
#!bash 0-dataset-download.sh

# Convert to Parquet

In [41]:
## for stackoverflow
#convert_csv_to_pyarrow_parquet(str_dir='initial-data/stackoverflow.stackexchange.com/')

In [42]:
## loop through data_array
#for i in data_array:
#    s = i.lower()
#    !python convert-xml-to-parquet.py "$s"

# Load Initial or Clean Data

In [43]:
%%time
print(datetime.now().time())
datasets = load_parquet_data(dataArray=data_array, kind='initial', printSchema=False)

13:42:32.601525
CPU times: user 3.18 ms, sys: 2.11 ms, total: 5.28 ms
Wall time: 501 ms


In [44]:
## join Stack Overflow datasets on gcloud
#dfUnion = df1.union(df2)

In [45]:
## count questions before trimming
count_total_questions(dataArray=data_array, datasetDict=datasets)

Math: 1102812
Stats: 139302
English: 105475

Total: 1347589


# Clean and Trim Data

In [46]:
%%time
print(datetime.now().time())
%run -i '2-clean-datasets.py'

13:42:34.135001

[1m checking columns are the right types and names [0m

----- Math -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- Stats -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None
----- English -----
root
 |-- title: string (nullable = true)
 |-- viewcount: long (nullable = true)
 |-- score: long (nullable = true)
 |-- clean_date: timestamp (nullable = true)
 |-- clean_body: string (nullable = true)

None

[1m checking that there are no nans [0m

----- Math -----
+-----+---------+-----+----------+
|title|viewcount|score|clean_body|
+-----+---------+-----+----------+
|    0|        0|    0|         0|
+-----+---------+-----+----------+

----- 

In [47]:
## show range of dates for datasets before trimming
show_date_range(dataArray=data_array, datasetDict=datasets)

Math:
2010-03-27 14:33:20.727000
2019-06-02 04:16:34.197000

Stats:
2009-02-02 14:21:12.103000
2019-06-02 04:19:33.143000

English:
2009-06-16 13:06:53.033000
2019-06-02 04:02:54.070000



In [48]:
## trim data between uniform date range
datasets = trim_betw_dates(dataArray=data_array, datasetDict=datasets, dates=('2010-09-01', '2011-09-01'))

In [49]:
## show range of dates for datasets after trimming
show_date_range(dataArray=data_array, datasetDict=datasets)

Math:
2010-09-01 02:55:46.223000
2011-08-31 23:35:26.480000

Stats:
2010-09-01 05:55:30.923000
2011-08-31 22:41:00.767000

English:
2010-09-01 03:12:19.943000
2011-08-31 22:27:31.583000



In [50]:
## count questions after trimming
count_total_questions(data_array, temp)

Math: 18131
Stats: 4068
English: 8537

Total: 30736


# Counts and LDA Feature Engineering

In [51]:
%%time
# about 15 min with 10 iterations and params
# FAILS for large datasets
print(datetime.now().time())
%run -i '3-feat-engineering.py'

###
# TRY different values of alpha and rho
# TRY different values of K
# TRY 'em' optimizer instead of 'online'

13:43:38.885624
On to Math
On to Stats
On to English
On to Math
On to Stats
On to English
CPU times: user 1.04 s, sys: 562 ms, total: 1.6 s
Wall time: 23min 16s


# Export Clean Data

In [19]:
%%time
# about 8 min
print(datetime.now().time())
export_parquet_data(dataArray=data_array, datasetDict=datasets)

15:13:44.175946
On to Superuser
On to Math
On to Stats
On to English
CPU times: user 13.6 ms, sys: 15.4 ms, total: 28.9 ms
Wall time: 35.6 s


# EDA

In [18]:
%%time
print(datetime.now().time())
%run -i '4-final-eda.py'

15:55:18.897768

[1mCorrelations between score and viewcount[0m

Buddhism & 0.41 \\
Economics & 0.67 \\
Fitness & 0.26 \\
Health & 0.21 \\
Interpersonal & 0.87 \\

[1mViewcount[0m descriptives

\begin{tabular}{lrrrrr}
\toprule
0 &  count &     mean &   stddev &  min &    max \\
\midrule
Buddhism      &   3120 &   316.16 &   737.08 &   10 &  21498 \\
Economics     &   3120 &   178.24 &   646.26 &    2 &  14055 \\
Fitness       &   3120 &   534.35 &  2426.63 &    8 &  78829 \\
Health        &   3120 &   240.40 &  1128.95 &    2 &  23098 \\
Interpersonal &   3120 &  4520.74 &  7492.72 &    6 &  79049 \\
\bottomrule
\end{tabular}


[1mScore[0m descriptives

\begin{tabular}{lrrrrr}
\toprule
0 &  count &   mean &  stddev &  min &  max \\
\midrule
Buddhism      &   3120 &   2.01 &    2.19 &   -7 &   24 \\
Economics     &   3120 &   1.47 &    2.70 &   -7 &   61 \\
Fitness       &   3120 &   1.77 &    2.14 &   -6 &   28 \\
Health        &   3120 &   2.05 &    2.06 &   -5 &   27 \\
Interpe

Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "


KeyError: 'fitness'

CPU times: user 3.16 s, sys: 189 ms, total: 3.35 s
Wall time: 9.66 s


# Train/Test Splits

In [19]:
## garbage collector to speed up computation
collected = gc.collect()
print(f'Garbage collector: collected {collected} objects.')

Garbage collector: collected 9303 objects.


In [20]:
%%time
print(datetime.now().time())
%run -i '6a-rand-train-test-split-50.py'
#%run -i '6b-time-train-test-split-50.py'

## check standard deviations of variables
print('\nStandard deviations are:\n')
for i in data_array:
    tr = round( pd.to_numeric(train[i].describe('score').select('score').toPandas().iloc[2][0]), 2 )
    te = round( pd.to_numeric(test[i].describe('score').select('score').toPandas().iloc[2][0]), 2 )
    perc = round( (te/tr - 1)*100, 2 )
    s = i.title() + ' & ' + str(tr) + ' & ' + str(te) + ' & ' + str(perc) + ' \\\\'
    print(s)

15:56:09.322665

Standard deviations are:

Buddhism & 2.1 & 2.27 & 8.1 \\
Economics & 1.86 & 3.34 & 79.57 \\
Fitness & 2.18 & 2.1 & -3.67 \\
Health & 2.11 & 2.01 & -4.74 \\
Interpersonal & 22.37 & 24.96 & 11.58 \\
CPU times: user 83.3 ms, sys: 25.5 ms, total: 109 ms
Wall time: 2.79 s


In [33]:
## garbage collector to speed up computation
collected = gc.collect()
print(f'Garbage collector: collected {collected} objects.')

Garbage collector: collected 3337 objects.


In [34]:
'''
interesting to see how skewed rus_stackoverflow posts are to more posts in recent years
'''

'\ninteresting to see how skewed rus_stackoverflow posts are to more posts in recent years\n'

# Create Results Dictionary

In [56]:
for i in data_array:
    print(i)
    print(datasets[i].sort(col('clean_date')).select('clean_date').take(1)[0][0])
    print(f"{datasets[i].sort(col('clean_date').desc()).select('clean_date').take(1)[0][0]}\n")

buddhism
2016-04-11 21:50:32.343000
2019-06-01 21:26:29.840000

economics
2017-08-25 14:46:57.953000
2019-06-02 00:15:49.433000

fitness
2015-08-14 00:55:15.963000
2019-06-01 17:51:10.030000

health
2016-10-29 22:01:11.947000
2019-06-02 02:25:10.917000

interpersonal
2017-06-27 17:30:02.927000
2019-06-02 02:35:11.207000



In [35]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

# Silly Mean Model

In [36]:
import pyspark.sql.functions as F

## choose target variable
target = 'score'

## create mean dictionaries
y_ravi_tr_means = {}

## calculate the mean of each forum, using ONLY training set
for i in data_array:
    y_ravi_tr_means[i] = train[i].select(target).rdd.flatMap(lambda x: x).mean()

## import rmse evaluator
from pyspark.ml.evaluation import RegressionEvaluator

## create dictionaries for training and testing (baseline) rmse 
base = {}
tr_rmse = {}

## modelling
for i in data_array:

    ## initial variable for timing
    t0 = time.time()
    
    ## train silly mean model by assigning training set mean for training and testing predictions
    train[i] = train[i].withColumn('mean_pred', F.lit(y_ravi_tr_means[i]))
    test[i] = test[i].withColumn('mean_pred', F.lit(y_ravi_tr_means[i]))

    ## evaluate silly mean model, on both training and testing set
    evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='mean_pred')
    tr_rmse[i] = round( evaluator.evaluate(train[i]), 2)
    base[i] = round( evaluator.evaluate(test[i]), 2)

    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m mean\033[0m model is {base[i]}")

    ## record time taken
    timet = round( time.time() - t0, 2 )
    
    ## store as dictionary inside RESULTS dictionary, initiating dataset name entries first
    RESULTS[i.title()]['0silly_mean.0tr_rmse'] = tr_rmse[i]
    RESULTS[i.title()]['0silly_mean.1rmse'] = base[i]
    RESULTS[i.title()]['0silly_mean.2timet'] = timet
    
## record results
show_save_results(RESULTS)

The root-mean-square error of [94mbuddhism's[0m[92m mean[0m model is 2.27
The root-mean-square error of [94meconomics's[0m[92m mean[0m model is 3.34
The root-mean-square error of [94mfitness's[0m[92m mean[0m model is 2.1
The root-mean-square error of [94mhealth's[0m[92m mean[0m model is 2.01
The root-mean-square error of [94minterpersonal's[0m[92m mean[0m model is 24.96


Unnamed: 0,0silly_mean.0tr_rmse,0silly_mean.1rmse,0silly_mean.2timet
Buddhism,2.1,2.27,0.5
Economics,1.86,3.34,0.58
Fitness,2.18,2.1,0.52
Health,2.11,2.01,0.49
Interpersonal,22.36,24.96,0.58


\begin{tabular}{lrrr}
\toprule
{} &  0silly\_mean.0tr\_rmse &  0silly\_mean.1rmse &  0silly\_mean.2timet \\
\midrule
Buddhism      &                  2.10 &               2.27 &                0.50 \\
Economics     &                  1.86 &               3.34 &                0.58 \\
Fitness       &                  2.18 &               2.10 &                0.52 \\
Health        &                  2.11 &               2.01 &                0.49 \\
Interpersonal &                 22.36 &              24.96 &                0.58 \\
\bottomrule
\end{tabular}



In [37]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

# Viewcount Model

In [38]:
%%time
print(datetime.now().time())

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
numic_variables = ['viewcount']
datet_variables = ['clean_date']

## numerical columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = StandardScaler(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])

'''## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')'''

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[numic_pipeline, process_assembler])

########################
##### CHOOSE MODEL #####
########################

## linear regression on just viewcount
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100, # this doesn't change anything
                      #regParam=0.3, # using regularisation parameter here useless since there is one feature
                      #elasticNetParam=0.8,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='viewcount_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
paramGrid = ParamGridBuilder() \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='viewcount_pred')

## set up cross validation for parameter tuning
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    
    ## fitting on train and predicting on train/test
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
        
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m viewcount\033[0m model is {rmse}")

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['1viewcount.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['1viewcount.1rmse'] = rmse
    RESULTS[i.title()]['1viewcount.2imprv'] = impr
    RESULTS[i.title()]['1viewcount.3timet'] = timet
    
## record results
show_save_results(RESULTS)

15:54:31.698853
The root-mean-square error of [94mbuddhism's[0m[92m viewcount[0m model is 2.03
The root-mean-square error of [94meconomics's[0m[92m viewcount[0m model is 2.55
The root-mean-square error of [94mfitness's[0m[92m viewcount[0m model is 2.13
The root-mean-square error of [94mhealth's[0m[92m viewcount[0m model is 1.98
The root-mean-square error of [94minterpersonal's[0m[92m viewcount[0m model is 12.72


Unnamed: 0,1viewcount.0tr_rmse,1viewcount.1rmse,1viewcount.2imprv,1viewcount.3timet
Buddhism,1.95,2.03,10.57,7.07
Economics,1.7,2.55,23.65,4.36
Fitness,2.06,2.13,-1.43,5.7
Health,2.06,1.98,1.49,5.61
Interpersonal,10.98,12.72,49.04,5.94


\begin{tabular}{lrrrr}
\toprule
{} &  1viewcount.0tr\_rmse &  1viewcount.1rmse &  1viewcount.2imprv &  1viewcount.3timet \\
\midrule
Buddhism      &                 1.95 &              2.03 &              10.57 &               7.07 \\
Economics     &                 1.70 &              2.55 &              23.65 &               4.36 \\
Fitness       &                 2.06 &              2.13 &              -1.43 &               5.70 \\
Health        &                 2.06 &              1.98 &               1.49 &               5.61 \\
Interpersonal &                10.98 &             12.72 &              49.04 &               5.94 \\
\bottomrule
\end{tabular}

CPU times: user 1.34 s, sys: 352 ms, total: 1.7 s
Wall time: 28.8 s


In [39]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

In [40]:
'''
Interesting that there are different improvements of viewcount over mean-only prediciton
'''

'\nInteresting that there are different improvements of viewcount over mean-only prediciton\n'

In [41]:
## garbage collector to speed up computation
collected = gc.collect()
print(f'Garbage collector: collected {collected} objects.')

Garbage collector: collected 1410 objects.


## Count Model

In [42]:
%%time
print(datetime.now().time())

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
numic_variables = ['body_word_cnt', 'titl_word_cnt', 'body_char_cnt', 
                   'titl_char_cnt', 'body_sent_cnt', 'titl_sent_cnt']
datet_variables = ['clean_date']

'''## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')'''

## NUMERICAL columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = StandardScaler(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])

## create PROCESSING pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[numic_pipeline, process_assembler])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100,
                      #regParam=1,
                      #elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='counts_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1e-2, 1.]) \
    .addGrid(lr.regParam, [1e-2, 1.]) \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='counts_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m counts\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2counts.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2counts.1rmse'] = rmse
    RESULTS[i.title()]['2counts.2imprv'] = impr
    RESULTS[i.title()]['2counts.3timet'] = timet
    RESULTS[i.title()]['2counts.4elastic'] = ela_param
    RESULTS[i.title()]['2counts.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

15:55:00.667482
The root-mean-square error of [94mbuddhism's[0m[92m counts[0m model is 2.25
The root-mean-square error of [94meconomics's[0m[92m counts[0m model is 3.33
The root-mean-square error of [94mfitness's[0m[92m counts[0m model is 2.09
The root-mean-square error of [94mhealth's[0m[92m counts[0m model is 2.01
The root-mean-square error of [94minterpersonal's[0m[92m counts[0m model is 24.88


Unnamed: 0,2counts.0tr_rmse,2counts.1rmse,2counts.2imprv,2counts.3timet,2counts.4elastic,2counts.5regular
Buddhism,2.08,2.25,0.88,14.04,0.01,0.01
Economics,1.86,3.33,0.3,10.34,0.01,1.0
Fitness,2.17,2.09,0.48,7.6,0.01,1.0
Health,2.09,2.01,-0.0,7.93,1.0,0.01
Interpersonal,22.31,24.88,0.32,14.51,1.0,1.0


\begin{tabular}{lrrrrrr}
\toprule
{} &  2counts.0tr\_rmse &  2counts.1rmse &  2counts.2imprv &  2counts.3timet &  2counts.4elastic &  2counts.5regular \\
\midrule
Buddhism      &              2.08 &           2.25 &            0.88 &           14.04 &              0.01 &              0.01 \\
Economics     &              1.86 &           3.33 &            0.30 &           10.34 &              0.01 &              1.00 \\
Fitness       &              2.17 &           2.09 &            0.48 &            7.60 &              0.01 &              1.00 \\
Health        &              2.09 &           2.01 &           -0.00 &            7.93 &              1.00 &              0.01 \\
Interpersonal &             22.31 &          24.88 &            0.32 &           14.51 &              1.00 &              1.00 \\
\bottomrule
\end{tabular}

CPU times: user 4.82 s, sys: 1.33 s, total: 6.15 s
Wall time: 54.5 s


In [43]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

## Text Model

In [54]:
%%time
print(datetime.now().time())
# SMALL DATASETS
# 18min 32s for 2-CV and GRIDSEARCH

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
textt_variables = ['title', 'clean_body']
datet_variables = ['clean_date']

'''## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')'''

## textual columns
# tokenising text cols with custom transformer
nltk_tokeniser_body = nltkWordPunctTokeniser(
    inputCol='clean_body', outputCol='body_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

nltk_tokeniser_title = nltkWordPunctTokeniser(
    inputCol='title', outputCol='titl_words',  
    stopwords=set(nltk.corpus.stopwords.words('english')))

# count occurence of tokens, i.e. create dfm
cnt_vectrizr_body = CountVectorizer(inputCol='body_words', outputCol='body_raw_feats') #!!! minDF???
cnt_vectrizr_title = CountVectorizer(inputCol='titl_words', outputCol='titl_raw_feats')

# create IDF dfm
idf_body = IDF(inputCol="body_raw_feats", outputCol="body_feats")
idf_title = IDF(inputCol="titl_raw_feats", outputCol="titl_feats")

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['body_feats', 'titl_feats'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[  #inputCols=['datet_data']
    nltk_tokeniser_body, 
    nltk_tokeniser_title,
    cnt_vectrizr_body,
    cnt_vectrizr_title,
    idf_body,
    idf_title,
    process_assembler
])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100,
                      #regParam=1,
                      #elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='tokens_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1e-2, 1.]) \
    .addGrid(lr.regParam, [1e-2, 1.]) \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='tokens_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m tokens\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2tokens.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2tokens.1rmse'] = rmse
    RESULTS[i.title()]['2tokens.2imprv'] = impr
    RESULTS[i.title()]['2tokens.3timet'] = timet
    RESULTS[i.title()]['2tokens.4elastic'] = ela_param
    RESULTS[i.title()]['2tokens.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

16:00:48.911783
The root-mean-square error of [94mbuddhism's[0m[92m tokens[0m model is 2.27
The root-mean-square error of [94meconomics's[0m[92m tokens[0m model is 3.34
The root-mean-square error of [94mfitness's[0m[92m tokens[0m model is 2.1
The root-mean-square error of [94mhealth's[0m[92m tokens[0m model is 2.01
The root-mean-square error of [94minterpersonal's[0m[92m tokens[0m model is 25.87


Unnamed: 0,2tokens.0tr_rmse,2tokens.1rmse,2tokens.2imprv,2tokens.3timet,2tokens.4elastic,2tokens.5regular,3final.0tr_rmse,3final.1rmse,3final.2imprv,3final.3timet,3final.4elastic,3final.5regular
Buddhism,2.1,2.27,-0.0,221.3,1.0,1.0,2.04,2.24,1.32,11.24,0.01,1.0
Economics,1.86,3.34,-0.0,201.97,1.0,1.0,1.86,3.34,-0.0,8.96,1.0,1.0
Fitness,2.18,2.1,-0.0,190.3,1.0,1.0,2.18,2.1,-0.0,8.26,1.0,1.0
Health,2.11,2.01,-0.0,186.69,1.0,1.0,2.08,2.0,0.5,8.27,0.01,1.0
Interpersonal,15.06,25.87,-3.65,358.41,1.0,1.0,22.26,24.87,0.36,8.44,1.0,1.0


\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} &  2tokens.0tr\_rmse &  2tokens.1rmse &  2tokens.2imprv &  2tokens.3timet &  2tokens.4elastic &  2tokens.5regular &  3final.0tr\_rmse &  3final.1rmse &  3final.2imprv &  3final.3timet &  3final.4elastic &  3final.5regular \\
\midrule
Buddhism      &              2.10 &           2.27 &           -0.00 &          221.30 &               1.0 &               1.0 &             2.04 &          2.24 &           1.32 &          11.24 &             0.01 &              1.0 \\
Economics     &              1.86 &           3.34 &           -0.00 &          201.97 &               1.0 &               1.0 &             1.86 &          3.34 &          -0.00 &           8.96 &             1.00 &              1.0 \\
Fitness       &              2.18 &           2.10 &           -0.00 &          190.30 &               1.0 &               1.0 &             2.18 &          2.10 &          -0.00 &           8.26 &             1.00 &              1.0 \\
Health      

In [44]:
RESULTS = {}
for i in data_array:
    # capitalise keys
    RESULTS[i.title()] = {}

In [45]:
## check predictions aren't constant
models['health'].transform(test['health']).select('tokens_pred').take(10)

AnalysisException: "cannot resolve '`tokens_pred`' given input columns: [btd[0], index, btd[5], std[4], clean_date, btd[7], btd[3], ttd[3], ttd[2], ttd[1], ttd[0], btd[2], std[2], numic_data_std, std[1], std[5], ttd[8], ttd[5], ttd[7], ttd[9], ttd[6], btd[8], title, body_sent_cnt, btd[9], btd[4], std[7], mean_pred, btd[6], titl_sent_cnt, numic_data, std[3], std[8], score, std[0], titl_word_cnt, ttd[4], body_word_cnt, clean_body, std[6], titl_char_cnt, std[9], btd[1], body_char_cnt, counts_pred, features, viewcount];;\n'Project ['tokens_pred]\n+- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 23 more fields]\n   +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 22 more fields]\n      +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 21 more fields]\n         +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 20 more fields]\n            +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 19 more fields]\n               +- Sample 0.5, 1.0, false, 1777\n                  +- Sort [title#246 ASC NULLS FIRST, viewcount#247L ASC NULLS FIRST, score#248L ASC NULLS FIRST, clean_date#249 ASC NULLS FIRST, clean_body#250 ASC NULLS FIRST, btd[0]#251 ASC NULLS FIRST, btd[1]#252 ASC NULLS FIRST, btd[2]#253 ASC NULLS FIRST, btd[3]#254 ASC NULLS FIRST, btd[4]#255 ASC NULLS FIRST, btd[5]#256 ASC NULLS FIRST, btd[6]#257 ASC NULLS FIRST, btd[7]#258 ASC NULLS FIRST, btd[8]#259 ASC NULLS FIRST, btd[9]#260 ASC NULLS FIRST, std[0]#261 ASC NULLS FIRST, std[1]#262 ASC NULLS FIRST, std[2]#263 ASC NULLS FIRST, std[3]#264 ASC NULLS FIRST, std[4]#265 ASC NULLS FIRST, std[5]#266 ASC NULLS FIRST, std[6]#267 ASC NULLS FIRST, std[7]#268 ASC NULLS FIRST, std[8]#269 ASC NULLS FIRST, ... 18 more fields], false\n                     +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 18 more fields]\n                        +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 19 more fields]\n                           +- Window [row_number() windowspecdefinition(clean_date#249 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS index#102035], [clean_date#249 ASC NULLS FIRST]\n                              +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 17 more fields]\n                                 +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 18 more fields]\n                                    +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 19 more fields]\n                                       +- Window [row_number() windowspecdefinition(clean_date#249 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS index#4754], [clean_date#249 ASC NULLS FIRST]\n                                          +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 17 more fields]\n                                             +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 18 more fields]\n                                                +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 19 more fields]\n                                                   +- Window [row_number() windowspecdefinition(clean_date#249 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS index#3383], [clean_date#249 ASC NULLS FIRST]\n                                                      +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 17 more fields]\n                                                         +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 18 more fields]\n                                                            +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 19 more fields]\n                                                               +- Window [row_number() windowspecdefinition(clean_date#249 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS index#2016], [clean_date#249 ASC NULLS FIRST]\n                                                                  +- Project [title#246, viewcount#247L, score#248L, clean_date#249, clean_body#250, btd[0]#251, btd[1]#252, btd[2]#253, btd[3]#254, btd[4]#255, btd[5]#256, btd[6]#257, btd[7]#258, btd[8]#259, btd[9]#260, std[0]#261, std[1]#262, std[2]#263, std[3]#264, std[4]#265, std[5]#266, std[6]#267, std[7]#268, std[8]#269, ... 17 more fields]\n                                                                     +- Relation[title#246,viewcount#247L,score#248L,clean_date#249,clean_body#250,btd[0]#251,btd[1]#252,btd[2]#253,btd[3]#254,btd[4]#255,btd[5]#256,btd[6]#257,btd[7]#258,btd[8]#259,btd[9]#260,std[0]#261,std[1]#262,std[2]#263,std[3]#264,std[4]#265,std[5]#266,std[6]#267,std[7]#268,std[8]#269,... 17 more fields] parquet\n"

In [None]:
"""why the heck does everything besides interpersonal have constant predictions - it's not the parameters or the size of the data"""
"""it's the number of questions in the data"""

# LDA Features Model

In [46]:
%%time
print(datetime.now().time())

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
numic_variables = ['btd[0]', 'btd[1]', 'btd[2]', 'btd[3]', 'btd[4]', 
                   'btd[5]', 'btd[6]', 'btd[7]', 'btd[8]', 'btd[9]', 
                   'std[0]', 'std[1]', 'std[2]', 'std[3]', 'std[4]', 
                   'std[5]', 'std[6]', 'std[7]', 'std[8]', 'std[9]',
                   'ttd[0]', 'ttd[1]', 'ttd[2]', 'ttd[3]', 'ttd[4]', 
                   'ttd[5]', 'ttd[6]', 'ttd[7]', 'ttd[8]', 'ttd[9]'] 

datet_variables = ['clean_date']

'''## date columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')'''

## numerical columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = StandardScaler(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])

## create processing pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[numic_pipeline, process_assembler])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100,
                      #regParam=1,
                      #elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='topics_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
    
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1e-2, 1.]) \
    .addGrid(lr.regParam, [1e-2, 1.]) \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='topics_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m topics\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['2topics.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['2topics.1rmse'] = rmse
    RESULTS[i.title()]['2topics.2imprv'] = impr
    RESULTS[i.title()]['2topics.3timet'] = timet
    RESULTS[i.title()]['2topics.4elastic'] = ela_param
    RESULTS[i.title()]['2topics.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

15:56:37.768309
The root-mean-square error of [94mbuddhism's[0m[92m topics[0m model is 2.25
The root-mean-square error of [94meconomics's[0m[92m topics[0m model is 3.34
The root-mean-square error of [94mfitness's[0m[92m topics[0m model is 2.1
The root-mean-square error of [94mhealth's[0m[92m topics[0m model is 2.01
The root-mean-square error of [94minterpersonal's[0m[92m topics[0m model is 24.95


Unnamed: 0,2topics.0tr_rmse,2topics.1rmse,2topics.2imprv,2topics.3timet,2topics.4elastic,2topics.5regular
Buddhism,2.05,2.25,0.88,11.36,0.01,1.0
Economics,1.86,3.34,-0.0,9.98,1.0,1.0
Fitness,2.18,2.1,-0.0,12.33,1.0,1.0
Health,2.11,2.01,-0.0,14.49,1.0,1.0
Interpersonal,22.31,24.95,0.04,11.47,1.0,1.0


\begin{tabular}{lrrrrrr}
\toprule
{} &  2topics.0tr\_rmse &  2topics.1rmse &  2topics.2imprv &  2topics.3timet &  2topics.4elastic &  2topics.5regular \\
\midrule
Buddhism      &              2.05 &           2.25 &            0.88 &           11.36 &              0.01 &               1.0 \\
Economics     &              1.86 &           3.34 &           -0.00 &            9.98 &              1.00 &               1.0 \\
Fitness       &              2.18 &           2.10 &           -0.00 &           12.33 &              1.00 &               1.0 \\
Health        &              2.11 &           2.01 &           -0.00 &           14.49 &              1.00 &               1.0 \\
Interpersonal &             22.31 &          24.95 &            0.04 &           11.47 &              1.00 &               1.0 \\
\bottomrule
\end{tabular}

CPU times: user 5.88 s, sys: 1.5 s, total: 7.38 s
Wall time: 59.7 s


In [47]:
RESULTS = {}
for i in data_array:
    # capitalise keysa
    RESULTS[i.title()] = {}

# Final Model

In [48]:
%%time
print(datetime.now().time())

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, StandardScaler, VectorAssembler, VectorSlicer

########################
##### CHOOSE FEATS ##### can't get date right
########################

## define features to predict on
target = 'score'
numic_variables = ['btd[0]', 'btd[1]', 'btd[2]', 'btd[3]', 'btd[4]', 
                   'btd[5]', 'btd[6]', 'btd[7]', 'btd[8]', 'btd[9]', 
                   'std[0]', 'std[1]', 'std[2]', 'std[3]', 'std[4]', 
                   'std[5]', 'std[6]', 'std[7]', 'std[8]', 'std[9]',
                   'ttd[0]', 'ttd[1]', 'ttd[2]', 'ttd[3]', 'ttd[4]', 
                   'ttd[5]', 'ttd[6]', 'ttd[7]', 'ttd[8]', 'ttd[9]',
                   'body_word_cnt', 'titl_word_cnt', 'body_char_cnt', 
                   'titl_char_cnt', 'body_sent_cnt', 'titl_sent_cnt']
datet_variables = ['clean_date']

### DATE columns
datet_assembler = VectorAssembler(inputCols=datet_variables, outputCol='datet_data')

### NUMERICAL columns
numic_assembler = VectorAssembler(inputCols=numic_variables, outputCol='numic_data') # have to put in single col
standardiser = StandardScaler(inputCol='numic_data', outputCol='numic_data_std')    
numic_pipeline = Pipeline(stages=[numic_assembler, standardiser])


## create PROCESSING pipeline
process_assembler = VectorAssembler(inputCols=['numic_data'], #inputCols=['datet_data']
                                    outputCol='features') 
process_pipeline = Pipeline(stages=[  #inputCols=['datet_data']
    numic_pipeline,
    process_assembler
])

########################
##### CHOOSE MODEL #####
########################

## linear regression model
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
    
lr = LinearRegression(#maxIter=100,
                      #regParam=1,
                      #elasticNetParam=1,
                      featuresCol='features',
                      labelCol=target,
                      predictionCol='finalm_pred')

## make final pipeline
final_pipeline = Pipeline(stages=[process_pipeline, lr])

## import methods for tuning
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## set up grid for parameter tuning: 
'''NEEDED, BUT IMMENSELY SLOWING DOWN'''
# Ravi et al use L2, aka ridge, aka elasticNetParam=0
# regParam is the value of lambda
    
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1e-2, 1.]) \
    .addGrid(lr.regParam, [1e-2, 1.]) \
    .build()

## set up rmse evaluator
evaluator = RegressionEvaluator(metricName='rmse', labelCol=target, predictionCol='finalm_pred')

## set up cross validation for parameter tuning
'''DEFINITELY SLOWING DOWN'''
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)
## create models dict
models = {}

## modelling
for i in data_array:
    
    ## initial variable for timing
    t0 = time.time()
    
    ## fit on training set with CV
    cvmodel = crossval.fit(train[i])
    models[i] = cvmodel
    
    ## predict and evaluate
    tr_rmse = round( evaluator.evaluate(cvmodel.transform(train[i])), 2 )
    rmse = round( evaluator.evaluate(cvmodel.transform(test[i])), 2 )
    print(f"The root-mean-square error of \033[94m{i}'s\033[0m\033[92m final\033[0m model is {rmse}")
    
    ## get params
    # elasticnet
    ela_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[1]
    ela_param = cvmodel.bestModel.stages[-1].extractParamMap()[ela_key]
    # reg'sation
    reg_key = list(cvmodel.bestModel.stages[-1].extractParamMap().keys())[9]
    reg_param = cvmodel.bestModel.stages[-1].extractParamMap()[reg_key]

    ## calculate improvement over median baseline
    impr = round( (rmse/base[i] - 1)*-100, 2 )
    
    ## record time taken
    timet = round( time.time() - t0, 2 )

    ## store as dictionary inside RESULTS dictionary
    RESULTS[i.title()]['3final.0tr_rmse'] = tr_rmse
    RESULTS[i.title()]['3final.1rmse'] = rmse
    RESULTS[i.title()]['3final.2imprv'] = impr
    RESULTS[i.title()]['3final.3timet'] = timet
    RESULTS[i.title()]['3final.4elastic'] = ela_param
    RESULTS[i.title()]['3final.5regular'] = reg_param
    
## record results
show_save_results(RESULTS)

15:57:37.517277
The root-mean-square error of [94mbuddhism's[0m[92m final[0m model is 2.24
The root-mean-square error of [94meconomics's[0m[92m final[0m model is 3.34
The root-mean-square error of [94mfitness's[0m[92m final[0m model is 2.1
The root-mean-square error of [94mhealth's[0m[92m final[0m model is 2.0
The root-mean-square error of [94minterpersonal's[0m[92m final[0m model is 24.87


Unnamed: 0,3final.0tr_rmse,3final.1rmse,3final.2imprv,3final.3timet,3final.4elastic,3final.5regular
Buddhism,2.04,2.24,1.32,11.24,0.01,1.0
Economics,1.86,3.34,-0.0,8.96,1.0,1.0
Fitness,2.18,2.1,-0.0,8.26,1.0,1.0
Health,2.08,2.0,0.5,8.27,0.01,1.0
Interpersonal,22.26,24.87,0.36,8.44,1.0,1.0


\begin{tabular}{lrrrrrr}
\toprule
{} &  3final.0tr\_rmse &  3final.1rmse &  3final.2imprv &  3final.3timet &  3final.4elastic &  3final.5regular \\
\midrule
Buddhism      &             2.04 &          2.24 &           1.32 &          11.24 &             0.01 &              1.0 \\
Economics     &             1.86 &          3.34 &          -0.00 &           8.96 &             1.00 &              1.0 \\
Fitness       &             2.18 &          2.10 &          -0.00 &           8.26 &             1.00 &              1.0 \\
Health        &             2.08 &          2.00 &           0.50 &           8.27 &             0.01 &              1.0 \\
Interpersonal &            22.26 &         24.87 &           0.36 &           8.44 &             1.00 &              1.0 \\
\bottomrule
\end{tabular}

CPU times: user 4.93 s, sys: 1.36 s, total: 6.29 s
Wall time: 45.3 s


In [52]:
## check predictions aren't constant
models['fitness'].transform(test['fitness']).select('finalm_pred').take(10)

[Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052),
 Row(finalm_pred=1.8051282051282052)]

In [None]:
fin

# Save predictions

In [None]:
(trained_pipeline
 .transform(datasets['english'])
 .select(
    indep_text_variables + ["prediction"]
 )
 .write
 .parquet("linreg_prediction.parquet")
)

In [None]:
linreg_predictions = spark.read.parquet("linreg_prediction.parquet")

In [None]:
linreg_predictions.toPandas().head()

In [None]:
linreg_predictions.select("prediction").describe().toPandas()

# Save pipelines

In [None]:
from joblib import dump, load
dump(estimator_pipeline, 'pipeline.joblib') 

reloaded = load("pipeline.joblib")

#Now we can predict directly!

reloaded.predict(X)[:10]

In [None]:
## save models DOESN'T WORK BECAUSE: 'NLTKWordPunctTokenizer' object has no attribute '_to_java'
for i in data_array:
    param_dict[i].save(f'{i}-pipeline') 

# Convert notebook to python file

In [None]:
!jupyter nbconvert --to script 0-master-notebook-pipelines.ipynb