In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from datetime import datetime

# Startup

- Anthony
  - Linear Regression
  - Analytic goal: Assess whether viewership is affected by public sentiment
- William
  - Random Forest
  - Analytic goal: Assess whether viewership is affected by public sentiment
- Lawrence
  - Gradient Boosted Trees 
  - Analytic goal: Assess whether viewership is affected by public sentiment

In [0]:
def disp_normalize_mse(mse, divisor):
    """Print ``mse`` divided by ``divisor``, formatted to three decimals.

    Args:
        mse: Mean squared error value to scale.
        divisor: Value ``mse`` is divided by (callers in this notebook pass
            the mean of ``views_gained``).

    Returns:
        None. The scaled value is only written to stdout.
    """
    normalized = mse / divisor
    print(f"Normalized MSE: {normalized:.3f}")

In [0]:
# Wall-clock start; read again in the final timing cell to report elapsed time.
import time
start = time.time()

In [0]:
# Get the active Spark session (creating a default one if none exists) and
# keep a handle to its SparkContext.
# NOTE(review): the next cell calls the builder again with extra .config(...)
# options. Because a session already exists by then, launch-time options such
# as spark.jars.packages may not take effect — confirm this cell is needed.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

In [0]:
# Build (or reuse) the Spark session for this notebook, requesting the MongoDB
# connector package and generous network/heartbeat timeouts.
# NOTE(review): spark.jars.packages is resolved when the session is first
# launched; if a session already exists, getOrCreate() returns it and the
# connector may not be loaded — confirm it is on the classpath.
ss = (
    SparkSession.builder.appName("rogan_sentiment_analysis")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .config("spark.network.timeout", "36000000s")
    .config("spark.executor.heartbeatInterval", "3600s")
    .getOrCreate()
)
# Later cells refer to the session as `spark`; bind it explicitly so the
# notebook does not depend on a runtime-injected global of that name.
spark = ss
sc = ss.sparkContext

In [0]:
# S3 credentials. ACCESS_KEY / SECRET_KEY are not defined anywhere in this
# notebook — presumably redacted placeholders; supply real values before running.
aws_access_key = ACCESS_KEY
aws_secret_key = SECRET_KEY

hadoop_conf = spark._jsc.hadoopConfiguration()
# NOTE(review): this cell previously also wrote "spark.jars.packages" into the
# Hadoop configuration. That key is a Spark launch-time setting that Hadoop
# never reads, so it could not load hadoop-aws. If S3A access is needed, add
# "org.apache.hadoop:hadoop-aws:3.3.1" to spark.jars.packages on the builder.

# Hadoop reads the un-prefixed key; the "spark.hadoop." prefix is only
# meaningful when the property is set on the Spark configuration.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.access.key", aws_access_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)

In [0]:
# MongoDB connection settings. The upper-case names are not defined in this
# notebook — presumably redacted placeholders to be filled in before running.
database, collection = DATABASE, COLLECTION
user_name, password = USER_NAME, PASSWORD
address = ADDRESS

# SRV-style URI: credentials, then host, then "<database>.<collection>".
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"

In [0]:
# Expected layout of one document: (field name, Spark type, nullable).
# NOTE(review): no later cell in this notebook references rogan_schema; the
# Mongo read below is issued without .schema(...) — confirm whether the schema
# was meant to be applied there.
_rogan_fields = [
    ("week", DateType(), False),
    ("views_gained", IntegerType(), False),
    ("date", DateType(), False),
    ("text", StringType(), True),
    ("likes", IntegerType(), True),
    ("text_length", IntegerType(), True),
    ("hf_emot_label", StringType(), True),
    ("vader_sentiment_score", FloatType(), True),
]
rogan_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _rogan_fields]
)

In [0]:
# NOTE(review): this cell repeats, line for line, the connection-settings cell
# that appears before the schema definition; one of the two is redundant.
# The upper-case names are not defined in this notebook — presumably redacted
# placeholders.
database = DATABASE
collection = COLLECTION
user_name = USER_NAME
password = PASSWORD
address = ADDRESS
# SRV-style URI: credentials, then host, then "<database>.<collection>".
connection_string = (
    f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
)

In [0]:
# Load the collection through the "mongo" data source and cache it, since the
# frame is reused by several later cells; then preview it.
df = (
    spark.read.format("mongo")
    .option("uri", connection_string)
    .load()
    .cache()
)
df.show()

In [0]:
# Discard the "_id" column; it is not used by any later cell.
df = df.drop("_id")

In [0]:
# Show the column names and types of the loaded frame.
df.printSchema()

In [0]:
# One-row frame holding the mean of views_gained over the full dataset.
# The name view_mean is re-bound in the next two cells (to a list of rows,
# then to a scalar), so these three cells must be run once, in order.
view_mean = df.agg({"views_gained":"mean"})

In [0]:
# Bring the aggregated row(s) back to the driver; view_mean is now a list.
view_mean = view_mean.collect()

In [0]:
# Unwrap the single aggregated value: first field of the first collected row.
view_mean = view_mean[0][0]

In [0]:
# Preview the frame before the unused columns are dropped.
df.show()

In [0]:
# Remove the columns that are not used as model inputs.
df = df.drop('week', 'date', 'text')

In [0]:
# Preview the frame after 'week', 'date' and 'text' have been dropped.
df.show()

In [0]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# 80/20 train/test split with a fixed seed; both parts are cached because
# they are transformed and reused by every model below.
df_split = df.randomSplit([0.8, 0.2], seed=1)
df_train, df_test = (part.cache() for part in df_split)

In [0]:
# Preview the training split.
df_train.show()

In [0]:
# String-valued columns that need a numeric encoding.
cat_cols = ['hf_emot_label']

# Replace each categorical column by its label index. The indexer is fitted on
# the training split only, and the same fitted model is applied to both splits.
for col in cat_cols:
    indexed_name = col + "-onehot"  # temporary output column, renamed back below
    si = StringIndexer(inputCol=col, outputCol=indexed_name, handleInvalid="keep")
    sm = si.fit(df_train)
    df_train, df_test = (
        sm.transform(frame).drop(col).withColumnRenamed(indexed_name, col)
        for frame in (df_train, df_test)
    )

In [0]:
# Labels learned by the last StringIndexer model fitted on the training split.
sm.labels

In [0]:
# Preview the training split after string indexing.
df_train.show()

In [0]:
# One-hot encode each indexed categorical column in place. The encoder is
# fitted on the training split and applied unchanged to both splits.
for col in cat_cols:
    encoded_name = col + "-onehot"  # temporary output column, renamed back below
    oi = OneHotEncoder(inputCol=col, outputCol=encoded_name)
    om = oi.fit(df_train)
    df_train, df_test = (
        om.transform(frame).drop(col).withColumnRenamed(encoded_name, col)
        for frame in (df_train, df_test)
    )

In [0]:
# Category sizes recorded by the last fitted OneHotEncoder model.
om.categorySizes

In [0]:
# Preview the training split after one-hot encoding.
df_train.show()

In [0]:
# Every remaining column except the target 'views_gained' is a model input;
# the assembler packs them into a single "features" vector column.
input_cols = list(df_train.columns)
input_cols.remove('views_gained')
va = VectorAssembler(inputCols=input_cols, outputCol="features")

In [0]:
# Show which columns feed the feature vector.
input_cols

In [0]:
# Reduce both splits to the two columns Spark ML estimators expect by default:
# the assembled "features" vector and the target, exposed as "label".
df_train, df_test = (
    va.transform(frame).select("features", F.col("views_gained").alias("label"))
    for frame in (df_train, df_test)
)

In [0]:
# Preview the assembled training rows without truncating the feature vectors.
df_train.show(truncate=False)

In [0]:
# Random forest: fit on the training split, predict on the held-out split.
# A fixed seed (the same value used for the train/test split) is passed so
# the forest, and therefore the reported MSE, is reproducible across runs.
rf = RandomForestRegressor(maxDepth=3, numTrees=20, minInstancesPerNode=2, seed=1)
rfmodel = rf.fit(df_train)
rfpredicts = rfmodel.transform(df_test)

# Mean squared error of the predictions against the "label" column.
metric_name = "mse"
metrics = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName=metric_name
)

rf_mse = metrics.evaluate(rfpredicts)
# Report the test MSE scaled by the mean of views_gained.
disp_normalize_mse(rf_mse, view_mean)

In [0]:
# Preview the random forest predictions on the test split.
rfpredicts.show()

In [0]:
# Per-feature importances of the fitted random forest, in "features" vector order.
rfmodel.featureImportances

# Gradient Boosted Trees MSE

In [0]:
# Gradient-boosted trees: fit on the training split, predict on the held-out
# split. (A default-constructed GBTRegressor that was immediately overwritten
# has been removed; only this configured estimator was ever used.)
gb = GBTRegressor(maxDepth=2, maxIter=10, stepSize=0.01)
gbmodel = gb.fit(df_train)
gbpredicts = gbmodel.transform(df_test)

# Mean squared error of the predictions against the "label" column.
metric_name = "mse"
metrics = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName=metric_name
)

gb_mse = metrics.evaluate(gbpredicts)
# Report the test MSE scaled by the mean of views_gained.
disp_normalize_mse(gb_mse, view_mean)

In [0]:
# Per-feature importances of the fitted gradient-boosted model.
gbmodel.featureImportances

# Linear Regression

In [0]:
# Quick look at three training rows before fitting the linear model.
df_train.show(3)

In [0]:
# Fit a linear regression with default parameters on the training split.
lr = LinearRegression()
lrmodel = lr.fit(df_train)

# Summary of the fit on the training data (df_train), not on the test split.
trainingSummary = lrmodel.summary

In [0]:
# Fitted coefficients, one per entry of the "features" vector.
lrmodel.coefficients

In [0]:
# Score on the held-out split so this figure is comparable with the random
# forest and gradient-boosted cells, which both evaluate on df_test.
# (trainingSummary only describes the fit on df_train.)
lr_mse = lrmodel.evaluate(df_test).meanSquaredError

In [0]:
# Report the linear regression MSE scaled by the mean of views_gained.
disp_normalize_mse(lr_mse, view_mean)

In [0]:
# Report wall-clock time since `start` was recorded. `start` is set before the
# Spark session is created and the data is loaded, so this figure covers the
# whole notebook run, not only model training and evaluation.
end = time.time()
print(f"{end - start:.3f} seconds to train and evaluate models")

In [0]:
# Shut down the Spark context and session once all results have been reported.
sc.stop()
ss.stop()