# Importing Libraries and Spark Session Initialisation

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import datetime

import plotly.graph_objects as go

import plotly.graph_objs as go
import plotly.subplots

from __future__ import print_function

import sys

# from pyspark import SparkContext, SQLContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer

from pymongo import MongoClient
import time
import requests
import json

import os
import webbrowser


# Obtain the notebook-wide Spark session (getOrCreate reuses one if it already exists).
# NOTE(review): the app name "Word count" looks like a leftover from an example — confirm.
spark = (
    SparkSession.builder
    .appName("Word count")
    .getOrCreate()
)



# Scraping Review Data

In [2]:
start_time = time.time()
game_id = '945360'  # sample game id; refer to the Steam website for the appropriate game id

# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error into an explicit exception instead of a
# confusing JSON/KeyError failure further down.
response = requests.get(
    'https://store.steampowered.com/appreviewhistogram/' + game_id,
    timeout=30,
)
response.raise_for_status()
ress = response.json()


def _review_frames(entries):
    """Split histogram entries into (positive, negative) pandas DataFrames.

    Each entry is expected to be a mapping with the keys ``date``,
    ``recommendations_up`` and ``recommendations_down``.

    Both returned frames use the columns ``date`` and ``up``. The count column
    is named ``up`` even for the negative series because ``lin_reg_plot`` reads
    the value to model from a column with that name.
    """
    dates = [entry['date'] for entry in entries]
    positive = pd.DataFrame(
        list(zip(dates, [entry['recommendations_up'] for entry in entries])),
        columns=['date', 'up'],
    )
    negative = pd.DataFrame(
        list(zip(dates, [entry['recommendations_down'] for entry in entries])),
        columns=['date', 'up'],
    )
    return positive, negative


rec_up_dataa, rec_down_dataa = _review_frames(ress['results']['recent'])
roll_up_dataa, roll_down_dataa = _review_frames(ress['results']['rollups'])

print("--- %s seconds ---" % (time.time() - start_time))


--- 1.0612494945526123 seconds ---


# Fetching Relevant Data from MongoDB Atlas Cluster 

In [3]:
start_time = time.time()
# The connection string is read from the MONGODB_URI environment variable so that
# credentials never have to be written into the notebook; the original placeholder
# is kept as the fallback when the variable is not set.
client = MongoClient(os.environ.get("MONGODB_URI", "MongoDB Collection URL Here"))
db = client.get_database('steam_data')
coll = db.game_data
# Project only the "name" field of the document whose url_info.id matches game_id.
cur = coll.find_one({"url_info.id": game_id}, {"name": 1})
if cur is None:
    # find_one returns None when nothing matches; fail with a clear message
    # rather than a TypeError on the subscript below.
    raise LookupError("No game with id %r found in steam_data.game_data" % game_id)
game_name = cur['name']
print(game_name)

'Among Us'

# Linear and Decision Tree Regression Implementation 

In [4]:
def lin_reg_plot(spark, dfname):
    """Fit a linear regression and a depth-2 decision tree to one series and build its traces.

    Parameters
    ----------
    spark : SparkSession
        Session used to convert ``dfname`` into a Spark DataFrame.
    dfname : pandas.DataFrame
        Frame with a numeric ``date`` column and an ``up`` column holding the
        value to model. ``date`` is passed to ``datetime.fromtimestamp``, so it
        is presumably a Unix timestamp in seconds — confirm against the data source.

    Returns
    -------
    list
        ``[fig, data_trace, dt_trace, lr_trace]``: a standalone Plotly figure
        followed by the three scatter traces it contains (observed points,
        decision-tree predictions, linear-regression predictions).

    Notes
    -----
    Both models are fitted and evaluated on the same rows, so the RMSE printed
    for the decision tree is an in-sample error even though the message says
    "test data".
    """
    data = spark.createDataFrame(dfname)
    labelled = data.select(data.date, data.up.alias('label'))

    # Keep "date" next to "features" so that the x and y values of every trace
    # can be read from the same rows instead of being paired by position across
    # independently collected DataFrames.
    assembler = VectorAssembler(inputCols=['date'], outputCol='features')
    assembled = assembler.transform(labelled)
    training = assembled.select('features', 'label')

    # Linear regression on the single "date" feature.
    lr_model = LinearRegression().fit(training)
    lr_predictions = lr_model.transform(assembled)

    # Decision tree regression; the pipeline is fitted once and reused for both
    # the RMSE evaluation and the plotted predictions.
    feature_indexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures", maxCategories=4
    ).fit(training)
    dt = DecisionTreeRegressor(maxDepth=2, featuresCol="indexedFeatures")
    dt_model = Pipeline(stages=[feature_indexer, dt]).fit(training)
    dt_predictions = dt_model.transform(assembled)

    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(dt_predictions)
    print("RMSE on test data = %g" % rmse)

    observed_rows = assembled.select('date', 'label').collect()
    dt_rows = dt_predictions.select('date', 'prediction').collect()
    lr_rows = lr_predictions.select('date', 'prediction').collect()

    p1 = go.Scatter(
        x=[datetime.datetime.fromtimestamp(row['date']) for row in observed_rows],
        y=[row['label'] for row in observed_rows],
        mode='markers',
        marker=dict(size=6, color='darkorange', line=dict(width=2, color='DarkSlateGrey')),
        name="Data")

    p2 = go.Scatter(
        x=[datetime.datetime.fromtimestamp(row['date']) for row in dt_rows],
        y=[row['prediction'] for row in dt_rows],
        mode='lines',
        line=dict(color="cornflowerblue"),
        name="DTR Predicted")

    p3 = go.Scatter(
        x=[datetime.datetime.fromtimestamp(row['date']) for row in lr_rows],
        y=[row['prediction'] for row in lr_rows],
        mode='lines',
        line=dict(color="yellowgreen"),
        name='LR Predicted')

    fig = go.Figure(data=[p1, p2, p3])
    return [fig, p1, p2, p3]


# Plot Creation

In [5]:
def final_lin_reg(spark, df_list, game_name, final_file_name):
    """Build the 2x2 regression dashboard and write it to an HTML file.

    Parameters
    ----------
    spark : SparkSession
        Session forwarded to ``lin_reg_plot``.
    df_list : list of pandas.DataFrame
        Exactly four frames, in the same order as the subplot titles:
        all-time positive, all-time negative, recent positive, recent negative.
    game_name : str
        Text used as the overall figure title.
    final_file_name : str
        Path of the HTML file to write.

    Each frame is drawn in the panel whose title describes it. The previous
    implementation placed the frames in reverse order, so every panel showed a
    series that did not match its title.
    """
    subplot_titles = ['All-Time Positive reviews',
                      'All-Time Negative reviews',
                      'Recent Positive reviews',
                      'Recent Negative reviews']
    # (row, col) of each panel, listed in the same order as subplot_titles / df_list.
    panel_positions = [(1, 1), (1, 2), (2, 1), (2, 2)]

    fig = plotly.subplots.make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)
    for index, (grid_row, grid_col) in enumerate(panel_positions):
        # lin_reg_plot returns [figure, data, DTR, LR]; only the traces are needed here.
        traces = lin_reg_plot(spark, df_list[index])[1:]
        for trace in traces:
            fig.add_trace(trace, row=grid_row, col=grid_col)

    fig.update_layout(title_text=game_name, template="plotly_dark")
    fig.write_html(final_file_name)

# Final Step
- Plot the regression results and open the generated HTML report in the browser.

In [7]:
# Rollup frames come first, then the recent frames; positive before negative in each pair.
df_list = [
    roll_up_dataa,
    roll_down_dataa,
    rec_up_dataa,
    rec_down_dataa,
]
report_file = "final_regression.html"

# Render the dashboard to disk, then open the written file in the default browser.
final_lin_reg(spark, df_list, game_name + ' User Trend Analysis', report_file)
webbrowser.open('file://' + os.path.realpath(report_file))


Root Mean Squared Error (RMSE) on test data = 6018.77
Root Mean Squared Error (RMSE) on test data = 407.034
Root Mean Squared Error (RMSE) on test data = 980.181
Root Mean Squared Error (RMSE) on test data = 24.6779
