# 1. Package installation and imports



In [1]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 43 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 47.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=2d3a3f5a755b1a92d541eb04ebf09a0453d789f7d158c3ff0cb74de83dc6f77c
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
import pandas as pd
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session, registered under the application name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [3]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV path read below lives under it.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# 2. Data visualization

In [4]:
# Read the Kindle reviews CSV from the mounted Drive. The first row is treated
# as the header and Spark infers the column types.
text_df = spark.read.csv(
    "/content/drive/MyDrive/cs631_final_project/data/kindle_reviews.csv",
    sep=',',
    header=True,
    inferSchema=True,
)

# Keep the rating, date, helpfulness and text columns, then drop every row that
# has a null in any of them. (For a quick trial run, add .limit(1000) after the
# select.)
# NOTE(review): a later cell discards rows whose reviewTime is not a date, which
# suggests some records may be mis-split by the CSV parser -- check whether the
# quote/escape/multiLine reader options are needed for this file.
df_raw = (
    text_df
    .select('overall', 'reviewTime', 'helpful', 'reviewText')
    .dropna(how='any')
)

In [5]:
# Print the column names and Spark-inferred types of the selected columns.
df_raw.printSchema()

root
 |-- overall: integer (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- helpful: string (nullable = true)
 |-- reviewText: string (nullable = true)



In [6]:
# Preview the first five rows (long strings are truncated by show()).
df_raw.show(n=5)

+-------+-----------+-------+--------------------+
|overall| reviewTime|helpful|          reviewText|
+-------+-----------+-------+--------------------+
|      5| 05 5, 2014| [0, 0]|I enjoy vintage b...|
|      4| 01 6, 2014| [2, 2]|This book is a re...|
|      4| 04 4, 2014| [2, 2]|This was a fairly...|
|      5|02 19, 2014| [1, 1]|I'd never read an...|
|      4|03 19, 2014| [0, 1]|If you like perio...|
+-------+-----------+-------+--------------------+
only showing top 5 rows



In [7]:
from pyspark.sql.functions import *

In [8]:
# Keep only rows whose reviewTime looks like "MM D, YYYY" (digits, space,
# digits, comma, space, four-digit year); rows with any other shape are dropped.
# The pattern is a raw string so the backslashes reach Spark's regex engine
# unchanged and Python does not warn about the invalid "\d" escape sequence.
pattern = r'^\d+ \d+, \d{4}$'

# regexp_extract yields '' for values that do not match, so filtering out the
# empty strings removes the non-conforming rows.
df_raw = df_raw.withColumn('reviewTime', regexp_extract('reviewTime', pattern, 0))
df_raw = df_raw.filter(df_raw['reviewTime'] != '')

# Average star rating per review date. reviewTime is still a string here, so
# orderBy sorts it lexicographically rather than chronologically.
# NOTE: df_raw is rebound to the aggregated frame (the 'overall' column no
# longer exists), so re-running this cell requires re-running the loading cell
# first.
df_raw = df_raw.groupby('reviewTime').agg(avg("overall").alias("avg_rating")).orderBy("reviewTime")
df_raw.show(5)

+----------+------------------+
|reviewTime|        avg_rating|
+----------+------------------+
|01 1, 2010|               3.0|
|01 1, 2011| 3.909090909090909|
|01 1, 2012|         3.9296875|
|01 1, 2013|4.3095854922279795|
|01 1, 2014| 4.314896291640478|
+----------+------------------+
only showing top 5 rows



In [9]:
# Collect the Spark DataFrame into a pandas DataFrame for date parsing and
# plotting. At this point df_raw holds one row per reviewTime with its avg_rating.
df_pd = df_raw.toPandas()

In [10]:
# Parse the "MM D, YYYY" strings into datetimes (values that do not fit the
# format become NaT), drop rows with missing values, and sort chronologically.
df_pd = (
    df_pd
    .assign(
        reviewTime=lambda frame: pd.to_datetime(
            frame['reviewTime'], format='%m %d, %Y', errors='coerce'
        )
    )
    .dropna()
    .sort_values(by=['reviewTime'])
)

In [11]:
pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Inspect the parsed review dates (sorted ascending by the previous cell).
df_pd['reviewTime']

484    2000-03-05
686    2000-04-09
1042   2000-06-06
1232   2000-07-07
1728   2000-11-11
          ...    
1134   2014-07-19
1147   2014-07-20
1154   2014-07-21
1160   2014-07-22
1166   2014-07-23
Name: reviewTime, Length: 2038, dtype: datetime64[ns]

In [13]:
# Inspect the per-date average ratings, in the same row order as the dates.
df_pd['avg_rating']

484     5.000000
686     5.000000
1042    5.000000
1232    4.000000
1728    5.000000
          ...   
1134    4.479143
1147    4.477612
1154    4.492108
1160    4.466408
1166    4.517986
Name: avg_rating, Length: 2038, dtype: float64

In [14]:
import plotly.graph_objects as go

import pandas as pd

# One scatter trace: review date on the x axis, average rating on the y axis.
fig = go.Figure(
    data=[
        go.Scatter(x=list(df_pd['reviewTime']), y=list(df_pd['avg_rating'])),
    ]
)

# Title, date-typed x axis, range slider, and quick-range selector buttons
# (1 month, 6 months, year-to-date, 1 year, everything).
# NOTE(review): the title looks like the generic one from the plotly
# range-slider example; consider a title that describes this data.
fig.update_layout(
    title_text="Time series with range slider and selectors",
    xaxis={
        "type": "date",
        "rangeslider": {"visible": True},
        "rangeselector": {
            "buttons": [
                {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
                {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
                {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
                {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
                {"step": "all"},
            ],
        },
    },
)

# Save a standalone interactive copy next to the notebook, then render inline.
fig.write_html("interactive_plot.html")
fig.show()