In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np
import pandas as pd


# Preprocessing 

In [7]:
#create py_spark context
spark = SparkSession.builder.appName("Amazon Reviews").\
config("spark.memory.offHeap.enabled","true").\
config("spark.memory.offHeap.size","10g").\
getOrCreate()

24/04/19 11:02:35 WARN Utils: Your hostname, Bryan resolves to a loopback address: 127.0.1.1; using 172.20.218.41 instead (on interface eth0)
24/04/19 11:02:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/19 11:02:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
#import data
df = spark.read.csv('data/amazon_reviews.csv', header = True)

                                                                                

In [9]:
df.show(5,0)

+---+------------+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------+-----------+----------+----------+------------------+--------------------+------------------+
|_c0|reviewerName|overall|reviewText                                                                                                                                                                                                                                                                                                                                          |reviewTime|day_diff|helpful_yes|helpful_no|total_vote|score_pos_neg_diff|score_average_rating|wilson_lower_bound|
+---+------------+-------+------------

24/04/19 11:02:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , reviewerName, overall, reviewText, reviewTime, day_diff, helpful_yes, helpful_no, total_vote, score_pos_neg_diff, score_average_rating, wilson_lower_bound
 Schema: _c0, reviewerName, overall, reviewText, reviewTime, day_diff, helpful_yes, helpful_no, total_vote, score_pos_neg_diff, score_average_rating, wilson_lower_bound
Expected: _c0 but found: 
CSV file: file:///home/bryanhurtado/projects/Sentiment_analysis_amazon_reviews/data/amazon_reviews.csv


In [10]:
#count the total of information
df.count()

4915

In [11]:
#review columns and types
df.columns

['_c0',
 'reviewerName',
 'overall',
 'reviewText',
 'reviewTime',
 'day_diff',
 'helpful_yes',
 'helpful_no',
 'total_vote',
 'score_pos_neg_diff',
 'score_average_rating',
 'wilson_lower_bound']

In [12]:
len(df.columns)

12

In [13]:
df.dtypes

[('_c0', 'string'),
 ('reviewerName', 'string'),
 ('overall', 'string'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('day_diff', 'string'),
 ('helpful_yes', 'string'),
 ('helpful_no', 'string'),
 ('total_vote', 'string'),
 ('score_pos_neg_diff', 'string'),
 ('score_average_rating', 'string'),
 ('wilson_lower_bound', 'string')]

In [14]:
#delete columns which are not neccesary
columns_to_drop = ["_c0","reviewerName","reviewTime","day_diff",
                   "helpful_yes","helpful_no","total_vote",
                   "score_pos_neg_diff","score_average_rating",
                  "wilson_lower_bound"]
df = df.drop(*columns_to_drop)

In [15]:
df.show(5,0)

+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|overall|reviewText                                                                                                                                                                                                                                                                                                                                          |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Review scores

In [16]:
#group by scores
df.groupby("overall").agg(count("reviewText")).show()

+--------------------+-----------------+
|             overall|count(reviewText)|
+--------------------+-----------------+
|                 1.0|              244|
|          Luthier"""|                1|
|         Cut Once"""|                1|
|             Kate"""|                1|
| as you have been...|                1|
|                 5.0|             3906|
|        newer car"""|                1|
|           Author"""|                1|
|                 4.0|              526|
|but love books......|                1|
| No Longer the On...|                1|
|          Realtor"""|                1|
|          we hav..."|                1|
|                 2.0|               80|
|                   "|                1|
|  shopaholic at ..."|                1|
|               MD"""|                1|
|              me?"""|                1|
|                K"""|                1|
|                 3.0|              142|
+--------------------+-----------------+
only showing top

In [17]:
#delete the ones with mistakes on the overall column
df = df.filter(col("overall").isin([1.0,2.0,3.0,4.0,5.0]))

In [18]:
df.show(5,0)

+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|overall|reviewText                                                                                                                                                                                                                                                                                                                                          |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
df.groupby("overall").agg(count("reviewText")).show()

+-------+-----------------+
|overall|count(reviewText)|
+-------+-----------------+
|    1.0|              244|
|    5.0|             3906|
|    4.0|              526|
|    2.0|               80|
|    3.0|              142|
+-------+-----------------+



## Review Texts

In [20]:
#examples
df.show(2,0)

+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
|overall|reviewText                                                                                                                                                    |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
|4.0    |No issues.                                                                                                                                                    |
|5.0    |Purchased this for my device, it worked as advertised. You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.|
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------

## Save Dataset for training

In [21]:
df = df.toPandas()

In [22]:
df.head(5)

Unnamed: 0,overall,reviewText
0,4.0,No issues.
1,5.0,"Purchased this for my device, it worked as adv..."
2,4.0,it works as expected. I should have sprung for...
3,5.0,This think has worked out great.Had a diff. br...
4,5.0,"Bought it with Retail Packaging, arrived legit..."


In [26]:
df.isnull().value_counts()

overall  reviewText
False    False         4898
         True             1
Name: count, dtype: int64

In [28]:
df = df.dropna()

In [29]:
df.isnull().value_counts()

overall  reviewText
False    False         4898
Name: count, dtype: int64

In [30]:
df.to_csv('ml_data/amazon_ml.csv', index=False)  