## Exploration of map reduce result

---

### Import Libraries

In [1]:
# import libraries
import findspark
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession


### Initialize Spark

In [2]:
# Locate the spark installation
findspark.init()

# Initialize a SparkContext
spark = SparkSession.builder.appName("data_cleaning").getOrCreate()
spark.stop()
sc = ps.SparkContext(appName="prior_analysis")
# Initialize the Session
spark_session = ps.sql.SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/08 12:16:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Connect and import data from HDFS directly into a Spark DataFrame

In [3]:
# Title, description, authors, publisher, publishedDate, categories, Price, User_id, profileName, review/helpfulness, review/score, review/time, review/summary, review/text

joined_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
    ])

df_join_result = spark_session.read.csv('hdfs://localhost:9900/user/book_reviews/joined_tables', schema=joined_schema, sep='\t')
df_join_result.printSchema()
df_join_result.describe().show()
df_join_result.show(5)

root
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- User_id: integer (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: float (nullable = true)
 |-- review/time: integer (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)



23/09/08 12:16:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+--------------------+--------------------+------------------+--------------------+------------------+--------------------+------------------+-------+------------+------------------+------------------+--------------------+--------------------+--------------------+
|summary|               Title|         description|           authors|           publisher|     publishedDate|          categories|             Price|User_id| profileName|review/helpfulness|      review/score|         review/time|      review/summary|         review/text|
+-------+--------------------+--------------------+------------------+--------------------+------------------+--------------------+------------------+-------+------------+------------------+------------------+--------------------+--------------------+--------------------+
|  count|                1898|                1855|              1893|                1790|              1898|                1898|              1898|      0|        1754|          

In [4]:
tmp = df_join_result.limit(300).toPandas()
tmp.head(300)

# Title, description, authors, publisher, publishedDate, categories, Price, User_id, profileName, review/helpfulness, review/score, review/time, review/summary, review/text


Unnamed: 0,Title,description,authors,publisher,publishedDate,categories,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,"@Large, Vol. 1","An Internet cafe and recording studio, known a...",['Ahmed Hoke'],TOKYOPOP,2019-12-20,['Comics & Graphic Novels'],9.99,,"Del Keyes ""Elaborate Chattering Nut""",3/5,3.0,1072828800,"Talk is cheap, punk, and so is reading @large!",Pro illustrator Ahmed Hoke has really got my a...
1,"@Large, Vol. 1","An Internet cafe and recording studio, known a...",['Ahmed Hoke'],TOKYOPOP,2019-12-20,['Comics & Graphic Novels'],9.99,,,0/1,5.0,1073001600,Hip Hop and Art,@Large is an intellectual and artistic explora...
2,All Wired Up: Wire Techniques For the Beadwork...,The step-by-step techniques featured in this r...,['Mark Lareau'],Interweave,2000-10-01,['Crafts & Hobbies'],14.93,,D. Riley,0/0,4.0,1185494400,All Wired Up,I like this book. The author put a lot of work...
3,All Wired Up: Wire Techniques For the Beadwork...,The step-by-step techniques featured in this r...,['Mark Lareau'],Interweave,2000-10-01,['Crafts & Hobbies'],14.93,,SLG,14/15,5.0,1127952000,This book is great!!,I absolutely love this book! It has everything...
4,All Wired Up: Wire Techniques For the Beadwork...,The step-by-step techniques featured in this r...,['Mark Lareau'],Interweave,2000-10-01,['Crafts & Hobbies'],14.93,,,11/11,5.0,1042761600,Excellent book,"What a great instructive book -- clear, concis..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,23 Minutes In Hell: One Man's Story About What...,New York Times Best Seller and Over 1 million ...,['Bill Wiese'],Charisma Media,2010-09-24,['Religion'],10.39,,T. Wall,1/2,5.0,1233532800,Yes Hell is Real,This book which expresses the experience of on...
296,23 Minutes In Hell: One Man's Story About What...,New York Times Best Seller and Over 1 million ...,['Bill Wiese'],Charisma Media,2010-09-24,['Religion'],10.39,,T. Phelps,1/2,5.0,1245974400,Awesome,I received this book in a timely manner. I had...
297,23 Minutes In Hell: One Man's Story About What...,New York Times Best Seller and Over 1 million ...,['Bill Wiese'],Charisma Media,2010-09-24,['Religion'],10.39,,"Eva Thibodeaux ""Nini Thibodeaux""",1/2,5.0,1253232000,great book,Great book! If at the moment you are not livin...
298,23 Minutes In Hell: One Man's Story About What...,New York Times Best Seller and Over 1 million ...,['Bill Wiese'],Charisma Media,2010-09-24,['Religion'],10.39,,No Big E,1/2,5.0,1260316800,Very Good,It was a scary read through the first 3 chapte...


In [5]:
spark_session.stop()