## Exploration of map reduce result

---

### Import Libraries

In [21]:
# import libraries
import findspark
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession


### Initialize Spark

In [22]:
# Locate the spark installation
findspark.init()

# Initialize a SparkContext
spark = SparkSession.builder.appName("data_cleaning").getOrCreate()
spark.stop()
sc = ps.SparkContext(appName="prior_analysis")
# Initialize the Session
spark_session = ps.sql.SparkSession(sc)

23/09/09 18:44:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/09/09 18:44:19 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Connect and import data from HDFS directly into a Spark DataFrame

In [23]:
# Title, description, authors, publisher, publishedDate, categories, Price, User_id, profileName, review/helpfulness, review/score, review/time, review/summary, review/text

joined_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)    
    ])

df_join_result = spark_session.read.csv('hdfs://localhost:9900/user/book_reviews/joined_tables', schema=joined_schema, sep='\t')
#df_join_result.printSchema()
#df_join_result.describe().show()

df_join_result.count()

                                                                                

2448529

In [4]:
tmp = df_join_result.limit(300).toPandas()
tmp.head(300)


Unnamed: 0,Title,description,authors,publisher,publishedDate,categories,Price,User_id,profileName,review/score,review/time,review/summary,review/text,N_helpful,Tot_votes
0,"$1,265 Gold",Profitable trade set-ups from StockTwits leadi...,"['Howard Lindzon', 'Philip Pearlman', 'Ivaylo ...",John Wiley & Sons,2011-06-09,['Business & Economics'],75.0,,Jerry Hickel,1.0,1111104000,Gold Drivel,This book earns one star merely because it p...,14,15
1,$25 and Under 1998: Your Guide to the Best Ine...,With this completely updated 4th edition of hi...,['Eric Asimov'],Harper Paperbacks,1997-08-16,['Travel'],,,,4.0,954201600,Help for Impoverished Newcomers or Curious Local,Upon deplaning in JFK I was hungry and lost. T...,0,0
2,$25 and Under 1998: Your Guide to the Best Ine...,With this completely updated 4th edition of hi...,['Eric Asimov'],Harper Paperbacks,1997-08-16,['Travel'],,,Hiram Gomez Pardo,5.0,1132790400,There is not more sincere love than the love f...,This ineffable statement of Georges Bernard Sh...,0,0
3,'A Great Effusion of Blood'?: Interpreting Med...,Exploring the issue from both historical and l...,"['Mark D. Meyerson', 'Daniel Thiery', 'Oren Fa...",University of Toronto Press,2004-01-01,['History'],74.0,,Midwest Book Review,5.0,1104969600,An invaluable tool for history and literature ...,"Compiled by academicians Mark D. Meyerson, Dan...",1,1
4,'A Hell of a Place to Lose a Cow': An American...,A noted cultural critic and NPR essayist offer...,['Tim Brookes'],,2001,['Biography & Autobiography'],,,Erik P Boucher,5.0,980294400,A Hell of a Place to Lose a Cow,Very entertaining!! After 25 years of the same...,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,(ESV) English Standard Version Large Print Bib...,"The ESV Study Bible, Large Print edition trans...",['Crossway Bibles'],,2014-07-31,['Bibles'],,,,5.0,1030579200,ESV- Highly Accurate and Readable Translation,"As a student learning Greek in College, I have...",10,10
296,(ESV) English Standard Version Large Print Bib...,"The ESV Study Bible, Large Print edition trans...",['Crossway Bibles'],,2014-07-31,['Bibles'],,,A. M. Leveille,5.0,1258329600,extras and price make it perfect as a gift,The ESV translation is recent and very good: t...,10,10
297,(ESV) English Standard Version Large Print Bib...,"The ESV Study Bible, Large Print edition trans...",['Crossway Bibles'],,2014-07-31,['Bibles'],,,gradiefrederick3,4.0,1059091200,A Great Text...but the binding still has a way...,The English Standard Version (ESV) is an excel...,17,19
298,(ESV) English Standard Version Large Print Bib...,"The ESV Study Bible, Large Print edition trans...",['Crossway Bibles'],,2014-07-31,['Bibles'],,,W. Clay Knick,5.0,1059350400,Best Edition of ESV,The ESV Deluxe in hardback and genuine leather...,14,15


In [6]:
spark_session.stop()

## Check missing titles

In [43]:
joined_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True),
    StructField("N_helpful", IntegerType(), True),
    StructField("Tot_votes", IntegerType(), True)    
    ])


data_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("categories", StringType(), True),
])

ratings_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
])

In [55]:
df_join = spark_session.read.csv('hdfs://localhost:9900/user/book_reviews/joined_tables.csv', schema=joined_schema, sep='\t')
df_data = spark_session.read.csv('hdfs://localhost:9900/user/book_reviews/books_data_cleaned.csv', schema=data_schema, sep='\t')
df_rating = spark_session.read.csv('hdfs://localhost:9900/user/book_reviews/books_rating_cleaned.csv', schema=ratings_schema, sep='\t')

In [62]:
df_join_title = df_join.select('Title').toPandas().drop_duplicates().sort_values(by='Title', ascending=True)
df_data_title = df_data.select('Title').toPandas().drop_duplicates().sort_values(by='Title', ascending=True)
df_rating_title = df_rating.select('Title').toPandas().drop_duplicates().sort_values(by='Title', ascending=True)

                                                                                

(212400, 1)

In [63]:
print('join:', df_join_title.shape)
print('data:', df_data_title.shape)
print('rating:', df_rating_title.shape)

join: (171202, 1)
data: (212400, 1)
rating: (212400, 1)


In [67]:
df_join_title.head(50)

Unnamed: 0,Title
0,"$1,265 Gold"
1,$25 and Under 1998: Your Guide to the Best Ine...
3,'A Great Effusion of Blood'?: Interpreting Med...
4,'A Hell of a Place to Lose a Cow': An American...
14,"'AN ESSAY CONCERNING HUMAN UNDERSTANDING,'"
27,'BYRON OF ''THE WAGER'''
28,'Down to Earth' Strafing Aces of the Eighth Ai...
30,"'FROM APOCALYPSE TO GENESIS: ECOLOGY, FEMINISM..."
31,'Filming of Gone With the Wind'
35,'Gator


In [68]:
df_data_title.head(50)

Unnamed: 0,Title
161739,#1 Bimini Road
47571,#1 Casca The Eternal Mercenary
110561,#1 Web Marketing : Achieving Top 10 Rankings i...
203426,"$1,265 Gold"
114917,$25 and Under 1998: Your Guide to the Best Ine...
176882,$5 Dollar Menus for Two
183034,&#161;Buen viaje! Level 2 Student Edition (Gle...
65836,&#34;I AM&#34; Discourses (Saint Germain Serie...
105061,&#34;Just&#34;
58807,&#34;Licentious Liberty&#34; in a Brazilian Go...


In [69]:
df_rating_title.head(50)

Unnamed: 0,Title
2095282,#1 Bimini Road
674628,#1 Casca The Eternal Mercenary
1554749,#1 Web Marketing : Achieving Top 10 Rankings i...
2689120,"$1,265 Gold"
1617879,$25 and Under 1998: Your Guide to the Best Ine...
2310240,$5 Dollar Menus for Two
2391369,&#161;Buen viaje! Level 2 Student Edition (Gle...
929966,&#34;I AM&#34; Discourses (Saint Germain Serie...
1478863,&#34;Just&#34;
819170,&#34;Licentious Liberty&#34; in a Brazilian Go...
