## Exploration of the MapReduce result

---

### Import Libraries

In [1]:
# import libraries
import findspark
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession


### Initialize Spark

In [2]:
# Locate the Spark installation
findspark.init()

# Obtain a session (re-using an existing one if there is one) and stop it
# immediately; `spark` therefore refers to a stopped session afterwards.
# NOTE(review): presumably this clears any Spark context still alive in the
# kernel so that the SparkContext below can be created — confirm the intent.
spark = SparkSession.builder.appName("data_cleaning").getOrCreate()
spark.stop()
# Create the SparkContext for this notebook
sc = ps.SparkContext(appName="prior_analysis")
# Wrap the context in the SparkSession that the CSV reads further down use
spark_session = ps.sql.SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/11 12:02:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Connect and import data from HDFS directly into a Spark DataFrame

## Check missing titles

In [3]:
# Explicit schemas for the three tab-separated inputs. Every column is
# declared nullable; each schema is built from (column name, Spark type) pairs.
# NOTE(review): `User_id` and `review/time` are declared as IntegerType —
# confirm that the cleaned files hold purely numeric values in these columns.

# Joined table: book metadata columns followed by the review columns.
joined_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ("Title", StringType()),
        ("description", StringType()),
        ("authors", StringType()),
        ("publisher", StringType()),
        ("publishedDate", StringType()),
        ("categories", StringType()),
        ("Price", FloatType()),
        ("User_id", IntegerType()),
        ("profileName", StringType()),
        ("review/score", FloatType()),
        ("review/time", IntegerType()),
        ("review/summary", StringType()),
        ("review/text", StringType()),
        ("N_helpful", IntegerType()),
        ("Tot_votes", IntegerType()),
    )
])

# Book metadata table: all columns are strings.
data_schema = StructType([
    StructField(column, StringType(), True)
    for column in (
        "Title",
        "description",
        "authors",
        "publisher",
        "publishedDate",
        "categories",
    )
])

# Ratings table: one row per review.
ratings_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ("Title", StringType()),
        ("Price", FloatType()),
        ("User_id", IntegerType()),
        ("profileName", StringType()),
        ("review/helpfulness", StringType()),
        ("review/score", FloatType()),
        ("review/time", IntegerType()),
        ("review/summary", StringType()),
        ("review/text", StringType()),
    )
])

In [4]:
# All three inputs live under the same HDFS directory.
_HDFS_ROOT = 'hdfs://localhost:9900/user/book_reviews'


def _read_tsv(relative_path, schema):
    """Read a tab-separated file under ``_HDFS_ROOT`` into a Spark DataFrame.

    Parameters
    ----------
    relative_path : str
        Path of the file or directory below ``_HDFS_ROOT``.
    schema : StructType
        Explicit schema applied to the columns.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    return spark_session.read.csv(f'{_HDFS_ROOT}/{relative_path}', schema=schema, sep='\t')


df_join = _read_tsv('joined_tables', joined_schema)
df_data = _read_tsv('books_data_cleaned.csv', data_schema)
df_rating = _read_tsv('books_rating_cleaned.csv', ratings_schema)

In [5]:
def _distinct_sorted_titles(spark_df):
    """Return the distinct ``Title`` values of a Spark DataFrame, sorted, as pandas.

    Deduplication is done in Spark (``distinct()``) before ``toPandas()``, so
    only one row per title is transferred to the driver instead of one row per
    record of the source table.

    Parameters
    ----------
    spark_df : pyspark.sql.DataFrame
        DataFrame that has a ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``Title``) without duplicates, sorted ascending.
        The index is the one assigned by ``toPandas()`` to the distinct rows,
        re-ordered by the sort; it is not reset here.
    """
    return (
        spark_df.select('Title')
        .distinct()
        .toPandas()
        .sort_values(by='Title', ascending=True)
    )


df_join_title = _distinct_sorted_titles(df_join)
df_data_title = _distinct_sorted_titles(df_data)
df_rating_title = _distinct_sorted_titles(df_rating)

                                                                                

In [6]:
# Report the number of distinct titles found in each table
for label, titles in (
    ('join:', df_join_title),
    ('data:', df_data_title),
    ('rating:', df_rating_title),
):
    print(label, titles.shape)

join: (171202, 1)
data: (212400, 1)
rating: (212400, 1)


In [14]:
# Titles that occur in both the books-data table and the ratings table
df_data_rating = pd.merge(
    df_data_title, df_rating_title, how='inner', on='Title')

# Sort first and reset the index afterwards, so that both frames end up with a
# sequential 0..n-1 index in title order (resetting before sorting would leave
# the index in pre-sort order).
df_data_rating = df_data_rating.sort_values(by='Title', ascending=True).reset_index(drop=True)
df_join_title = df_join_title.sort_values(by='Title', ascending=True).reset_index(drop=True)

# Left merge on the title; the indicator column `_merge` records whether each
# title was found in both frames or only in the left one.
merged = pd.merge(df_data_rating, df_join_title, on='Title', how='left', indicator=True)

# Keep the rows flagged 'left_only': titles present in both source tables
# but absent from the joined table.
result = merged[merged['_merge'] == 'left_only']

# The indicator column is not needed in the final result
missed_titles = result.drop(columns=['_merge'])

# missed_titles now holds the titles of df_data_rating that are not in df_join_title
missed_titles.head(300)

Unnamed: 0,Title
0,#1 Bimini Road
1,#1 Casca The Eternal Mercenary
2,#1 Web Marketing : Achieving Top 10 Rankings i...
5,$5 Dollar Menus for Two
6,&#161;Buen viaje! Level 2 Student Edition (Gle...
...,...
1694,7 Secrets Of Highly Successful Kids (Millenniu...
1696,7 Simple Steps to Unclutter Your Life
1698,7 Steps to Fearless Speaking
1703,7 Trips Through Time and Space
