In [9]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import *
import findspark
from pyspark.sql import SparkSession

In [10]:
# Locate the Spark installation
findspark.init()

spark = SparkSession.builder\
    .appName("DuplicateRemovalAndMissingDataHandling")\
    .master("local[*]")\
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g")\
    .config("spark.storage.memoryFraction", "0.5")\
    .config("spark.shuffle.memoryFraction", "0.5")\
    .config("spark.driver.maxResultSize", "0")\
    .getOrCreate()

In [11]:

ratings_schema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
])

info_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("image", StringType(), True),
    StructField("previewLink", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("infoLink", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("ratingsCount", IntegerType(), True)
])


ratings_df = spark.read.csv('hdfs://localhost:9900/user/davideligari/book-reviews/books_rating.csv', header=True,schema=ratings_schema)
info_df = spark.read.csv('hdfs://localhost:9900/user/davideligari/book-reviews/books_data.csv', header=True,schema=info_schema)


In [12]:
ratings_df.limit(5).toPandas()

                                                                                

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,,"""Jim of Oz """"jim-of-oz""""""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"""If people become the books they read and if ""..."
3,826414346,Dr. Seuss: American Icon,,,"""Roy E. Perry """"amateur philosopher""""""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,,"""D. H. Richards """"ninthwavestore""""""",3/3,4.0,1107993600,Good academic overview,"""Philip Nel - Dr. Seuss: American IconThis is ..."


In [13]:
info_df.limit(5).toPandas()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,"""Philip Nel takes a fascinating look into the ...",like that of Lewis Carroll and Edward Lear,has changed language itself,"giving us new words like """"nerd."""" And Seuss'...",inspiring artists like filmmaker Tim Burton a...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [14]:
# Drop duplicates from ratings DataFrame based on ID
ratings_df = ratings_df.dropDuplicates(subset=["Id"])

# Fill missing values in ratings DataFrame with default values
ratings_df = ratings_df.fillna({"Price": 0.0, "review/score": 0.0})

# Drop duplicates from info DataFrame based on Title
info_df = info_df.dropDuplicates(subset=["Title"])

# Fill missing values in info DataFrame with default values
info_df = info_df.fillna({"authors": "Unknown", "categories": "Unknown"})

# Perform the join operation on the Title column
joined_df = ratings_df.join(info_df, on="Title", how="inner")

# Select desired columns from the joined DataFrame
selected_columns = [
    "Title", "Price", "User_id", "profileName",
    "review/helpfulness", "review/score",
    "authors", "categories"
]
result_df = joined_df.select(selected_columns)

# Show the resulting DataFrame
result_df.show()

# Stop the Spark session
spark.stop()

                                                                                

+--------------------+-----+-------+--------------------+------------------+------------+--------------------+--------------------+
|               Title|Price|User_id|         profileName|review/helpfulness|review/score|             authors|          categories|
+--------------------+-----+-------+--------------------+------------------+------------+--------------------+--------------------+
|"""Billboard"" Bo...|  0.0|   null| Lawrance M. Bernabo|             12/12|         4.0|             country|http://books.goog...|
|"""Carefree"" (R....|  0.0|   null|"Patricia R. Ande...|               0/0|         5.0|['Allan Scott', '...|             Unknown|
|          """Gizelle|  0.0|   null|       A62G4QX6XQVLP|          Anyechka|         0.0|Tells the story o...|http://books.goog...|
|"""Glory is a-com...|  0.0|   null|                Dave|               1/1|         3.0|['Martha Peterson...|         ['Indiana']|
|"""I Do""...Weddi...|  0.0|   null|                null|               0/0|