# Setting up the environment and loading data from JSON files into DataFrames

In [1]:
# Prerequisite: Spark must be installed for this notebook to run.
from pyspark.sql import SparkSession

# Create the SparkSession that the cells below use to read the JSON files.
# getOrCreate() returns an already-active session if there is one, otherwise
# it builds a new one with the settings given here.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    # NOTE(review): this option/value pair looks like a placeholder copied
    # from an example rather than a real setting -- confirm it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns

In [2]:
# Load the data: read each JSON file into its own Spark DataFrame.
# No schema is passed to the reader, so the column types are the ones Spark
# infers from the file contents. Both paths are relative (no directory part).
df_lighter_books = spark.read.json("lighter_books.json")
df_lighter_authors = spark.read.json("lighter_authors.json")

# Exploratory Data Analysis (EDA)

In [5]:
# Print the schema of the books DataFrame (column names, types, nullability)
# as a tree; nested fields such as the `authors` array of structs are indented.
df_lighter_books.printSchema()

root
 |-- asin: string (nullable = true)
 |-- author_id: long (nullable = true)
 |-- author_name: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- description: string (nullable = true)
 |-- edition_information: string (nullable = true)
 |-- format: string (nullable = true)
 |-- id: long (nullable = true)
 |-- image_url: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- language: string (nullable = true)
 |-- num_pages: long (nullable = true)
 |-- original_publication_date: string (nullable = true)
 |-- publication_date: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- rating_dist: string (nullable = true)
 |-- ratings_count: long (nullable = true)
 |-- series_id: str

In [6]:
# Print the schema of the authors DataFrame (column names, types, nullability)
# as a tree; `book_ids` and `work_ids` are arrays of strings.
df_lighter_authors.printSchema()

root
 |-- about: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- book_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fans_count: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- image_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ratings_count: long (nullable = true)
 |-- text_reviews_count: long (nullable = true)
 |-- work_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- works_count: long (nullable = true)



In [9]:
# Summary statistics of the books DataFrame, one row per statistic.
# The array columns (`authors`, `shelves`) do not appear in the result.
# show() prints the table as text and truncates long cell values.
df_lighter_books.describe().show()

+-------+--------+------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------+--------------------+--------+------------------+-------------------------+------------------+----------------------+--------------------+------------------+-----------------+-----------------------------------+-----------------+------------------+-------+--------------------+
|summary|    asin|         author_id|        author_name|    average_rating|         description| edition_information|            format|                  id|           image_url|    isbn|              isbn13|language|         num_pages|original_publication_date|  publication_date|             publisher|         rating_dist|     ratings_count|        series_id|                        series_name|  series_position|text_reviews_count|  title|             work_id|
+-------+--------+------------------+-------------------+-----------

In [10]:
# Summary statistics (count, mean, stddev, ...) of the authors DataFrame.
# The array columns (`book_ids`, `work_ids`) do not appear in the result.
# NOTE(review): string columns are summarised too, so their mean/stddev are
# not meaningful (e.g. `name` reports a mean of Infinity) -- presumably these
# come from the few values that can be cast to numbers; confirm before use.
df_lighter_authors.describe().show()

+-------+--------------------+------------------+------------------+---------+------------------+--------------------+-------------------+-----------------+------------------+------------------+
|summary|               about|    average_rating|        fans_count|   gender|                id|           image_url|               name|    ratings_count|text_reviews_count|       works_count|
+-------+--------------------+------------------+------------------+---------+------------------+--------------------+-------------------+-----------------+------------------+------------------+
|  count|              351767|            351767|            351767|   351767|            351767|              351767|             351767|           351767|            351767|            351767|
|   mean|   1752.326923076923| 3.651194313281242|111.61573143586521|     NULL|7751861.1911975825|                NULL|           Infinity|4770.586308550831| 330.9932426862099|25.937137366495435|
| stddev|   437.460753249

In [14]:
# List every column name of the books DataFrame (top-level fields only),
# including the array columns that describe() leaves out.
df_lighter_books.columns

['asin',
 'author_id',
 'author_name',
 'authors',
 'average_rating',
 'description',
 'edition_information',
 'format',
 'id',
 'image_url',
 'isbn',
 'isbn13',
 'language',
 'num_pages',
 'original_publication_date',
 'publication_date',
 'publisher',
 'rating_dist',
 'ratings_count',
 'series_id',
 'series_name',
 'series_position',
 'shelves',
 'text_reviews_count',
 'title',
 'work_id']

In [15]:
# List every column name of the authors DataFrame (top-level fields only),
# including the array columns that describe() leaves out.
df_lighter_authors.columns

['about',
 'average_rating',
 'book_ids',
 'fans_count',
 'gender',
 'id',
 'image_url',
 'name',
 'ratings_count',
 'text_reviews_count',
 'work_ids',
 'works_count']

##### To run SQL queries programmatically, the DataFrames are registered as temporary views below. Alternatively, you can write the queries with the DataFrame API using `from pyspark.sql import functions as F`.

In [16]:
# Register the books DataFrame as the temporary view "lighter_books" so SQL
# queries can refer to it by that name. Re-running this cell is safe: an
# existing view with the same name is replaced.
df_lighter_books.createOrReplaceTempView("lighter_books")

In [17]:
# Register the authors DataFrame as the temporary view "lighter_authors" so
# SQL queries can refer to it by that name. Re-running this cell is safe: an
# existing view with the same name is replaced.
df_lighter_authors.createOrReplaceTempView("lighter_authors")

Now that we have loaded the data into PySpark DataFrames, we can start exploring it.
We explore the data using PySpark queries and then convert the resulting PySpark DataFrames into pandas DataFrames for data visualization.