## Data Exploration with Spark

---

### Import Libraries

In [None]:
# Import findspark, the helper used below to locate the Spark installation
import findspark

# Locate the spark installation.
# NOTE(review): this cell runs before the cell that imports pyspark — presumably
# so that `import pyspark` can be resolved; keep this order when rearranging cells.
findspark.init()

In [None]:
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession


### Initialize Spark

In [None]:
# Create the SparkSession used by every later cell as `spark`
# (getOrCreate() reuses a session that is already running instead of starting a second one)
spark = (
    SparkSession.builder
    .appName("prior_analysis")
    .getOrCreate()
)

### Connect and import data from HDFS directly into a Spark DataFrame

In [None]:
# Explicit schema for books_data.csv, so Spark does not have to infer column types.
# Every column is a nullable string except ratingsCount, which is a nullable float.
data_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "Title",
            "description",
            "authors",
            "image",
            "previewLink",
            "publisher",
            "publishedDate",
            "infoLink",
            "categories",
        )
    ]
    + [StructField("ratingsCount", FloatType(), True)]
)

# Explicit schema for books_rating.csv (all columns nullable).
#
# Id and User_id are identifiers, not quantities, so they are read as strings.
# With IntegerType, any value that is not a plain 32-bit integer (an alphanumeric
# ID, or a number above 2**31 - 1) cannot be parsed and is not loaded as written
# (in the CSV reader's default PERMISSIVE mode it becomes null), and leading zeros
# are dropped. StringType keeps every identifier exactly as it appears in the file.
ratings_schema = StructType([
    StructField("Id", StringType(), True),
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", StringType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    # NOTE(review): presumably a Unix timestamp in seconds — confirm against the data
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
])


# Read both CSV files from HDFS into Spark DataFrames, applying the explicit
# schemas defined earlier in this cell. The first line of each file is a header.
# escape='"' tells the CSV parser that a quote inside a quoted field is escaped
# by another quote character rather than by the default backslash.

df_data = (
    spark.read
    .schema(data_schema)
    .option('escape', '"')
    .option('header', True)
    .csv('hdfs://localhost:9900/user/book_reviews/original_data/books_data.csv')
)
df_ratings = (
    spark.read
    .schema(ratings_schema)
    .option('escape', '"')
    .option('header', True)
    .csv('hdfs://localhost:9900/user/book_reviews/original_data/books_rating.csv')
)

### Data Exploration

- Show the first 5 rows of the data
- Inspect the schema applied to the data
- Discover data dimensionality
- Show some statistics
- Discover the percentage of null values in each column
- Discover the percentage of distinct values in each column

In [None]:
# Show the data: take only the first 5 rows of the data table and convert them
# to a pandas DataFrame, which the notebook renders as the cell output
print('Data Table: \n')
df_data.limit(5).toPandas()

In [None]:
# Take only the first 5 rows of the ratings table and convert them to a
# pandas DataFrame, which the notebook renders as the cell output
print('Ratings Table: \n')
df_ratings.limit(5).toPandas()

In [None]:
# Print the schema of each DataFrame under its own heading, data table first
for schema_heading, table_df in (
    ('Data Table Schema: \n', df_data),
    ('Ratings Table Schema: \n', df_ratings),
):
    print(schema_heading)
    table_df.printSchema()

In [None]:
# Dimensionality: (number of rows, number of columns) for each table.
# count() runs a Spark job over the table; the column count comes from the schema.
for dimensionality_label, table_df in (
    ('Data Table Dimensionality: ', df_data),
    ('Ratings Table Dimensionality: ', df_ratings),
):
    table_shape = (table_df.count(), len(table_df.columns))
    print(f'{dimensionality_label}{table_shape}')

# Statistical summary of each table, printed in Spark's tabular text format
for summary_heading, table_df in (
    ('Data Table Summary: \n', df_data),
    ('Ratings Table Summary: \n', df_ratings),
):
    print(summary_heading)
    table_df.describe().show()

In [None]:
import numpy as np


def distinct_value_percentages(df):
    """Summarise how many distinct values each column of a Spark DataFrame holds.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Table to analyse.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df``, with a single column
        'N. Distinct Values'. Despite the label, the values are percentages:
        (distinct values in the column / total rows) * 100. A null counts as
        one distinct value because it survives ``distinct()``. Every entry is
        NaN when ``df`` has no rows.
    """
    # One Spark job per column: select the column, drop duplicates, count what is left
    distinct_counts = [df.select(c).distinct().count() for c in df.columns]
    total_rows = df.count()

    if total_rows == 0:
        # Avoid a 0/0 division (and numpy's RuntimeWarning) on an empty table
        percentages = np.full(len(df.columns), np.nan)
    else:
        percentages = (np.array(distinct_counts) / total_rows) * 100

    return pd.DataFrame(index=df.columns, columns=['N. Distinct Values'], data=percentages)


# Check for number of distinct values for each column in %
df_data_pandas = distinct_value_percentages(df_data)
df_ratings_pandas = distinct_value_percentages(df_ratings)

In [None]:
from pyspark.sql.functions import col, count, when
from pyspark.sql import functions as F


def null_counts_per_column(df):
    """Count the null entries in every column of a Spark DataFrame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Table to scan.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per column of ``df``; each cell
        holds the number of nulls found in that column.
    """
    # The aggregate is referenced as F.sum so this cell does not depend on the
    # builtin `sum` having been replaced by pyspark's `sum` in an earlier import cell
    # (the builtin would fail on a Column).
    null_flags = [F.sum(F.col(c).isNull().cast('int')).alias(c) for c in df.columns]
    # All columns are aggregated in a single select, i.e. one pass over the table
    return df.select(null_flags).toPandas()


def percent_of_rows(counts, total_rows):
    """Express per-column counts as percentages of ``total_rows``.

    Parameters
    ----------
    counts : pandas.Series
        One count per column.
    total_rows : int
        Number of rows in the table the counts were taken from.

    Returns
    -------
    list of float
        ``counts / total_rows * 100`` in the order of ``counts``; NaN for every
        column when ``total_rows`` is 0.
    """
    if total_rows == 0:
        # An empty table yields null sums; dividing them would raise instead of giving NaN
        return [float('nan')] * len(counts)
    return (counts / total_rows * 100).tolist()


# Check for missing values in the data table
df_data_pandas_tmp = null_counts_per_column(df_data)
# Add to the pandas summary dataframe (assigned by position, in df_data.columns order)
df_data_pandas['N. Missing Values'] = percent_of_rows(df_data_pandas_tmp.loc[0, :], df_data.count())

# Check for missing values in the ratings table
df_ratings_pandas_tmp = null_counts_per_column(df_ratings)
# Add to the pandas summary dataframe (assigned by position, in df_ratings.columns order)
df_ratings_pandas['N. Missing Values'] = percent_of_rows(df_ratings_pandas_tmp.loc[0, :], df_ratings.count())

In [None]:
# Display the data-table summary: one row per column, values are percentages of rows
df_data_pandas

In [None]:
# Display the ratings-table summary: one row per column, values are percentages of rows
df_ratings_pandas

In [None]:
# Bar chart per table: one group of bars per column (the frame index),
# one bar per summary metric, with the y axis in percent
df_data_pandas.plot.bar(title='Data Table Summary', ylabel='Percentage')

df_ratings_pandas.plot.bar(title='Ratings Table Summary', ylabel='Percentage')