<h1><center>Analysing Books Data</center></h1>
<hr><hr>

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("002").getOrCreate()
spark

In [3]:
sc = spark.sparkContext
sc

In [4]:
sc.defaultParallelism

8

In [17]:
from pyspark.sql import functions as F

In [14]:
books_df = spark.read.csv("./data/books_data.csv", header=True, inferSchema=True)

## Getting a single value out of a Dataframe:
----------------------------------------------

In [30]:
books_df.rdd.collect()[0].books_count

272

### 1. Total numbers of records:
----------------------------------

In [19]:
books_df.count()

10000

### 2. Schema of dataframe:
-------------------------------

In [9]:
books_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- goodreads_book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: integer (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (n

In [11]:
books_df.dtypes

[('book_id', 'int'),
 ('goodreads_book_id', 'int'),
 ('best_book_id', 'int'),
 ('work_id', 'int'),
 ('books_count', 'int'),
 ('isbn', 'string'),
 ('isbn13', 'double'),
 ('authors', 'string'),
 ('original_publication_year', 'int'),
 ('original_title', 'string'),
 ('title', 'string'),
 ('language_code', 'string'),
 ('average_rating', 'string'),
 ('ratings_count', 'string'),
 ('work_ratings_count', 'string'),
 ('work_text_reviews_count', 'string'),
 ('ratings_1', 'double'),
 ('ratings_2', 'int'),
 ('ratings_3', 'int'),
 ('ratings_4', 'int'),
 ('ratings_5', 'int'),
 ('image_url', 'string'),
 ('small_image_url', 'string')]

In [10]:
books_df.show()

+-------+-----------------+------------+--------+-----------+----------+-------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
|book_id|goodreads_book_id|best_book_id| work_id|books_count|      isbn| isbn13|             authors|original_publication_year|      original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+-------+-----------------+------------+--------+-----------+----------+-------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------

### 3. Selecting sub-dataframe of some columns:

In [15]:
books_df2 = books_df.select( ["book_id", "title", "books_count", "language_code", "average_rating", "ratings_count"] )
books_df2.show( truncate=False )

+-------+-----------------------------------------------------------+-----------+-------------+--------------+-------------+
|book_id|title                                                      |books_count|language_code|average_rating|ratings_count|
+-------+-----------------------------------------------------------+-----------+-------------+--------------+-------------+
|1      |The Hunger Games (The Hunger Games, #1)                    |272        |eng          |4.34          |4780653      |
|2      |Harry Potter and the Sorcerer's Stone (Harry Potter, #1)   |491        |eng          |4.44          |4602479      |
|3      |Twilight (Twilight, #1)                                    |226        |en-US        |3.57          |3866839      |
|4      |To Kill a Mockingbird                                      |487        |eng          |4.25          |3198671      |
|5      |The Great Gatsby                                           |1356       |eng          |3.89          |2683664      |


### 4. Dictinct language_code:
--------

In [16]:
books_df2.select("language_code").distinct().show()

+-------------+
|language_code|
+-------------+
|          fre|
|           en|
|        en-CA|
|          rus|
|          ind|
|          per|
|         null|
|          nor|
|          pol|
|          vie|
|          ara|
|          por|
|          swe|
|          mul|
|          eng|
|          jpn|
|           nl|
|          dan|
|        en-GB|
|          fil|
+-------------+
only showing top 20 rows



### 5. Number of books per language_code, and siplaying in descending order of count:
---------------------------------------------------------------------------------------

In [31]:
books_per_language_df = books_df2.groupBy("language_code").agg( F.count("title").alias("num_of_books") ).orderBy( F.col("num_of_books").desc() )
books_per_language_df.show(50)

+--------------------+------------+
|       language_code|num_of_books|
+--------------------+------------+
|                 eng|        6340|
|               en-US|        2069|
|                null|        1084|
|               en-GB|         257|
|                 ara|          64|
|               en-CA|          58|
|                 fre|          25|
|                 ind|          21|
|                 spa|          20|
|                 ger|          13|
|                 per|           7|
|                 jpn|           7|
|                 pol|           6|
|                 por|           6|
|                  en|           4|
|                 nor|           3|
|                 dan|           3|
|                 ita|           2|
|                 fil|           2|
|                 rus|           1|
|                 vie|           1|
|                  nl|           1|
|                 swe|           1|
|                 tur|           1|
|Bloody Jack (Bloo...|      

In [33]:
books_per_language_df.filter( F.length(F.col("language_code")) <= 5 ).show(50)

+-------------+------------+
|language_code|num_of_books|
+-------------+------------+
|          eng|        6340|
|        en-US|        2069|
|        en-GB|         257|
|          ara|          64|
|        en-CA|          58|
|          fre|          25|
|          ind|          21|
|          spa|          20|
|          ger|          13|
|          jpn|           7|
|          per|           7|
|          pol|           6|
|          por|           6|
|           en|           4|
|          nor|           3|
|          dan|           3|
|          ita|           2|
|          fil|           2|
|          rus|           1|
|          vie|           1|
|          tur|           1|
|          swe|           1|
|          rum|           1|
|          mul|           1|
|           nl|           1|
+-------------+------------+



### 6. Filtering records that have valid float values in "average_rating" column:
-------------------------------------------------------------------------------------

In [37]:
books_df2.orderBy( "average_rating" ).show()
# books_df2.orderBy( F.col("average_rating").desc() ).show()

+-------+--------------------+-----------+--------------------+--------------------+-----------------+
|book_id|               title|books_count|       language_code|      average_rating|    ratings_count|
+-------+--------------------+-----------+--------------------+--------------------+-----------------+
|   9265|    ""The Lost Boy""|          7| ""A Man Named Da...|"My Story: ""A Ch...| ""The Lost Boy""|
|   1793|One Night at the ...|         25|                 eng|                2.47|            40718|
|   3550|     The Almost Moon|         63|                 eng|                2.67|            28299|
|   8007|The Finkler Question|         45|                 eng|                2.76|             9627|
|   4009|        Four Blondes|         57|                null|                 2.8|            22278|
|   9021|                Lost|         18|                 eng|                 2.8|            12534|
|   8166|Among the Ten Tho...|         19|                 eng|          

In [38]:
from pyspark.sql.types import BooleanType
# To the col parameter of this UDF, values of each column will be passed one-by-one of all the records

@F.udf( returnType=BooleanType() )
def whether_valid_float( col ):
    col_value = str(col)
    digits = "1234567890"
    if col_value[0] in digits and col_value[-1] in digits:
        return True
    else:
        return False

In [41]:
# books_df2.filter( whether_valid_float( F.col("average_rating") ) ).orderBy( "average_rating" ).show()
books_df2.filter( whether_valid_float( "average_rating" ) ).orderBy( F.col("average_rating").desc() ).show()

+-------+--------------------+-----------+-------------+--------------+-------------+
|book_id|               title|books_count|language_code|average_rating|ratings_count|
+-------+--------------------+-----------+-------------+--------------+-------------+
|   3628|The Complete Calv...|         14|          eng|          4.82|        28900|
|   3275|Harry Potter Boxe...|         11|          eng|          4.77|        33220|
|    862|Words of Radiance...|         34|          eng|          4.77|        73572|
|   7947|     ESV Study Bible|         96|          eng|          4.76|         8953|
|   8854|Mark of the Lion ...|          6|        en-US|          4.76|         9081|
|   4483|It's a Magical Wo...|         21|          eng|          4.75|        22351|
|    422|Harry Potter Boxs...|         76|          eng|          4.74|       190050|
|   6361|There's Treasure ...|         22|          eng|          4.74|        16766|
|   3753|Harry Potter Coll...|          6|          en

### 7. Adding "movie_value" tag to be "good", "average", "bad" based on average rating:
-------------------------------------------------------------------------------------------
- average_rating >= 4 = "good"
- average_rating >= 3 and average_rating <4 = "average"
- average_rating < 3 = "bad"

In [46]:
from pyspark.sql.types import DoubleType

In [47]:
ratings_df = books_df2.select( "average_rating" ).filter( whether_valid_float("average_rating") ).withColumn( "rating", F.col("average_rating").cast(DoubleType()) )
ratings_df.orderBy("average_rating").show()

+--------------+------+
|average_rating|rating|
+--------------+------+
|          2.47|  2.47|
|          2.67|  2.67|
|          2.76|  2.76|
|           2.8|   2.8|
|           2.8|   2.8|
|          2.84|  2.84|
|          2.84|  2.84|
|          2.84|  2.84|
|          2.93|  2.93|
|          2.93|  2.93|
|          2.96|  2.96|
|          2.97|  2.97|
|          2.98|  2.98|
|             3|   3.0|
|          3.01|  3.01|
|          3.02|  3.02|
|          3.04|  3.04|
|          3.04|  3.04|
|          3.05|  3.05|
|          3.07|  3.07|
+--------------+------+
only showing top 20 rows



In [48]:
ratings_df.printSchema()

root
 |-- average_rating: string (nullable = true)
 |-- rating: double (nullable = true)



In [54]:
ratings_df.withColumn( 
    "movie_value",
    F.when( F.col("rating") >= 4, "good" ).when( (F.col("rating") < 4) & (F.col("rating") >= 3), "average" ).otherwise( "bad" )
).orderBy( F.col("rating").asc() ).show()

+--------------+------+-----------+
|average_rating|rating|movie_value|
+--------------+------+-----------+
|          2.47|  2.47|        bad|
|          2.67|  2.67|        bad|
|          2.76|  2.76|        bad|
|           2.8|   2.8|        bad|
|           2.8|   2.8|        bad|
|          2.84|  2.84|        bad|
|          2.84|  2.84|        bad|
|          2.84|  2.84|        bad|
|          2.93|  2.93|        bad|
|          2.93|  2.93|        bad|
|          2.96|  2.96|        bad|
|          2.97|  2.97|        bad|
|          2.98|  2.98|        bad|
|             3|   3.0|    average|
|          3.01|  3.01|    average|
|          3.02|  3.02|    average|
|          3.04|  3.04|    average|
|          3.04|  3.04|    average|
|          3.05|  3.05|    average|
|          3.07|  3.07|    average|
+--------------+------+-----------+
only showing top 20 rows

