### Import libraries

In [117]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.functions import *

### Create a Spark session

In [3]:
spark = SparkSession.builder.appName('Googel_apps').getOrCreate()

### Import a the googlestore.csv dataset

In [158]:
df = spark.read.csv('googlestore.csv', sep = ';', header = True, escape = '"', inferSchema = True)

# Show the first 5 rows
df.show(5)

+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|   Installs|Type|Price|Content Rating|              Genres|    Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|    10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|   500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|             2.0.0|4.0.3 and up|
|U Launcher Lite â...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5,000,000+|Free|    0|      Everyone|        Art & Design|  August 1, 2018|             1.2.4|4.0.3 and up|
|Ske

#### Number of records of the dataset

In [159]:
df.count()

10841

#### Check the data types of the columns

In [160]:
df.dtypes

[('App', 'string'),
 ('Category', 'string'),
 ('Rating', 'double'),
 ('Reviews', 'int'),
 ('Size', 'string'),
 ('Installs', 'string'),
 ('Type', 'string'),
 ('Price', 'string'),
 ('Content Rating', 'string'),
 ('Genres', 'string'),
 ('Last Updated', 'string'),
 ('Current Ver', 'string'),
 ('Android Ver', 'string')]

#### Check Schema

In [161]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



As we can see, some column types are incorrectly assigned even though InferSchema is set to True,so it is necessary to manually assign the correct types to the columns.

### Data cleaning

#### Drop non informational columns

In [162]:
# Drop column 'Size', 'Content Rating', 'Last Updated', 'Android Ver'
df = df.drop('Size', 'Content Rating', 'Last Updated', 'Android Ver', 'Current Ver')

In [163]:
df.show()

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite â...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    50,000+|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|    50,000+|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_

In [164]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



#### Set the correct types for each column

Transform the "Rating" column into double type and the "Reviews" column into integer type. For the "Installs" and "Price" columns, we need to remove the '+' (everything other than a digit) and '$' symbol before transforming them into integer type.

In [165]:
df = df.withColumn("Reviews", col("Reviews").cast(IntegerType()))\
.withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", ""))\
    .withColumn("Installs", col("Installs").cast(IntegerType()))\
        .withColumn("Price", regexp_replace(col("Price"), "[$]", ""))\
            .withColumn("Price", col("Price").cast(IntegerType()))

In [166]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Genres: string (nullable = true)



In [167]:
df.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite â...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

### Query the data

For performing SQL queries on the data we have to create a view.

In [168]:
df.createOrReplaceTempView('apps')

In [169]:
query = "select * from apps"
result = spark.sql(query)
result.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite â...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

Equivalent on PySpark queries

In [170]:
df.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite â...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

### Top 10 reviews given to the apps

We have to sum all the reviews for each app (grouping by each app) and order them from the largest to the smalest sum.

In [105]:
query = "SELECT App, sum(Reviews) AS Sum_Reviews FROM apps GROUP BY App ORDER BY Sum_Reviews DESC"
result = spark.sql(query)
result.show(10)

+--------------------+-----------+
|                 App|Sum_Reviews|
+--------------------+-----------+
|           Instagram|  266241989|
|  WhatsApp Messenger|  207348304|
|      Clash of Clans|  179558781|
|Messenger â€“ Tex...|  169932272|
|      Subway Surfers|  166331958|
|    Candy Crush Saga|  156993136|
|            Facebook|  156286514|
|         8 Ball Pool|   99386198|
|        Clash Royale|   92530298|
|            Snapchat|   68045010|
+--------------------+-----------+
only showing top 10 rows



Equivalent on PySpark queries

In [176]:
df.groupBy('App').sum('Reviews').orderBy(sum('Reviews'), ascending = False).show(10)

+--------------------+------------+
|                 App|sum(Reviews)|
+--------------------+------------+
|           Instagram|   266241989|
|  WhatsApp Messenger|   207348304|
|      Clash of Clans|   179558781|
|Messenger â€“ Tex...|   169932272|
|      Subway Surfers|   166331958|
|    Candy Crush Saga|   156993136|
|            Facebook|   156286514|
|         8 Ball Pool|    99386198|
|        Clash Royale|    92530298|
|            Snapchat|    68045010|
+--------------------+------------+
only showing top 10 rows



### Top 10 instaled apps

We have to sum all the instalations (Total installs) for each app (grouping by each app) and order them from the largest to the smalest number.

In [184]:
query = "SELECT App, Type, sum(Installs) AS Total_Installs FROM apps GROUP BY App, Type ORDER BY Total_Installs DESC, App ASC"
result = spark.sql(query)
result.show(10)

+--------------------+----+--------------+
|                 App|Type|Total_Installs|
+--------------------+----+--------------+
|      Subway Surfers|Free|    6000000000|
|        Google Drive|Free|    4000000000|
|         Google News|Free|    4000000000|
|       Google Photos|Free|    4000000000|
|            Hangouts|Free|    4000000000|
|           Instagram|Free|    4000000000|
|    Candy Crush Saga|Free|    3500000000|
|               Gmail|Free|    3000000000|
|Google Chrome: Fa...|Free|    3000000000|
|Maps - Navigate &...|Free|    3000000000|
+--------------------+----+--------------+
only showing top 10 rows



Equivalent on PySpark queries

In [191]:
df.groupBy('App').sum('Installs').orderBy([desc('sum(Installs)'), 'App']).show(10)

+--------------------+-------------+
|                 App|sum(Installs)|
+--------------------+-------------+
|      Subway Surfers|   6000000000|
|        Google Drive|   4000000000|
|         Google News|   4000000000|
|       Google Photos|   4000000000|
|            Hangouts|   4000000000|
|           Instagram|   4000000000|
|    Candy Crush Saga|   3500000000|
|               Gmail|   3000000000|
|Google Chrome: Fa...|   3000000000|
|Maps - Navigate &...|   3000000000|
+--------------------+-------------+
only showing top 10 rows



### Category distribution of the instaled apps

Determine the total number of installations for each category of application.

In [107]:
query = "SELECT Category, sum(Installs) AS Total_Installs FROM apps GROUP BY Category ORDER BY Total_Installs DESC"
result = spark.sql(query)
result.show(10)

+------------------+--------------+
|          Category|Total_Installs|
+------------------+--------------+
|              GAME|   35086024415|
|     COMMUNICATION|   32647276251|
|      PRODUCTIVITY|   14176091369|
|            SOCIAL|   14069867902|
|             TOOLS|   11452771915|
|            FAMILY|   10258263505|
|       PHOTOGRAPHY|   10088247655|
|NEWS_AND_MAGAZINES|    7496317760|
|  TRAVEL_AND_LOCAL|    6868887146|
|     VIDEO_PLAYERS|    6222002720|
+------------------+--------------+
only showing top 10 rows



Equivalent on PySpark queries

In [195]:
df.groupBy('Category').sum('Installs').orderBy(desc(sum('Installs'))).show(10)

+------------------+-------------+
|          Category|sum(Installs)|
+------------------+-------------+
|              GAME|  35086024415|
|     COMMUNICATION|  32647276251|
|      PRODUCTIVITY|  14176091369|
|            SOCIAL|  14069867902|
|             TOOLS|  11452771915|
|            FAMILY|  10258263505|
|       PHOTOGRAPHY|  10088247655|
|NEWS_AND_MAGAZINES|   7496317760|
|  TRAVEL_AND_LOCAL|   6868887146|
|     VIDEO_PLAYERS|   6222002720|
+------------------+-------------+
only showing top 10 rows



### Top paid apps

In [108]:
query = "SELECT App, sum(Price) AS Total_Price from apps WHERE Type == 'Paid' GROUP BY App ORDER BY Total_Price DESC"
result = spark.sql(query)
result.show(10)

+--------------------+-----------+
|                 App|Total_Price|
+--------------------+-----------+
|I'm Rich - Trump ...|        400|
|most expensive ap...|        399|
|           I am Rich|        399|
|  I AM RICH PRO PLUS|        399|
|   I Am Rich Premium|        399|
|  I am rich(premium)|        399|
|      I am Rich Plus|        399|
|       I Am Rich Pro|        399|
|          I am Rich!|        399|
|I'm Rich/Eu sou R...|        399|
+--------------------+-----------+
only showing top 10 rows



Equivalent on PySpark queries

In [196]:
# Filtering the dataset for the columns where Type is equal to Paid
df_filtered = df.where(col('Type') == 'Paid')

# Grouping by app and calculate the sum of Price
df_filtered.groupBy('App').sum('Price').orderBy(sum('Price'), ascending = False).show(10)

+--------------------+----------+
|                 App|sum(Price)|
+--------------------+----------+
|I'm Rich - Trump ...|       400|
|most expensive ap...|       399|
|           I am Rich|       399|
|  I AM RICH PRO PLUS|       399|
|   I Am Rich Premium|       399|
|  I am rich(premium)|       399|
|      I am Rich Plus|       399|
|       I Am Rich Pro|       399|
|          I am Rich!|       399|
|I'm Rich/Eu sou R...|       399|
+--------------------+----------+
only showing top 10 rows



### Top paid rating apps

In [155]:
# Has we have NaN values in the column Rating, the RLIKE function was applied in order to filter the values througth the regular expression '^[0-9]+\\.?[0-9]*$', to check that the Rating value contains only digits and optionally a decimal point followed by more digits. 
query = "SELECT App, AVG(Rating) AS Ratings FROM apps WHERE Type = 'Paid' AND Rating RLIKE '^[0-9]+\\.?[0-9]*$' GROUP BY App ORDER BY Ratings DESC"

result = spark.sql(query)
result.show(10)

+--------------------+-------+
|                 App|Ratings|
+--------------------+-------+
|   AJ Blue Icon Pack|    5.0|
|        ADS-B Driver|    5.0|
|             Mu.F.O.|    5.0|
|AJ Gray Dark Icon...|    5.0|
|30WPM Amateur ham...|    5.0|
|AP Art History Fl...|    5.0|
|     P-Home for KLWP|    5.0|
|Hey AJ! It's Bedt...|    5.0|
|        Morse Player|    5.0|
|            Ra Ga Ba|    5.0|
+--------------------+-------+
only showing top 10 rows



In [136]:
# Filtering the columns where Rating is not NaN and Type is equal to Paid
df_filtered = df.where(~isnan(col('Rating')) & (col('Type') == 'Paid'))

# Group by app and calculate the mean of Rating
df_filtered.groupBy('App').mean('Rating').orderBy('avg(Rating)', ascending=False).show(10)

+--------------------+-----------+
|                 App|avg(Rating)|
+--------------------+-----------+
|   AJ Blue Icon Pack|        5.0|
|        ADS-B Driver|        5.0|
|             Mu.F.O.|        5.0|
|AJ Gray Dark Icon...|        5.0|
|30WPM Amateur ham...|        5.0|
|AP Art History Fl...|        5.0|
|     P-Home for KLWP|        5.0|
|Hey AJ! It's Bedt...|        5.0|
|        Morse Player|        5.0|
|            Ra Ga Ba|        5.0|
+--------------------+-----------+
only showing top 10 rows

