In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [0]:
df = spark.read.load("/FileStore/tables/googleplaystore.csv", format="csv", sep=",", escape='"', inferSchema="true", header="true")

In [0]:
df.count()

Out[20]: 10841

In [0]:
df.show(3)

+--------------------+--------------+------+-------+----+----------+----+-----+--------------+--------------------+----------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|  Installs|Type|Price|Content Rating|              Genres|    Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+----------+----+-----+--------------+--------------------+----------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|   10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|      1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|  500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|      2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M|5,000,000+|Free|    0|      Everyone|        Art & Design|  August 1, 2018|      1.2.4|4.0.3 and up|
+--------------------+--------------+------+-------+

In [0]:
# Droping columns that are not in use
df = df.drop('Size','Content Rating','Last Updated','Android Ver','Current Ver')
df.show(3)

+--------------------+--------------+------+-------+----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|  Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|5,000,000+|Free|    0|        Art & Design|
+--------------------+--------------+------+-------+----------+----+-----+--------------------+
only showing top 3 rows



In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



In [0]:
df = df.withColumn('Reviews',col('Reviews').cast(IntegerType()))\
.withColumn('Installs',regexp_replace(col('Installs'),"[^0-9]",""))\
.withColumn('Installs',col('Installs').cast(IntegerType()))\
.withColumn('Price',regexp_replace(col('Price'),"[$]",""))\
.withColumn('Price',col('Price').cast(IntegerType()))
    

In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Genres: string (nullable = true)



In [0]:
df.show(5)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
only showing top 5 rows



In [0]:
df.createOrReplaceTempView("apps")

In [0]:
spark.sql("select * from apps").show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

In [0]:
spark.sql("select App, sum(Reviews) from apps group by 1 order by 2 desc").show(5)

+--------------------+------------+
|                 App|sum(Reviews)|
+--------------------+------------+
|           Instagram|   266241989|
|  WhatsApp Messenger|   207348304|
|      Clash of Clans|   179558781|
|Messenger – Text ...|   169932272|
|      Subway Surfers|   166331958|
+--------------------+------------+
only showing top 5 rows



In [0]:
spark.sql("select App, sum(Installs) from apps group by App order by sum(Installs) desc").show(5)

+--------------+-------------+
|           App|sum(Installs)|
+--------------+-------------+
|Subway Surfers|   6000000000|
|      Hangouts|   4000000000|
|     Instagram|   4000000000|
|  Google Drive|   4000000000|
|   Google News|   4000000000|
+--------------+-------------+
only showing top 5 rows



In [0]:
spark.sql("select Category, sum(Installs) from apps group by Category order by sum(Installs) desc").show(5)

+-------------+-------------+
|     Category|sum(Installs)|
+-------------+-------------+
|         GAME|  35086024415|
|COMMUNICATION|  32647276251|
| PRODUCTIVITY|  14176091369|
|       SOCIAL|  14069867902|
|        TOOLS|  11452771915|
+-------------+-------------+
only showing top 5 rows



In [0]:
spark.sql("select App, sum(Price) from apps where Type='Paid' group by App order by 2 desc").show(5)

+--------------------+----------+
|                 App|sum(Price)|
+--------------------+----------+
|I'm Rich - Trump ...|       400|
|most expensive ap...|       399|
|         💎 I'm rich|       399|
|       I Am Rich Pro|       399|
|I'm Rich/Eu sou R...|       399|
+--------------------+----------+
only showing top 5 rows



In [0]:
%sql select App, sum(Price) from apps where Type='Paid'
group by 1 order by 2 desc

App,sum(Price)
I'm Rich - Trump Edition,400
I am Rich Plus,399
I AM RICH PRO PLUS,399
I'm Rich/Eu sou Rico/أنا غني/我很有錢,399
I Am Rich Premium,399
most expensive app (H),399
I Am Rich Pro,399
I am rich(premium),399
I am Rich,399
I am Rich!,399
