In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import *

In [0]:
spark = SparkSession.builder.appName('Project').getOrCreate()

In [0]:
df = spark.read.csv("/FileStore/tables/googleplaystore.csv", inferSchema=True, header=True)

In [0]:
df.show(2)

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+----------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|              Genres|    Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+----------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M| 10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|      1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|      2.0.0|4.0.3 and up|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+----------------+-----------+------------+
only showing top 2 rows



In [0]:
df.count()

Out[5]: 10841

In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [0]:
df = df.drop('Size','Content Rating', 'Last Updated', 'Android Ver', 'Current Ver')

In [0]:
df.show(30)

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    50,000+|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|    50,000+|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_

In [0]:
df = df.dropna(how='any')

In [0]:
df = df.withColumn("Review",col('Reviews').cast(IntegerType()))\
.withColumn("Installs",regexp_replace(col("Installs"),"[^0-9]",""))\
    .withColumn("Installs",col("Installs").cast(IntegerType()))\
        .withColumn("Price",regexp_replace(col("Price"),"[$]",""))\
            .withColumn("Price",col("Price").cast(IntegerType()))

In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Review: integer (nullable = true)



In [0]:
df.show(20)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|Review|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|   159|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|   967|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design| 87510|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|215644|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|   967|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|   167|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|    0|        Art & Desig

In [0]:
df.createTempView('gapps')

In [0]:
%sql select *from gapps limit 10

App,Category,Rating,Reviews,Installs,Type,Price,Genres,Review
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Art & Design,159
Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Art & Design;Pretend Play,967
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Art & Design,87510
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Art & Design,215644
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Art & Design;Creativity,967
Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Art & Design,167
Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Art & Design,178
Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Art & Design,36815
Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Art & Design,13791
Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Art & Design;Creativity,121


In [0]:
%sql SELECT App,sum(Reviews) From gapps
GROUP BY App
ORDER BY sum(Reviews) Desc LIMIT 10 

App,sum(Reviews)
Instagram,266241989.0
WhatsApp Messenger,207348304.0
Clash of Clans,179558781.0
Messenger – Text and Video Chat for Free,169932272.0
Subway Surfers,166331958.0
Candy Crush Saga,156993136.0
Facebook,156286514.0
8 Ball Pool,99386198.0
Clash Royale,92530298.0
Snapchat,68045010.0


In [0]:
%sql SELECT App,sum(Installs) FROM gapps
GROUP BY App 
ORDER BY sum(Installs) DESC
LIMIT 10

App,sum(Installs)
Subway Surfers,6000000000
Instagram,4000000000
Hangouts,4000000000
Google Drive,4000000000
Google News,4000000000
Google Photos,4000000000
Candy Crush Saga,3500000000
WhatsApp Messenger,3000000000
Messenger – Text and Video Chat for Free,3000000000
Google Chrome: Fast & Secure,3000000000


In [0]:
%sql select Category,sum(Installs) FROM gapps
group by Category
order by sum(Installs) desc 
limit 10

Category,sum(Installs)
GAME,35086024415
COMMUNICATION,32647276251
PRODUCTIVITY,14176091369
SOCIAL,14069867902
TOOLS,11452771915
FAMILY,10258263505
PHOTOGRAPHY,10088247655
NEWS_AND_MAGAZINES,7496317760
TRAVEL_AND_LOCAL,6868887146
VIDEO_PLAYERS,6222002720


In [0]:
%sql SELECT App,sum(installs) From gapps
Where Type = "Paid"
GROUP BY App
ORDER BY sum(installs) DESC
LIMIT 10

App,sum(installs)
Minecraft,20000000
Hitman Sniper,10000000
Facetune - For Free,3000000
Beautiful Widgets Pro,2000000
HD Widgets,2000000
Tasker,1000000
Fruit Ninja Classic,1000000
True Skate,1000000
Where's My Water?,1000000
DraStic DS Emulator,1000000


In [0]:
%sql SELECT App,sum(Reviews) From gapps
Where Type = "Paid"
GROUP BY App
ORDER BY sum(Reviews) DESC
LIMIT 10

App,sum(Reviews)
Minecraft,4751900.0
Hitman Sniper,408292.0
Grand Theft Auto: San Andreas,348962.0
Beautiful Widgets Pro,195780.0
Bloons TD 5,190086.0
Where's My Water?,188740.0
Facetune - For Free,148659.0
Card Wars - Adventure Time,129603.0
True Skate,129409.0
HD Widgets,117231.0
