# **Import Libraries**

In [1]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *
import os

# **Create Dataframe**

In [2]:
# You need to have Java installed to run Spark.
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-22"


spark = SparkSession.builder.appName("Google Play Store Data").getOrCreate()

In [9]:
df = spark.read.csv("googleplaystore.csv", header=True, inferSchema=True)

In [10]:
# coiunt number of rows
df.count()

10841

In [11]:
# show first 3 rows
df.show(3)

+--------------------+--------------+------+-------+----+----------+----+-----+--------------+--------------------+----------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|  Installs|Type|Price|Content Rating|              Genres|    Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+----------+----+-----+--------------+--------------------+----------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|   10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|      1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|  500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|      2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M|5,000,000+|Free|    0|      Everyone|        Art & Design|  August 1, 2018|      1.2.4|4.0.3 and up|
+--------------------+--------------+------+-------+

In [12]:
# Check schema
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



# **Data Cleaning step**

In [13]:
#dropping not important columns 

df = df.drop("size","Content Rating","Last Updated","Android Ver","Current Ver")
df.show(2)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|500,000+|Free|    0|Art & Design;Pret...|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
only showing top 2 rows



In [23]:
unique_values_type = df.select('type').distinct()
unique_values_type.show()

+------+
|  type|
+------+
|     0|
|102248|
|   NaN|
|  Free|
|  Paid|
|  2509|
+------+



In [15]:
df=df.withColumn("Rating",col("Rating").cast("float"))\
    .withColumn("Reviews",col("Reviews").cast("int"))\
    .withColumn()