In [None]:
from pyspark import *;
from pyspark.sql import *;
from pyspark.sql.functions import *;
import pandas as pd;

In [None]:
# Start (or reuse) the Spark session for this notebook.
session_builder = SparkSession.builder.appName("Q7")
spark = session_builder.getOrCreate()

# Read the raw CSV: first row is the header, column types are inferred.
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv("phone_usage_india.csv", **csv_options)

# Inspect the inferred schema and a sample of rows.
df.printSchema()
df.show()

root
 |-- User ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Phone Brand: string (nullable = true)
 |-- OS: string (nullable = true)
 |-- Screen Time (hrs/day): double (nullable = true)
 |-- Data Usage (GB/month): double (nullable = true)
 |-- Calls Duration (mins/day): double (nullable = true)
 |-- Number of Apps Installed: integer (nullable = true)
 |-- Social Media Time (hrs/day): double (nullable = true)
 |-- E-commerce Spend (INR/month): integer (nullable = true)
 |-- Streaming Time (hrs/day): double (nullable = true)
 |-- Gaming Time (hrs/day): double (nullable = true)
 |-- Monthly Recharge Cost (INR): integer (nullable = true)
 |-- Primary Use: string (nullable = true)

+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+---------

In [None]:
# Count missing values per column.
# NaN only exists for floating-point data, so `isnan` is applied to float/double
# columns only. Calling it on string columns forces an implicit cast to double,
# which can fail under ANSI SQL mode (e.g. on values such as "U00001").
float_columns = {name for name, dtype in df.dtypes if dtype in ("double", "float")}


def _is_missing(column_name):
    """Return a boolean Column that is true where `column_name` is null (or NaN for float columns)."""
    missing = isnull(column_name)
    if column_name in float_columns:
        missing = missing | isnan(column_name)
    return missing


df.select([count(when(_is_missing(c), c)).alias(c) for c in df.columns]).show()

+-------+---+------+--------+-----------+---+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-----------+
|User ID|Age|Gender|Location|Phone Brand| OS|Screen Time (hrs/day)|Data Usage (GB/month)|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|Primary Use|
+-------+---+------+--------+-----------+---+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-----------+
|      0|  0|     0|      18|          0|  0|                    0|                    0|                        0|                       0|     

In [None]:
# Keep only rows whose Age lies in the inclusive range 18..90.
age = col("Age")
age_in_range = (age >= 18) & (age <= 90)
df = df.filter(age_in_range)
df.show()

+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+
|User ID|Age|Gender| Location| Phone Brand|     OS|Screen Time (hrs/day)|Data Usage (GB/month)|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|  Primary Use|
+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+
| U00001| 53|  Male|   Mumbai|        Vivo|Android|                  3.7|                 23.9|                     37.9|

In [None]:
# Bucket Age into three labels: <=35 "Young", <=60 "Middle-aged", else "Senior".
age = col("Age")
age_category = (
    when(age <= 35, "Young")
    .when(age <= 60, "Middle-aged")
    .otherwise("Senior")
)
df = df.withColumn("Age_Category", age_category)
df.show()

+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+------------+
|User ID|Age|Gender| Location| Phone Brand|     OS|Screen Time (hrs/day)|Data Usage (GB/month)|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|  Primary Use|Age_Category|
+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+------------+
| U00001| 53|  Male|   Mumbai|        Vivo|Android|                  3.7|         

In [None]:
# Replace null Location values with the placeholder "Unknown".
df = df.fillna("Unknown", subset=["Location"])
df.show()

+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+------------+
|User ID|Age|Gender| Location| Phone Brand|     OS|Screen Time (hrs/day)|Data Usage (GB/month)|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|  Primary Use|Age_Category|
+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+------------+
| U00001| 53|  Male|   Mumbai|        Vivo|Android|                  3.7|         

In [None]:
# Derive per-day minute columns from the hour-based usage columns.
# Every derived column is rounded to 2 decimals (previously only the gaming
# column was), so float noise from the multiplication such as
# 222.00000000000003 does not leak into the output.
# NOTE: `round` here is pyspark.sql.functions.round (star-imported), not the builtin.
hours_to_minutes_columns = [
    ("ScreenTime_Minutes", "Screen Time (hrs/day)"),
    ("SocialMediaTime_Minutes", "Social Media Time (hrs/day)"),
    ("StreamingTime_Minutes", "Streaming Time (hrs/day)"),
    ("GamingTime_Minutes", "Gaming Time (hrs/day)"),
]
for minutes_column, hours_column in hours_to_minutes_columns:
    df = df.withColumn(minutes_column, round(col(hours_column) * 60, 2))
df.show()

+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+------------+------------------+-----------------------+---------------------+------------------+
|User ID|Age|Gender| Location| Phone Brand|     OS|Screen Time (hrs/day)|Data Usage (GB/month)|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|  Primary Use|Age_Category|ScreenTime_Minutes|SocialMediaTime_Minutes|StreamingTime_Minutes|GamingTime_Minutes|
+-------+---+------+---------+------------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+-----------------

In [None]:
# Label daily screen time: <=180 min "Low-Usage", <=360 min "Moderate-Usage",
# anything else "High-Usage".
screen_minutes = col("ScreenTime_Minutes")
screen_time_category = (
    when(screen_minutes <= 180, "Low-Usage")
    .when(screen_minutes <= 360, "Moderate-Usage")
    .otherwise("High-Usage")
)
df = df.withColumn("Screen_Time_Category", screen_time_category)

# Show which categories actually occur in the data.
df.select("Screen_Time_Category").distinct().show()

+--------------------+
|Screen_Time_Category|
+--------------------+
|          High-Usage|
|           Low-Usage|
|      Moderate-Usage|
+--------------------+



In [None]:
# Total monthly recharge cost per primary use, sorted ascending by that total.
# NOTE: `sum` here is pyspark.sql.functions.sum (star-imported), not the builtin.
total_recharge_cost = sum(col("Monthly Recharge Cost (INR)")).alias("Total Cost")
primary_use_case = (
    df.groupBy("Primary Use")
    .agg(total_recharge_cost)
    .orderBy("Total Cost")
)
primary_use_case.show()

+-------------+----------+
|  Primary Use|Total Cost|
+-------------+----------+
|Entertainment|   3302462|
| Social Media|   3433779|
|       Gaming|   3464171|
|    Education|   3478342|
|         Work|   3499693|
+-------------+----------+



In [None]:
# Collect the cleaned Spark DataFrame to the driver as a pandas DataFrame.
# The isinstance guard makes the cell safe to re-run: once `df` has been rebound
# to a pandas DataFrame, calling `.toPandas()` on it again would raise
# AttributeError.
if not isinstance(df, pd.DataFrame):
    df = df.toPandas()

# index=False keeps the pandas row index out of the file; otherwise the CSV gets
# an extra unnamed first column that is not part of the cleaned data.
df.to_csv("cleaned_7.csv", index=False)

In [None]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()