In [0]:
from pyspark.sql import SparkSession
## Get the Spark session explicitly (getOrCreate reuses an already-running
## session) instead of relying on an implicit `spark` global from the runtime.
spark = SparkSession.builder.getOrCreate()

## Load the phone-usage table and print its column names and types
data = spark.read.table('phone_usage_india_csv_i_4_x_sl')
data.printSchema()

root
 |-- User ID: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Phone Brand: string (nullable = true)
 |-- OS: string (nullable = true)
 |-- Screen Time (hrs/day): double (nullable = true)
 |-- Data Usage (GB/month): double (nullable = true)
 |-- Calls Duration (mins/day): double (nullable = true)
 |-- Number of Apps Installed: long (nullable = true)
 |-- Social Media Time (hrs/day): double (nullable = true)
 |-- E-commerce Spend (INR/month): long (nullable = true)
 |-- Streaming Time (hrs/day): double (nullable = true)
 |-- Gaming Time (hrs/day): double (nullable = true)
 |-- Monthly Recharge Cost (INR): long (nullable = true)
 |-- Primary Use: string (nullable = true)



In [0]:
## Preview the first five rows
data.show(n=5)

+-------+---+------+---------+-----------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+
|User ID|Age|Gender| Location|Phone Brand|     OS|Screen Time (hrs/day)|Data Usage (GB/month)|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|  Primary Use|
+-------+---+------+---------+-----------+-------+---------------------+---------------------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+
| U00001| 53|  Male|   Mumbai|       Vivo|Android|                  3.7|                 23.9|                     37.9|    

In [0]:
## List the column names of the loaded table
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage (GB/month)',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use']

In [0]:
## Add a derived column: daily screen time converted from hours to minutes
screen_hours = data["Screen Time (hrs/day)"]
data = data.withColumn("Screen Time in Minutes", screen_hours * 60)


In [0]:
## List the columns again; "Screen Time in Minutes" should now be the last entry
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage (GB/month)',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
## Show the derived minutes column next to the hours column it was computed from
screen_time_view = data.select("Screen Time in Minutes", "Screen Time (hrs/day)")
screen_time_view.show()

+----------------------+---------------------+
|Screen Time in Minutes|Screen Time (hrs/day)|
+----------------------+---------------------+
|                 222.0|                  3.7|
|                 552.0|                  9.2|
|                 270.0|                  4.5|
|                 660.0|                 11.0|
|                 132.0|                  2.2|
|                 324.0|                  5.4|
|                 360.0|                  6.0|
|                 186.0|                  3.1|
|                 318.0|                  5.3|
|                 594.0|                  9.9|
|                  96.0|                  1.6|
|                 450.0|                  7.5|
|                 630.0|                 10.5|
|                 618.0|                 10.3|
|                 102.0|                  1.7|
|                 306.0|                  5.1|
|                 186.0|                  3.1|
|                 696.0|                 11.6|
|            

In [0]:
## Rename the data-usage column to a shorter name, then list the columns to confirm
data = data.withColumnRenamed(existing="Data Usage (GB/month)", new="Data Usage")
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
## Keep only users older than 30 and display them
older_than_30 = data["Age"] > 30
data.where(older_than_30).show()

+-------+---+------+---------+------------+-------+---------------------+----------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+----------------------+
|User ID|Age|Gender| Location| Phone Brand|     OS|Screen Time (hrs/day)|Data Usage|Calls Duration (mins/day)|Number of Apps Installed|Social Media Time (hrs/day)|E-commerce Spend (INR/month)|Streaming Time (hrs/day)|Gaming Time (hrs/day)|Monthly Recharge Cost (INR)|  Primary Use|Screen Time in Minutes|
+-------+---+------+---------+------------+-------+---------------------+----------+-------------------------+------------------------+---------------------------+----------------------------+------------------------+---------------------+---------------------------+-------------+----------------------+
| U00001| 53|  Male|   Mumbai|        Vivo|Android|                  3.7|      23.9| 

In [0]:
## Build a copy of the data with fully duplicated rows removed
df_removed_data = data.dropDuplicates()

In [0]:
## Compare the row count before and after de-duplication;
## equal counts mean the data contains no duplicate rows.
print(data.count())
print(df_removed_data.count())
## No duplicates available

17686
17686


In [0]:
## Sort by age (youngest first) and show a few identifying columns
(
    data.sort("Age", ascending=True)
    .select("Age", "User ID", "Location", "Gender")
    .show()
)

+---+-------+---------+------+
|Age|User ID| Location|Gender|
+---+-------+---------+------+
| 15| U00958|   Mumbai| Other|
| 15| U00211|  Kolkata| Other|
| 15| U00027|    Delhi|  Male|
| 15| U00118|   Jaipur| Other|
| 15| U00461|  Lucknow|  Male|
| 15| U00249|  Kolkata| Other|
| 15| U00623|Bangalore|Female|
| 15| U00471|    Delhi|  Male|
| 15| U00103|   Mumbai| Other|
| 15| U00528|     Pune|Female|
| 15| U00684|     Pune|Female|
| 15| U00770|   Mumbai| Other|
| 15| U00641|    Delhi| Other|
| 15| U00115|     Pune| Other|
| 15| U00890|     Pune|  Male|
| 15| U00527|    Delhi| Other|
| 15| U01087|Hyderabad|Female|
| 15| U00272|   Mumbai| Other|
| 15| U00593|Hyderabad| Other|
| 15| U01030|   Mumbai| Other|
+---+-------+---------+------+
only showing top 20 rows


In [0]:
## List the current column names
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
## Drop Columns
## drop() returns a new DataFrame without "Location"; `data` itself is not modified.
data_drop = data.drop("Location")
data_drop.columns

['User ID',
 'Age',
 'Gender',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
## List the columns of `data`, the DataFrame the drop was called on
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
## Aggregations
## Total daily screen time summed over all rows
data.groupBy().sum("Screen Time (hrs/day)").show()

+--------------------------+
|sum(Screen Time (hrs/day))|
+--------------------------+
|        115779.19999999987|
+--------------------------+



In [0]:
## Largest daily screen time over all rows
data.groupBy().max("Screen Time (hrs/day)").show()

+--------------------------+
|max(Screen Time (hrs/day))|
+--------------------------+
|                      12.0|
+--------------------------+



In [0]:
## Look at the phone-brand column on its own
data.select(data["Phone Brand"]).show()

+------------+
| Phone Brand|
+------------+
|        Vivo|
|      Realme|
|       Nokia|
|     Samsung|
|      Xiaomi|
|        Oppo|
|       Apple|
|      Realme|
|        Oppo|
|       Apple|
|       Nokia|
|        Oppo|
|      Realme|
|      Realme|
|     Samsung|
|Google Pixel|
|       Apple|
|       Apple|
|Google Pixel|
|    Motorola|
+------------+
only showing top 20 rows


In [0]:
## Does phone brand affect screen time? Average daily screen time per brand.
## The averages are nearly identical across brands (about 6.4-6.6 hrs/day),
## so brand shows no real effect.
data.groupBy("Phone Brand").agg({"Screen Time (hrs/day)": "avg"}).show()

+------------+--------------------------+
| Phone Brand|avg(Screen Time (hrs/day))|
+------------+--------------------------+
|      Realme|         6.635754824063566|
|Google Pixel|         6.432793522267206|
|      Xiaomi|         6.641874653355516|
|        Oppo|         6.392948717948725|
|       Nokia|         6.613436123348007|
|        Vivo|         6.511407902058977|
|     OnePlus|         6.545213060320976|
|     Samsung|         6.600963718820852|
|       Apple|         6.523042253521117|
|    Motorola|         6.557018054746651|
+------------+--------------------------+



In [0]:
## Average user age per phone brand
data.groupBy("Phone Brand").agg({"Age": "avg"}).show()

+------------+------------------+
| Phone Brand|          avg(Age)|
+------------+------------------+
|      Realme|37.701475595913735|
|Google Pixel| 38.13186813186813|
|      Xiaomi|37.448696616749864|
|        Oppo| 37.30769230769231|
|       Nokia|37.272577092511014|
|        Vivo| 37.06956037840846|
|     OnePlus| 37.64748201438849|
|     Samsung|37.678004535147394|
|       Apple|  37.5769014084507|
|    Motorola|38.044263249854396|
+------------+------------------+



In [0]:
## Oldest user age per phone brand
data.groupBy("Phone Brand").agg({"Age": "max"}).show()

+------------+--------+
| Phone Brand|max(Age)|
+------------+--------+
|      Realme|      60|
|Google Pixel|      60|
|      Xiaomi|      60|
|        Oppo|      60|
|       Nokia|      60|
|        Vivo|      60|
|     OnePlus|      60|
|     Samsung|      60|
|       Apple|      60|
|    Motorola|      60|
+------------+--------+



In [0]:
## Number of rows for each Gender value
data.groupBy("Gender").count().show()

+------+-----+
|Gender|count|
+------+-----+
|Female| 5969|
| Other| 5892|
|  Male| 5825|
+------+-----+



In [0]:
## List the current column names
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
## Average data usage per gender
data.groupBy("Gender").avg("Data Usage").show()

+------+------------------+
|Gender|   avg(Data Usage)|
+------+------------------+
|Female|25.447026302563238|
| Other| 25.43238289205704|
|  Male|25.353236051502158|
+------+------------------+



In [0]:
## Average data usage per gender, keeping only groups whose average exceeds 25.36.
## Backticks quote the column name because it contains a space.
avg_usage_by_gender = data.groupBy("Gender").agg({"`Data Usage`": "avg"})
avg_usage_by_gender.where("avg(`Data Usage`) > 25.36").show()

+------+------------------+
|Gender|   avg(Data Usage)|
+------+------------------+
|Female|25.447026302563238|
| Other| 25.43238289205704|
+------+------------------+



In [0]:
## List the current column names
data.columns

['User ID',
 'Age',
 'Gender',
 'Location',
 'Phone Brand',
 'OS',
 'Screen Time (hrs/day)',
 'Data Usage',
 'Calls Duration (mins/day)',
 'Number of Apps Installed',
 'Social Media Time (hrs/day)',
 'E-commerce Spend (INR/month)',
 'Streaming Time (hrs/day)',
 'Gaming Time (hrs/day)',
 'Monthly Recharge Cost (INR)',
 'Primary Use',
 'Screen Time in Minutes']

In [0]:
# Per OS: total data usage (sum) and average daily screen time (avg)
data.groupBy("OS").agg({"Data Usage": "sum", "Screen Time (hrs/day)": "avg"}).show()

+-------+------------------+--------------------------+
|     OS|   sum(Data Usage)|avg(Screen Time (hrs/day))|
+-------+------------------+--------------------------+
|    iOS|223198.20000000004|         6.532178834182248|
|Android|226225.29999999894|        6.5605468308665875|
+-------+------------------+--------------------------+



In [0]:
from pyspark.sql.functions import stddev
## Standard deviation of Age over the whole table
data.agg(stddev("Age")).show()

+------------------+
|       stddev(Age)|
+------------------+
|13.338251788617141|
+------------------+



In [0]:
from pyspark.sql.functions import variance
## Variance of Age over the whole table
data.agg(variance("Age")).show()

+------------------+
|     variance(Age)|
+------------------+
|177.90896077654835|
+------------------+



In [0]:
from pyspark.sql.functions import concat, upper, lower, length


In [0]:
## Join User ID and Gender into one string column and show it beside its inputs
id_with_gender = concat(data["User ID"], data["Gender"]).alias("new column")
data.select("Gender", "User ID", id_with_gender).show()

+------+-------+------------+
|Gender|User ID|  new column|
+------+-------+------------+
|  Male| U00001|  U00001Male|
| Other| U00002| U00002Other|
|Female| U00003|U00003Female|
|  Male| U00004|  U00004Male|
|  Male| U00005|  U00005Male|
|  Male| U00006|  U00006Male|
|Female| U00007|U00007Female|
| Other| U00008| U00008Other|
|Female| U00009|U00009Female|
| Other| U00010| U00010Other|
| Other| U00011| U00011Other|
|Female| U00012|U00012Female|
|  Male| U00013|  U00013Male|
|  Male| U00014|  U00014Male|
|Female| U00015|U00015Female|
|  Male| U00016|  U00016Male|
| Other| U00017| U00017Other|
|  Male| U00018|  U00018Male|
|Female| U00019|U00019Female|
| Other| U00020| U00020Other|
+------+-------+------------+
only showing top 20 rows


In [0]:
## Look at the Gender column on its own
data.select(data["Gender"]).show()

+------+
|Gender|
+------+
|  Male|
| Other|
|Female|
|  Male|
|  Male|
|  Male|
|Female|
| Other|
|Female|
| Other|
| Other|
|Female|
|  Male|
|  Male|
|Female|
|  Male|
| Other|
|  Male|
|Female|
| Other|
+------+
only showing top 20 rows


In [0]:
## Upper-case the Gender values and show them beside the originals
gender_upper = upper(data["Gender"]).alias("gender upper")
data.select("Gender", gender_upper).show()

+------+------------+
|Gender|gender upper|
+------+------------+
|  Male|        MALE|
| Other|       OTHER|
|Female|      FEMALE|
|  Male|        MALE|
|  Male|        MALE|
|  Male|        MALE|
|Female|      FEMALE|
| Other|       OTHER|
|Female|      FEMALE|
| Other|       OTHER|
| Other|       OTHER|
|Female|      FEMALE|
|  Male|        MALE|
|  Male|        MALE|
|Female|      FEMALE|
|  Male|        MALE|
| Other|       OTHER|
|  Male|        MALE|
|Female|      FEMALE|
| Other|       OTHER|
+------+------------+
only showing top 20 rows


In [0]:
## Lower-case the Gender values and show them beside the originals
gender_lower = lower(data["Gender"]).alias("gender lower")
data.select("Gender", gender_lower).show()

+------+------------+
|Gender|gender lower|
+------+------------+
|  Male|        male|
| Other|       other|
|Female|      female|
|  Male|        male|
|  Male|        male|
|  Male|        male|
|Female|      female|
| Other|       other|
|Female|      female|
| Other|       other|
| Other|       other|
|Female|      female|
|  Male|        male|
|  Male|        male|
|Female|      female|
|  Male|        male|
| Other|       other|
|  Male|        male|
|Female|      female|
| Other|       other|
+------+------------+
only showing top 20 rows


In [0]:
## Character length of each Gender value, shown beside the value itself
gender_length = length(data["Gender"]).alias("gender length")
data.select("Gender", gender_length).show()

+------+-------------+
|Gender|gender length|
+------+-------------+
|  Male|            4|
| Other|            5|
|Female|            6|
|  Male|            4|
|  Male|            4|
|  Male|            4|
|Female|            6|
| Other|            5|
|Female|            6|
| Other|            5|
| Other|            5|
|Female|            6|
|  Male|            4|
|  Male|            4|
|Female|            6|
|  Male|            4|
| Other|            5|
|  Male|            4|
|Female|            6|
| Other|            5|
+------+-------------+
only showing top 20 rows


In [0]:
## Date Time
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub

## Bare reference without parentheses: this displays the function object itself,
## not today's date. Call current_date() to get a date column.
current_date

<function pyspark.sql.functions.builtin.current_date() -> pyspark.sql.column.Column>

In [0]:
## Attach today's date to every row and show it beside Age
today = current_date().alias("Current Date")
data.select("Age", today).show()

+---+------------+
|Age|Current Date|
+---+------------+
| 53|  2025-06-17|
| 60|  2025-06-17|
| 37|  2025-06-17|
| 32|  2025-06-17|
| 16|  2025-06-17|
| 21|  2025-06-17|
| 57|  2025-06-17|
| 56|  2025-06-17|
| 46|  2025-06-17|
| 44|  2025-06-17|
| 55|  2025-06-17|
| 41|  2025-06-17|
| 53|  2025-06-17|
| 35|  2025-06-17|
| 33|  2025-06-17|
| 52|  2025-06-17|
| 46|  2025-06-17|
| 54|  2025-06-17|
| 50|  2025-06-17|
| 40|  2025-06-17|
+---+------------+
only showing top 20 rows


In [0]:
## Attach the current timestamp to every row and show it beside Age
now = current_timestamp().alias("Current TimeStamp")
data.select("Age", now).show()

+---+--------------------+
|Age|   Current TimeStamp|
+---+--------------------+
| 53|2025-06-17 16:04:...|
| 60|2025-06-17 16:04:...|
| 37|2025-06-17 16:04:...|
| 32|2025-06-17 16:04:...|
| 16|2025-06-17 16:04:...|
| 21|2025-06-17 16:04:...|
| 57|2025-06-17 16:04:...|
| 56|2025-06-17 16:04:...|
| 46|2025-06-17 16:04:...|
| 44|2025-06-17 16:04:...|
| 55|2025-06-17 16:04:...|
| 41|2025-06-17 16:04:...|
| 53|2025-06-17 16:04:...|
| 35|2025-06-17 16:04:...|
| 33|2025-06-17 16:04:...|
| 52|2025-06-17 16:04:...|
| 46|2025-06-17 16:04:...|
| 54|2025-06-17 16:04:...|
| 50|2025-06-17 16:04:...|
| 40|2025-06-17 16:04:...|
+---+--------------------+
only showing top 20 rows


In [0]:
## Adding 10 days to current date
ten_days_ahead = date_add(current_date(), 10).alias("Future_date")
data.select("Age", ten_days_ahead).show()


+---+-----------+
|Age|Future_date|
+---+-----------+
| 53| 2025-06-27|
| 60| 2025-06-27|
| 37| 2025-06-27|
| 32| 2025-06-27|
| 16| 2025-06-27|
| 21| 2025-06-27|
| 57| 2025-06-27|
| 56| 2025-06-27|
| 46| 2025-06-27|
| 44| 2025-06-27|
| 55| 2025-06-27|
| 41| 2025-06-27|
| 53| 2025-06-27|
| 35| 2025-06-27|
| 33| 2025-06-27|
| 52| 2025-06-27|
| 46| 2025-06-27|
| 54| 2025-06-27|
| 50| 2025-06-27|
| 40| 2025-06-27|
+---+-----------+
only showing top 20 rows


In [0]:
## Subtracting 10 days from current date
ten_days_ago = date_sub(current_date(), 10).alias("Past_date")
data.select("Age", ten_days_ago).show()

+---+----------+
|Age| Past_date|
+---+----------+
| 53|2025-06-07|
| 60|2025-06-07|
| 37|2025-06-07|
| 32|2025-06-07|
| 16|2025-06-07|
| 21|2025-06-07|
| 57|2025-06-07|
| 56|2025-06-07|
| 46|2025-06-07|
| 44|2025-06-07|
| 55|2025-06-07|
| 41|2025-06-07|
| 53|2025-06-07|
| 35|2025-06-07|
| 33|2025-06-07|
| 52|2025-06-07|
| 46|2025-06-07|
| 54|2025-06-07|
| 50|2025-06-07|
| 40|2025-06-07|
+---+----------+
only showing top 20 rows
