<a href="https://colab.research.google.com/github/Dobby-Mphahlele/Problem-set-2/blob/main/solution_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=46d26a33808e4d2085b3d1e8846be797a39aeaf92d5ecc4d69981c4b6500b70a
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


Initialize Spark session

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Read IPDR CSV into DataFrame") \
    .getOrCreate()

Read CSV file into PySpark DataFrame

In [37]:
csv_file_path = 'ipdr.csv'

df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Show the first few rows of the DataFrame
df.show()

+------------------+------------------+------+--------+--------+------+
|         starttime|           endtime|msisdn|ulvolume|dlvolume|domain|
+------------------+------------------+------+--------+--------+------+
|2021-04-0212:23:10|2021-04-0212:24:48|     1|   10819|    9960|  app1|
|2021-04-0212:28:56|2021-04-0212:33:12|     1|   16067|   10663|  app1|
|2021-04-0216:24:21|2021-04-0216:38:28|     2|    1173|    4265|  app1|
|2021-04-0215:08:57|2021-04-0215:20:56|     2|    1200|     192|  app2|
|2021-04-0215:08:57|2021-04-0215:20:41|     2|  175130|  101657|  app2|
|2021-04-0215:09:22|2021-04-0215:21:02|     2|    1440|       0|  app2|
|2021-04-0215:09:22|2021-04-0215:20:35|     2|    3672|    1152|  app2|
|2021-04-0215:09:22|2021-04-0215:21:08|     2|    1440|       0|  app2|
|2021-04-0215:54:46|2021-04-0216:11:49|     3|   10932|    5504|  app3|
|2021-04-0215:54:46|2021-04-0215:55:34|     3|     500|     580|  app3|
|2021-04-0215:54:46|2021-04-0215:54:47|     3|     592|     656|

# Selecting each MSISDN and specific start and end datetime domain/app-wise.

In [38]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, col
from pyspark.sql.types import TimestampType

# Create a temporary view for SQL queries
df.createOrReplaceTempView("ipdr")

# Select each MSISDN and specific start and end datetime domain/app-wise
query = """
SELECT
    msisdn,
    domain,
    MIN(starttime) AS Start_Time,
    MAX(endtime) AS End_Time,
    SUM(dlvolume) AS Total_DL_Volume,
    SUM(ulvolume) AS Total_UL_Volume
FROM ipdr
GROUP BY msisdn, domain
"""
grouped_df = spark.sql(query)

# Convert Start_Time and End_Time columns to TimestampType
grouped_df = grouped_df.withColumn("Start_Time", from_unixtime(unix_timestamp(col("Start_Time"), "yyyy-MM-ddHH:mm:ss"), "yyyy-MM-dd HH:mm:ss").cast(TimestampType()))
grouped_df = grouped_df.withColumn("End_Time", from_unixtime(unix_timestamp(col("End_Time"), "yyyy-MM-ddHH:mm:ss"), "yyyy-MM-dd HH:mm:ss").cast(TimestampType()))

# Register the DataFrame as a temp view
grouped_df.createOrReplaceTempView("grouped_ipdr")

# Show the result
grouped_df.show()






+------+------+-------------------+-------------------+---------------+---------------+
|msisdn|domain|         Start_Time|           End_Time|Total_DL_Volume|Total_UL_Volume|
+------+------+-------------------+-------------------+---------------+---------------+
|     1|  app1|2021-04-02 12:23:10|2021-04-02 12:33:12|          20623|          26886|
|     2|  app1|2021-04-02 16:24:21|2021-04-02 16:38:28|           4265|           1173|
|     2|  app2|2021-04-02 15:08:57|2021-04-02 15:21:08|         103001|         182882|
|     3|  app3|2021-04-02 15:54:46|2021-04-02 16:11:49|        1801756|        2104664|
|     3|  app4|2021-04-02 12:13:11|2021-04-02 17:57:03|        8287876|        8034960|
|     4|  app3|2021-04-02 12:14:42|2021-04-02 21:04:25|         415065|         667564|
|     4|  app4|2021-04-02 22:27:36|2021-04-02 22:34:55|          33656|         218076|
+------+------+-------------------+-------------------+---------------+---------------+



## Calculate start time (ST), end time (ET), and adjusted ET.

In [31]:
# Calculate start time (ST), end time (ET), and adjusted ET
# Calculate adjusted end time (Adjusted_ET)
adjusted_query = """
SELECT
    *,
    unix_timestamp(End_Time) AS Unix_End_Time,
    unix_timestamp(Start_Time) AS Unix_Start_Time,
    CASE
        WHEN unix_timestamp(End_Time) - 600 < unix_timestamp(Start_Time) THEN End_Time
        ELSE from_unixtime(unix_timestamp(End_Time) - 600)
    END AS Adjusted_ET
FROM grouped_ipdr
"""
adjusted_df = spark.sql(adjusted_query)
adjusted_df.createOrReplaceTempView("adjusted_ipdr")
adjusted_df.show()


+------+------+-------------------+-------------------+---------------+---------------+-------------+---------------+-------------------+
|msisdn|domain|         Start_Time|           End_Time|Total_DL_Volume|Total_UL_Volume|Unix_End_Time|Unix_Start_Time|        Adjusted_ET|
+------+------+-------------------+-------------------+---------------+---------------+-------------+---------------+-------------------+
|     1|  app1|2021-04-02 12:23:10|2021-04-02 12:33:12|          20623|          26886|   1617366792|     1617366190|2021-04-02 12:23:12|
|     2|  app1|2021-04-02 16:24:21|2021-04-02 16:38:28|           4265|           1173|   1617381508|     1617380661|2021-04-02 16:28:28|
|     2|  app2|2021-04-02 15:08:57|2021-04-02 15:21:08|         103001|         182882|   1617376868|     1617376137|2021-04-02 15:11:08|
|     3|  app3|2021-04-02 15:54:46|2021-04-02 16:11:49|        1801756|        2104664|   1617379909|     1617378886|2021-04-02 16:01:49|
|     3|  app4|2021-04-02 12:13:11

## Calculating total volume in Kb, total time in seconds, and bit rate in kbps.

In [32]:
from pyspark.sql.functions import unix_timestamp, col

# Calculate total volume in Kb and total time in seconds
query = """
SELECT
    *,
    (Total_DL_Volume + Total_UL_Volume) / 1024 AS Total_Volume_Kb,
    (UNIX_TIMESTAMP(End_Time, "yyyy-MM-ddHH:mm:ss") - UNIX_TIMESTAMP(Start_Time, "yyyy-MM-ddHH:mm:ss")) AS Total_Time_Sec
FROM adjusted_ipdr
"""
volume_time_df = spark.sql(query)
volume_time_df.createOrReplaceTempView("volume_time_ipdr")
volume_time_df.show()

# Calculate bit rate in kbps
query = """
SELECT
    *,
    (Total_Volume_Kb / Total_Time_Sec) * 8 AS Bit_Rate_Kbps
FROM volume_time_ipdr
"""
bit_rate_df = spark.sql(query)
bit_rate_df.createOrReplaceTempView("bit_rate_ipdr")
bit_rate_df.show()



+------+------+-------------------+-------------------+---------------+---------------+-------------+---------------+-------------------+---------------+--------------+
|msisdn|domain|         Start_Time|           End_Time|Total_DL_Volume|Total_UL_Volume|Unix_End_Time|Unix_Start_Time|        Adjusted_ET|Total_Volume_Kb|Total_Time_Sec|
+------+------+-------------------+-------------------+---------------+---------------+-------------+---------------+-------------------+---------------+--------------+
|     1|  app1|2021-04-02 12:23:10|2021-04-02 12:33:12|          20623|          26886|   1617366792|     1617366190|2021-04-02 12:23:12|  46.3955078125|           602|
|     2|  app1|2021-04-02 16:24:21|2021-04-02 16:38:28|           4265|           1173|   1617381508|     1617380661|2021-04-02 16:28:28|    5.310546875|           847|
|     2|  app2|2021-04-02 15:08:57|2021-04-02 15:21:08|         103001|         182882|   1617376868|     1617376137|2021-04-02 15:11:08| 279.1826171875|  

## Identifying audio and video calls and their counts.

In [34]:
# Identify audio and video calls and their counts
query = """
SELECT
    *,
    CASE
        WHEN Bit_Rate_Kbps <= 200 THEN 1
        ELSE 0
    END AS isAudio,
    CASE
        WHEN Bit_Rate_Kbps > 200 THEN 1
        ELSE 0
    END AS isVideo
FROM bit_rate_ipdr
"""
final_df = spark.sql(query)
final_df.show()



+------+------+-------------------+-------------------+---------------+---------------+-------------+---------------+-------------------+---------------+--------------+-------------------+-------+-------+
|msisdn|domain|         Start_Time|           End_Time|Total_DL_Volume|Total_UL_Volume|Unix_End_Time|Unix_Start_Time|        Adjusted_ET|Total_Volume_Kb|Total_Time_Sec|      Bit_Rate_Kbps|isAudio|isVideo|
+------+------+-------------------+-------------------+---------------+---------------+-------------+---------------+-------------------+---------------+--------------+-------------------+-------+-------+
|     1|  app1|2021-04-02 12:23:10|2021-04-02 12:33:12|          20623|          26886|   1617366792|     1617366190|2021-04-02 12:23:12|  46.3955078125|           602| 0.6165515988372093|      1|      0|
|     2|  app1|2021-04-02 16:24:21|2021-04-02 16:38:28|           4265|           1173|   1617381508|     1617380661|2021-04-02 16:28:28|    5.310546875|           847| 0.050158648