<a href="https://colab.research.google.com/github/Dobby-Mphahlele/Problem-set-2/blob/main/Dobby_VoIP_Call_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install pyspark

In [None]:
!pip install pyspark


### 1. Loading the data

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, sum as spark_sum, min as spark_min, max as spark_max


# Raw CSV on GitHub, pinned to a specific commit
url = 'https://raw.githubusercontent.com/Dobby-Mphahlele/Problem-set-2/1b478910cfb09c6768b5ebb490fe80c302d4b5ba/ipdr.csv'

# Create the Spark session (or reuse one that is already running)
spark = (
    SparkSession.builder
    .appName("IPDR Analysis")
    .getOrCreate()
)

# pandas downloads and parses the CSV; the result is then handed over to Spark
df_pd = pd.read_csv(url)
spark_df = spark.createDataFrame(df_pd)

# Quick sanity check: inferred schema plus the first five rows
spark_df.printSchema()
spark_df.show(n=5)


Converting datetime columns to timestamp type

In [None]:

# Pattern used to parse the raw time strings. It has no separator between the
# date and the time part -- NOTE(review): confirm this matches the CSV values.
timestamp_pattern = "yyyy-MM-ddHH:mm:ss"

# Add ST / ET: starttime / endtime parsed to epoch seconds and cast to timestamp
spark_df = (
    spark_df
    .withColumn("ST", unix_timestamp(col("starttime"), timestamp_pattern).cast("timestamp"))
    .withColumn("ET", unix_timestamp(col("endtime"), timestamp_pattern).cast("timestamp"))
)

# List the (column, type) pairs to verify the two new timestamp columns
spark_df.dtypes


# Selecting, for each MSISDN and domain/app, the earliest start and latest end datetime:

In [None]:
# Treat every (msisdn, domain) pair as one call and collapse its records into
# a single row: earliest start, latest end and summed download/upload volume.
#
# The parsed timestamp columns ST / ET are aggregated instead of the raw
# starttime / endtime strings, so First_ST and Last_ET are real timestamps.
# Interval arithmetic and comparisons on them then operate on time values
# rather than on text. UNIX_TIMESTAMP accepts timestamp input as well, so
# queries that pass these columns together with a format keep working.
call_df = spark_df.groupBy("msisdn", "domain").agg(
    spark_min("ST").alias("First_ST"),
    spark_max("ET").alias("Last_ET"),
    spark_sum("dlvolume").alias("Total_DL_Volume"),
    spark_sum("ulvolume").alias("Total_UL_Volume"),
)

# Show the aggregated data
call_df.show()


### Calculating ET* (ET - 10 min) for each FDR to exclude idle time

In [None]:
from pyspark.sql.functions import expr, col

# Layout of the start/end values, identical to the pattern this notebook uses
# when it parses them with unix_timestamp.
ts_fmt = "yyyy-MM-ddHH:mm:ss"

# Parse explicitly instead of relying on Spark's implicit string-to-timestamp
# cast: the implicit cast does not know this custom layout, so the subtraction
# would evaluate to NULL and the idle-time adjustment would silently be lost.
# to_timestamp also accepts columns that already are timestamps.
last_et_ts = f"to_timestamp(Last_ET, '{ts_fmt}')"
first_st_ts = f"to_timestamp(First_ST, '{ts_fmt}')"

# End time with the trailing 10 minutes removed
# (NULL when Last_ET is NULL or cannot be parsed).
call_df = call_df.withColumn(
    "ET_minus_10min",
    expr(f"{last_et_ts} - interval 10 minutes"),
)

# If removing 10 minutes would move the end before the start, keep the
# original end time; otherwise use the shortened one.
call_df = call_df.withColumn(
    "Final_ET",
    expr(
        f"CASE WHEN ET_minus_10min < {first_st_ts} "
        f"THEN {last_et_ts} ELSE ET_minus_10min END"
    ),
)

# Show the updated DataFrame
call_df.show()



### Calculating total volume of each call in Kb

In [None]:
# Combined traffic of the call: download plus upload
# (the notebook treats both volume columns as byte counts).
total_bytes = col("Total_DL_Volume") + col("Total_UL_Volume")

# Scale by 1024 and store the result as Total_Volume_Kb
call_df = call_df.withColumn("Total_Volume_Kb", total_bytes / 1024)

# Expose the per-call table to Spark SQL under the name 'ipdr'
call_df.createOrReplaceTempView('ipdr')

# Display the table including the new volume column
call_df.show()



### Calculating total time of each call in seconds

In [None]:
from pyspark.sql.functions import unix_timestamp, col

# Call duration in seconds: idle-time-adjusted end (Final_ET) minus first
# start (First_ST). Final_ET is used instead of Last_ET so that the adjustment
# computed for the 'ipdr' view actually influences the duration.
# UNIX_TIMESTAMP applies the format only to string input; for timestamp
# columns the format argument is ignored.
total_time_df = spark.sql("""
  SELECT
    *,
    (UNIX_TIMESTAMP(Final_ET, "yyyy-MM-ddHH:mm:ss") - UNIX_TIMESTAMP(First_ST, "yyyy-MM-ddHH:mm:ss")) AS Total_Time_Sec
  FROM ipdr
""")

# Make the result available to the next SQL step and display it
total_time_df.createOrReplaceTempView("volume_time_ipdr")
total_time_df.show()






### Calculating bit rate (kbps) of each call

In [None]:
# Bit rate per call: volume divided by duration, scaled by 1000.
# NULLIF turns a zero duration into NULL, so the division yields NULL instead
# of raising a divide-by-zero error when ANSI SQL mode is enabled.
# NOTE(review): Total_Volume_Kb is (DL + UL) / 1024 and the quotient is
# multiplied by 1000 -- confirm that this really produces kilobits per second.
bit_rate_df = spark.sql(
    """
    SELECT
      *,
      (Total_Volume_Kb / NULLIF(Total_Time_Sec, 0)) * 1000 AS Bit_Rate_Kbps
    FROM volume_time_ipdr
    """
)

# Make the result available to the next SQL step and display it
bit_rate_df.createOrReplaceTempView("bit_rate_ipdr")
bit_rate_df.show()

### Identification of Audio or Video call and its count

In [None]:
# Classification query:
#   * rows with a bit rate below 10 (or NULL) do not pass the WHERE clause
#   * bit rate <= 200 -> isAudio = 'Yes', isVideo = 'No'
#   * bit rate  > 200 -> isAudio = 'No',  isVideo = 'Yes'
classification_sql = """
SELECT
    *,
    CASE
        WHEN Bit_Rate_Kbps <= 200 THEN 'Yes'
        ELSE 'No'
    END AS isAudio,
    CASE
        WHEN Bit_Rate_Kbps > 200 THEN 'Yes'
        ELSE 'No'
    END AS isVideo
FROM bit_rate_ipdr
WHERE Bit_Rate_Kbps >= 10  --- Filtering out calls with bit rate < 10 kbps
"""

# Run the query against the bit_rate_ipdr view and display the labelled calls
result_df = spark.sql(classification_sql)
result_df.show()

### Results

In [None]:

# Display only the report columns. select() returns a new DataFrame, so show()
# has to be called on its result; calling it on result_df would print every
# column and ignore the selection.
result_df.select(
    "msisdn",
    "domain",
    "Total_Time_Sec",
    "Total_Volume_Kb",
    "Bit_Rate_kbps",
    "isAudio",
    "isVideo",
).show(truncate=False)



# Final Output

In [None]:
# Register result_df as a temporary view so it can be queried with Spark SQL
result_df.createOrReplaceTempView("result_df")

# Build the final report. result_df holds one row per (msisdn, domain) because
# it derives from the per-call aggregation, so no GROUP BY is needed; the
# window function adds, to every row, the number of rows sharing its msisdn.
# NOTE(review): fdr_count therefore counts the remaining calls per msisdn, not
# the raw records behind each call -- confirm this is the intended meaning.
final_df = spark.sql("""
  SELECT
    msisdn,
    domain,
    Total_Time_Sec,
    Total_Volume_Kb,
    Bit_Rate_kbps AS kbps,
    COUNT(*) OVER (PARTITION BY msisdn) AS fdr_count,
    isAudio,
    isVideo
  FROM result_df
""")

final_df.show(truncate=False)


