# Read CSV files of FWM Set top box (STB) data into dataframes

Read a file list into a DF, write it to a parquet file .

Noam 2023-02-06


In [40]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import*

spark = SparkSession.builder.appName('fwm').getOrCreate()
sc = spark.sparkContext

In [41]:
# Read a CSV into a dataframe
def load_PD_file(filename, schema) :
    dataPath = "/datasets/Fourthwall_STB_data/" + filename
    df = spark.read.format("csv")\
      .option("header","false")\
      .option("delimiter", "|")\
      .schema(schema)\
      .load(dataPath)
    return df
  
# inferSchema means we will automatically figure out column types 
# at a cost of reading the data more than once

In [67]:
schema =  StructType([StructField('prog_code',StringType()),
                     StructField('title',StringType()),
                     StructField('genre',StringType()),
                     StructField('air_date',StringType()),
                     StructField('air_time',StringType()),
                     StructField('Duration',FloatType())
                                       ])
x = load_PD_file("SintecMedia.rpt_programs.date_2015-12-30.2016-11-29.pd", schema  )
              

In [98]:
from pyspark.sql.functions import col,to_date,udf
def strtime_to_fload(val:str)-> float:
    h = 66# int(val)//10000
    #m = h /100
    return h
to_float_time = udf(strtime_to_fload, FloatType())

x.printSchema()
x2 = x.withColumn('airdate',to_date(col('air_date'),'yyyymmdd'))\
      .withColumn('airtime',to_float_time(col('air_time')))\
      .drop('air_date')
      #.drop('air_time')
x2.printSchema()
x2.show(5)

root
 |-- prog_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- air_date: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- Duration: float (nullable = true)

root
 |-- prog_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- Duration: float (nullable = true)
 |-- airdate: date (nullable = true)
 |-- airtime: float (nullable = true)

+--------------+-----------------+-----------+--------+--------+----------+-------+
|     prog_code|            title|      genre|air_time|Duration|   airdate|airtime|
+--------------+-----------------+-----------+--------+--------+----------+-------+
|EP000000510016|A Different World|     Sitcom|  140000|    30.0|2015-01-30|   null|
|EP000000510017|A Different World|     Sitcom|  143000|    30.0|2015-01-30|   null|
|EP000000510027|A Different World|     Sitcom|  220000|    30.0|2

In [69]:
y = load_PD_file("FWM_20151229_R.pd")
y.printSchema()
y.show()

TypeError: load_PD_file() missing 1 required positional argument: 'schema'

In [23]:
%%time
z = load_PD_file("SintecMedia.rpt_prog_view.date_2015-12-26.2016-11-29.pd")
#z.printSchema()
z = z.toDF(*['MSO','device_id','playback_date','unk','unk2', 'station_num'])
#z.show()



CPU times: user 6.03 ms, sys: 942 µs, total: 6.97 ms
Wall time: 7.13 s


                                                                                

In [25]:
%%time 
z.write.parquet("../data/prog_view.parquet")


23/02/08 18:46:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers

CPU times: user 5.64 ms, sys: 4.84 ms, total: 10.5 ms
Wall time: 12.2 s


                                                                                

In [63]:
del z
del y
del x


NameError: name 'z' is not defined

In [30]:
%%time 
z = spark.read.parquet("../data/prog_view.parquet")

CPU times: user 0 ns, sys: 2.23 ms, total: 2.23 ms
Wall time: 71.5 ms


In [35]:
%%time
z.count()
z.show(10)

+----+------------+-------------+------+-----+--------------+
| MSO|   device_id|playback_date|   unk| unk2|   station_num|
+----+------------+-------------+------+-----+--------------+
|8360|001bd75e12e3|     20151226| 63440|11713|SH007227490000|
|8360|001bd75e12e3|     20151226| 70000|11713|SH007227490000|
|8360|001bd75e12e3|     20151226| 80000|11713|EP000018937336|
|8360|001bd75e12e3|     20151226|182527|11713|EP000009937693|
|8360|001bd75e12e3|     20151226|182608|49603|EP021213560001|
|8360|001bd75e12e3|     20151226|183000|49603|EP021213560002|
|8360|001bd75e12e3|     20151226|184055|14902|EP017277730029|
|8360|001bd75e12e3|     20151226|190000|14902|EP020481370022|
|8360|001bd75e12e3|     20151226|193000|14902|EP020481370014|
|8360|001bd75e12e3|     20151226|195448|14909|EP013320550286|
+----+------------+-------------+------+-----+--------------+
only showing top 10 rows

CPU times: user 2.71 ms, sys: 0 ns, total: 2.71 ms
Wall time: 221 ms


In [33]:
z

DataFrame[MSO: int, device_id: string, playback_date: int, unk: int, unk2: int, station_num: string]