# Extract fileds

In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
from pyspark import SparkContext
from pyspark.sql import SparkSession
 
spark = SparkSession.builder.appName("project1_part1").getOrCreate()
sc = spark.sparkContext

In [0]:
# Read a CSV into a dataframe

def load_csv_file(filename, schema):
  # Reads the relevant file from distributed file system using the given schema

  allowed_files = {'Daily program data': ('Daily program data', "|"),
                   'demographic': ('demographic', "|")}

  if filename not in allowed_files.keys():
    print(f'You were trying to access unknown file \"{filename}\". Only valid options are {allowed_files.keys()}')
    return None

  filepath = allowed_files[filename][0]
  dataPath = f"dbfs:/mnt/coursedata2024/fwm-stb-data/{filepath}"
  delimiter = allowed_files[filename][1]

  df = spark.read.format("csv")\
    .option("header","false")\
    .option("delimiter",delimiter)\
    .schema(schema)\
    .load(dataPath)
  return df

# This dict holds the correct schemata for easily loading the CSVs
schemas_dict = {'Daily program data':
                  StructType([
                    StructField('prog_code', StringType()),
                    StructField('title', StringType()),
                    StructField('genre', StringType()),
                    StructField('air_date', StringType()),
                    StructField('air_time', StringType()),
                    StructField('Duration', FloatType())
                  ]),
                'viewing':
                  StructType([
                    StructField('device_id', StringType()),
                    StructField('event_date', StringType()),
                    StructField('event_time', IntegerType()),
                    StructField('mso_code', StringType()),
                    StructField('prog_code', StringType()),
                    StructField('station_num', StringType())
                  ]),
                'viewing_full':
                  StructType([
                    StructField('mso_code', StringType()),
                    StructField('device_id', StringType()),
                    StructField('event_date', IntegerType()),
                    StructField('event_time', IntegerType()),
                    StructField('station_num', StringType()),
                    StructField('prog_code', StringType())
                  ]),
                'demographic':
                  StructType([StructField('household_id',StringType()),
                    StructField('household_size',IntegerType()),
                    StructField('num_adults',IntegerType()),
                    StructField('num_generations',IntegerType()),
                    StructField('adult_range',StringType()),
                    StructField('marital_status',StringType()),
                    StructField('race_code',StringType()),
                    StructField('presence_children',StringType()),
                    StructField('num_children',IntegerType()),
                    StructField('age_children',StringType()), #format like range - 'bitwise'
                    StructField('age_range_children',StringType()),
                    StructField('dwelling_type',StringType()),
                    StructField('home_owner_status',StringType()),
                    StructField('length_residence',IntegerType()),
                    StructField('home_market_value',StringType()),
                    StructField('num_vehicles',IntegerType()),
                    StructField('vehicle_make',StringType()),
                    StructField('vehicle_model',StringType()),
                    StructField('vehicle_year',IntegerType()),
                    StructField('net_worth',IntegerType()),
                    StructField('income',StringType()),
                    StructField('gender_individual',StringType()),
                    StructField('age_individual',IntegerType()),
                    StructField('education_highest',StringType()),
                    StructField('occupation_highest',StringType()),
                    StructField('education_1',StringType()),
                    StructField('occupation_1',StringType()),
                    StructField('age_2',IntegerType()),
                    StructField('education_2',StringType()),
                    StructField('occupation_2',StringType()),
                    StructField('age_3',IntegerType()),
                    StructField('education_3',StringType()),
                    StructField('occupation_3',StringType()),
                    StructField('age_4',IntegerType()),
                    StructField('education_4',StringType()),
                    StructField('occupation_4',StringType()),
                    StructField('age_5',IntegerType()),
                    StructField('education_5',StringType()),
                    StructField('occupation_5',StringType()),
                    StructField('polit_party_regist',StringType()),
                    StructField('polit_party_input',StringType()),
                    StructField('household_clusters',StringType()),
                    StructField('insurance_groups',StringType()),
                    StructField('financial_groups',StringType()),
                    StructField('green_living',StringType())
                  ])
}

# Demographic data

In [0]:
%%time
# demographic data filename is 'demographic'
demo_df = load_csv_file('demographic', schemas_dict['demographic'])

demo_df.printSchema()
print(f'demo_df contains {demo_df.count()} records!')
display(demo_df.limit(6))

root
 |-- household_id: string (nullable = true)
 |-- household_size: integer (nullable = true)
 |-- num_adults: integer (nullable = true)
 |-- num_generations: integer (nullable = true)
 |-- adult_range: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- race_code: string (nullable = true)
 |-- presence_children: string (nullable = true)
 |-- num_children: integer (nullable = true)
 |-- age_children: string (nullable = true)
 |-- age_range_children: string (nullable = true)
 |-- dwelling_type: string (nullable = true)
 |-- home_owner_status: string (nullable = true)
 |-- length_residence: integer (nullable = true)
 |-- home_market_value: string (nullable = true)
 |-- num_vehicles: integer (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model: string (nullable = true)
 |-- vehicle_year: integer (nullable = true)
 |-- net_worth: integer (nullable = true)
 |-- income: string (nullable = true)
 |-- gender_individual: string (nullable = t

household_id,household_size,num_adults,num_generations,adult_range,marital_status,race_code,presence_children,num_children,age_children,age_range_children,dwelling_type,home_owner_status,length_residence,home_market_value,num_vehicles,vehicle_make,vehicle_model,vehicle_year,net_worth,income,gender_individual,age_individual,education_highest,occupation_highest,education_1,occupation_1,age_2,education_2,occupation_2,age_3,education_3,occupation_3,age_4,education_4,occupation_4,age_5,education_5,occupation_5,polit_party_regist,polit_party_input,household_clusters,insurance_groups,financial_groups,green_living
15,2.0,2.0,1.0,100000000,S,B,,,0,0,S,O,5.0,E,,,,,6.0,4.0,M,60.0,4.0,,,,,,,,,,,,,,,,,D,443,02C3,08C3,
24,2.0,2.0,1.0,100000000000,,W,,,0,0,M,O,,F,,,,,7.0,7.0,F,46.0,3.0,Z,,,,,,,,,,,,,,,,R,223,09O3,03O3,
26,,,,0,,,,,0,0,S,,,F,,,,,,,,,,,,,,,,,,,,,,,,,,,46G,04CG,08CG,
28,3.0,2.0,2.0,110000000000000,S,W,Y,1.0,10000000000000,1000000000,S,O,3.0,H,,,,,5.0,7.0,M,38.0,2.0,4,,,34.0,1.0,7.0,,,,,,,,,,,V,473,11R3,09C3,1.0
35,1.0,1.0,1.0,100000000000,,W,,,0,0,,,,G,,,,,4.0,,M,50.0,2.0,1,,,,,,,,,,,,,,,,D,523,13C3,08C3,
36,,,,0,,,,,0,0,,,,G,,,,,,,,,,,,,,,,,,,,,,,,,,,51G,10RG,10RG,


CPU times: user 13.1 ms, sys: 259 µs, total: 13.3 ms
Wall time: 1.75 s


In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, count, expr, when

#remove unwanted columns from demographic data
demo_df_col = ['household_id', 'household_size', 'num_adults', 'net_worth', 'income', 'green_living']
demo_df = demo_df.select(*demo_df_col)

#remove duplicates
demo_df = demo_df.dropDuplicates(["household_id"])

#add column representing the income column cast to int
demo_df = demo_df.withColumn(
    "income_int",
    expr("""
        CASE income
            WHEN 'A' THEN 10
            WHEN 'B' THEN 11
            WHEN 'C' THEN 12
            WHEN 'D' THEN 13
            ELSE CAST(income AS INT)
        END
    """)
)



demographic data after changes

In [0]:
demo_df.printSchema()
demo_df.show(10)


root
 |-- household_id: string (nullable = true)
 |-- household_size: integer (nullable = true)
 |-- num_adults: integer (nullable = true)
 |-- net_worth: integer (nullable = true)
 |-- income: string (nullable = true)
 |-- green_living: string (nullable = true)
 |-- income_int: integer (nullable = true)

+------------+--------------+----------+---------+------+------------+----------+
|household_id|household_size|num_adults|net_worth|income|green_living|income_int|
+------------+--------------+----------+---------+------+------------+----------+
|    00000122|             3|         2|        7|     5|           1|         5|
|    00000145|             1|         1|        5|  NULL|           1|      NULL|
|    00000169|          NULL|      NULL|     NULL|  NULL|        NULL|      NULL|
|    00000228|             3|         2|        6|     4|           1|         4|
|    00000238|             2|         1|        6|     8|        NULL|         8|
|    00000265|          NULL|      NU

# Daily program data

In [0]:
%%time
# daily_program data filename is 'Daily program data'
daily_prog_df = load_csv_file('Daily program data', schemas_dict['Daily program data'])

daily_prog_df.printSchema()
print(f'daily_prog_df contains {daily_prog_df.count()} records!')
display(daily_prog_df)


root
 |-- prog_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- air_date: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- Duration: float (nullable = true)

daily_prog_df contains 13194849 records!


prog_code,title,genre,air_date,air_time,Duration
EP000000250035,21 Jump Street,Crime drama,20151219,50000,60.0
EP000000250035,21 Jump Street,Crime drama,20151219,110000,60.0
EP000000250063,21 Jump Street,Crime drama,20151219,180000,60.0
EP000000510007,A Different World,Sitcom,20151219,100000,30.0
EP000000510008,A Different World,Sitcom,20151219,103000,30.0
EP000000510159,A Different World,Sitcom,20151219,80300,29.0
EP000000510159,A Different World,Sitcom,20151219,110300,29.0
EP000000510161,A Different World,Sitcom,20151219,83200,28.0
EP000000510161,A Different World,Sitcom,20151219,113200,28.0
EP000000510167,A Different World,Sitcom,20151219,93000,30.0


CPU times: user 101 ms, sys: 4.26 ms, total: 105 ms
Wall time: 6.13 s


In [0]:
# test if theres more than one title per prog code
from pyspark.sql.functions import collect_set, size

# Group by prog_code and collect unique genres
title_check_df = daily_prog_df.groupBy("prog_code").agg(collect_set("title").alias("unique_title"))

# Filter prog_code groups with more than one unique genre
inconsistent_title_df = title_check_df.filter(size("unique_title") > 1)

# Show results
display(inconsistent_title_df)

prog_code,unique_title
EP000048400390,"List(Woodwright's Shop, The Woodwright's Shop)"
EP000048400392,"List(Woodwright's Shop, The Woodwright's Shop)"
EP000048400400,"List(Woodwright's Shop, The Woodwright's Shop)"
EP000048400405,"List(Woodwright's Shop, The Woodwright's Shop)"
EP002519530842,"List(Maravillas modernas, Maravillas Modernas)"
EP002519530845,"List(Maravillas modernas, Maravillas Modernas)"
EP004650510120,"List(Mundos perdidos, Mundos Perdidos)"
EP005633420087,"List(NBA Hardwood Classics, Hardwood Classics)"
EP005633420668,"List(NBA Hardwood Classics, Hardwood Classics)"
EP005633420727,"List(NBA Hardwood Classics, Hardwood Classics)"


In [0]:

# Ensures that each prog_code has a consistent title value based on the most frequent title.

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc, col

# Group by prog_code and title to get the count of each title for each prog_code
title_counts_df = daily_prog_df.groupBy("prog_code", "title").count()

# Define a window partitioned by prog_code and ordered by count descending
window_spec = Window.partitionBy("prog_code").orderBy(desc("count"))

# Add a row number within each partition to identify the most frequent title
ranked_title_df = title_counts_df.withColumn("rank", row_number().over(window_spec))

# Filter to keep only the most frequent title for each prog_code
most_frequent_title_df = ranked_title_df.filter(col("rank") == 1).select("prog_code", "title")

# Join the result back to the original DataFrame to tie-break the rows
daily_prog_df = daily_prog_df.drop("title").join(most_frequent_title_df, on="prog_code", how="left")

# Show the final DataFrame with resolved title
daily_prog_df.show()

+--------------+-----------+--------+--------+--------+-----------------+
|     prog_code|      genre|air_date|air_time|Duration|            title|
+--------------+-----------+--------+--------+--------+-----------------+
|EP000000250063|Crime drama|20151219|  180000|    60.0|   21 Jump Street|
|EP000000510180|     Sitcom|20151219|  103000|    30.0|A Different World|
|EP000000510180|     Sitcom|20151219|  133000|    30.0|A Different World|
|EP000001150004|     Sitcom|20151219|  093000|    30.0|The Addams Family|
|EP000000510179|     Sitcom|20151219|  100000|    30.0|A Different World|
|EP000000510179|     Sitcom|20151219|  130000|    30.0|A Different World|
|EP000001150034|     Sitcom|20151219|  103000|    30.0|The Addams Family|
|EP000001150023|     Sitcom|20151219|  100000|    30.0|The Addams Family|
|EP000000510159|     Sitcom|20151219|  080300|    29.0|A Different World|
|EP000000510159|     Sitcom|20151219|  110300|    29.0|A Different World|
|EP000000510161|     Sitcom|20151219| 

In [0]:
# run this cell, to test if theres more than one title per prog code
from pyspark.sql.functions import collect_set, size

# Group by prog_code and collect unique genres
title_check_df = daily_prog_df.groupBy("prog_code").agg(collect_set("title").alias("unique_title"))

# Filter prog_code groups with more than one unique genre
inconsistent_title_df = title_check_df.filter(size("unique_title") > 1)

# Show results
display(inconsistent_title_df)

prog_code,unique_title


In [0]:
# run this cell, to test if theres more than one genre list per prog code
from pyspark.sql.functions import collect_set, size

# Group by prog_code and collect unique genres
genre_check_df = daily_prog_df.groupBy("prog_code").agg(collect_set("genre").alias("unique_genres"))

# Filter prog_code groups with more than one unique genre
inconsistent_genres_df = genre_check_df.filter(size("unique_genres") > 1)

# Show results
display(inconsistent_genres_df)

prog_code,unique_genres
EP000008460008,"List(Action,Adventure,Drama, Historical drama)"
EP000021400021,"List(Crime drama,Western, Drama,Western,Crime,Action)"
EP000021400040,"List(Crime drama,Western, Drama,Western,Crime,Action)"
EP000021400046,"List(Crime drama,Western, Drama,Western,Crime,Action)"
EP000021400048,"List(Crime drama,Western, Drama,Western,Crime,Action)"
EP000021400053,"List(Crime drama,Western, Drama,Western,Crime,Action)"
EP000025260018,"List(Drama,Western,Action, Western)"
EP000025260030,"List(Drama,Western,Action, Western)"
EP000025260041,"List(Drama,Western,Action, Western)"
EP000025260047,"List(Drama,Western,Action, Western)"


In [0]:

# Ensures that each prog_code has a consistent genre value based on the most frequent genre.

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc, col

# Group by prog_code and genre to get the count of each genre for each prog_code
genre_counts_df = daily_prog_df.groupBy("prog_code", "genre").count()

# Define a window partitioned by prog_code and ordered by count descending
window_spec = Window.partitionBy("prog_code").orderBy(desc("count"))

# Add a row number within each partition to identify the most frequent genre
ranked_genres_df = genre_counts_df.withColumn("rank", row_number().over(window_spec))

# Filter to keep only the most frequent genre for each prog_code
most_frequent_genre_df = ranked_genres_df.filter(col("rank") == 1).select("prog_code", "genre")

# Join the result back to the original DataFrame to tie-break the rows
daily_prog_df = daily_prog_df.drop("genre").join(most_frequent_genre_df, on="prog_code", how="left")

# Show the final DataFrame with resolved genres
daily_prog_df.show()

+--------------+--------+--------+--------+--------------------+-----------+
|     prog_code|air_date|air_time|Duration|               title|      genre|
+--------------+--------+--------+--------+--------------------+-----------+
|EP000000250063|20151219|  180000|    60.0|      21 Jump Street|Crime drama|
|EP000000260037|20150110|  023000|    30.0|                 227|     Sitcom|
|EP000000260037|20150110|  053000|    30.0|                 227|     Sitcom|
|EP000000260111|20151029|  010000|    30.0|                 227|     Sitcom|
|EP000000260111|20151029|  040000|    30.0|                 227|     Sitcom|
|EP000000510011|20151029|  130000|    30.0|   A Different World|     Sitcom|
|EP000000510113|20150406|  213000|    30.0|   A Different World|     Sitcom|
|EP000000510180|20151219|  103000|    30.0|   A Different World|     Sitcom|
|EP000000510180|20151219|  133000|    30.0|   A Different World|     Sitcom|
|EP000000860051|20150406|  090000|    30.0|The Abbott & Cost...|     Sitcom|

In [0]:
#check if we have the same genres list for each prog code after sorting
from pyspark.sql.functions import collect_set, size

# Group by prog_code and collect unique genres
genre_check_df_new = daily_prog_df.groupBy("prog_code").agg(collect_set("genre").alias("unique_genres"))

# Filter prog_code groups with more than one unique genre
inconsistent_genres_df_new = genre_check_df_new.filter(size("unique_genres") > 1)

#great job eden

# Show results
display(inconsistent_genres_df_new)

prog_code,unique_genres


In [0]:
# Ensures that each title has a consistent genre value based on the most frequent genre.

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc, col

# Group by title and genre to get the count of each genre for each title
genre_counts_df = daily_prog_df.groupBy("title", "genre").count()

# Define a window partitioned by title and ordered by count descending
window_spec = Window.partitionBy("title").orderBy(desc("count"))

# Add a row number within each partition to identify the most frequent genre
ranked_genres_df = genre_counts_df.withColumn("rank", row_number().over(window_spec))

# Filter to keep only the most frequent genre for each title
most_frequent_genre_df = ranked_genres_df.filter(col("rank") == 1).select("title", "genre")

# Join the result back to the original DataFrame to tie-break the rows
daily_prog_df = daily_prog_df.drop("genre").join(most_frequent_genre_df, on="title", how="left")

# Show the final DataFrame with resolved genres
display(daily_prog_df)

title,prog_code,air_date,air_time,Duration,genre
The Guns of Will Sonnett,EP000019710042,20151008,123000,30.0,"Drama,Western"
Lone Ranger,EP000026560093,20151008,130000,30.0,"Western,Adventure"
Lone Ranger,EP000026560095,20151008,133000,30.0,"Western,Adventure"
Lone Ranger,EP000026560173,20150406,130000,30.0,"Western,Adventure"
Lone Ranger,EP000026560195,20151219,213000,30.0,"Western,Adventure"
Lone Ranger,EP000026560195,20151220,110000,30.0,"Western,Adventure"
Lone Ranger,EP000026560195,20151220,140000,30.0,"Western,Adventure"
Lone Ranger,EP000026560235,20150816,100000,30.0,"Western,Adventure"
The Partridge Family,EP000033140047,20150816,80000,30.0,Sitcom
The Partridge Family,EP000033140074,20150816,140000,30.0,Sitcom


In [0]:
#check every title has the same list genre 

from pyspark.sql.functions import collect_set, size

# Group by prog_code and collect unique genres
genre_by_title = daily_prog_df.groupBy("title").agg(collect_set("genre").alias("unique_genres"))

# Filter prog_code groups with more than one unique genre
genre_by_title_df = genre_by_title.filter(size("unique_genres") > 1)

# Show results
display(genre_by_title_df)

title,unique_genres


In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql.functions import to_date, col, date_format, count, expr


#fix date and add day of the week column
daily_prog_df = daily_prog_df.withColumn("air_date", to_date(col("air_date"), "yyyyMMdd")) \
    .withColumn("air_time", col("air_time").cast("int"))  \
    .withColumn("day", date_format('air_date', 'EEE'))

# # Define the window specification to partition by 'prog_code'
# windowSpec = Window.partitionBy('prog_code').orderBy('air_date').rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

# # Add a column 'title' based on the first value in each group to fill null values by the common value
# daily_prog_df = daily_prog_df.withColumn(
#     'title', 
#     f.first('title', ignorenulls=True).over(windowSpec)
# )




13194849
12892195


In [0]:
#drop unnecessary column
daily_prog_df = daily_prog_df.drop("duration")
daily_prog_df = daily_prog_df.dropna().dropDuplicates()

Daily program data after changes

In [0]:
daily_prog_df.printSchema()
daily_prog_df.show(10)

root
 |-- title: string (nullable = true)
 |-- prog_code: string (nullable = true)
 |-- air_date: date (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- day: string (nullable = true)

+--------------------+--------------+----------+--------+-------------+---+
|               title|     prog_code|  air_date|air_time|        genre|day|
+--------------------+--------------+----------+--------+-------------+---+
|The Guns of Will ...|EP000019710004|2015-12-15|  133000|Drama,Western|Tue|
|The Guns of Will ...|EP000019710004|2015-08-20|  123000|Drama,Western|Thu|
|The Guns of Will ...|EP000019710004|2015-10-16|  123000|Drama,Western|Fri|
|The Guns of Will ...|EP000019710011|2015-12-20|  133000|Drama,Western|Sun|
|The Guns of Will ...|EP000019710011|2015-08-25|  123000|Drama,Western|Tue|
|The Guns of Will ...|EP000019710011|2015-10-21|  123000|Drama,Western|Wed|
|The Guns of Will ...|EP000019710017|2015-10-26|  123000|Drama,Western|Mon|
|The 

# Sample of 10 Million viewing entries

In [0]:
# Sample of 10 Million viewing entries

dataPath = f"dbfs:/viewing_10M"
viewing10m_df = spark.read.format("csv")\
    .option("header","true")\
    .option("delimiter",",")\
    .schema(schemas_dict['viewing_full'])\
    .load(dataPath)

display(viewing10m_df.limit(200))
viewing10m_df.printSchema()
print(f'viewing10m_df contains {viewing10m_df.count()} rows!')

mso_code,device_id,event_date,event_time,station_num,prog_code
1540,000000033afa,20151101,33000,67375,EP020820940009
1540,00000004e4b6,20151101,93000,42599,SP003189620000
1540,00000004eb8f,20151101,91856,42642,EP000176170270
1540,00000004f1d6,20151101,90206,68827,EP007961190099
1540,00000004f3c0,20151101,160658,10178,MV000259670000
1540,000000051ca0,20151101,174949,32645,EP001786120664
1540,0000000a040a,20151101,220000,42642,EP019234000013
1540,0000000a0554,20151101,100213,11150,EP014707060127
1540,0000000a0554,20151101,142942,30754,SP003189690000
1540,0000000a595a,20151101,111641,10269,EP010300650088


root
 |-- mso_code: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- event_date: integer (nullable = true)
 |-- event_time: integer (nullable = true)
 |-- station_num: string (nullable = true)
 |-- prog_code: string (nullable = true)

viewing10m_df contains 10042340 rows!


In [0]:
#fix date type
viewing10m_df = viewing10m_df.withColumn("event_date", to_date(col("event_date"), "yyyyMMdd")) 

#drop columns that are not needed for the analysis
cols_to_drop_viewing10m = ['mso_code', 'station_num']
viewing10m_df = viewing10m_df.drop(*cols_to_drop_viewing10m)




In [0]:
#remove records with prog_code that appear in viewing but not daily_prog
all_program_aired = daily_prog_df.select("prog_code").distinct()
viewing10m_df = viewing10m_df.join(all_program_aired, "prog_code", "inner")

Viewing data after changes

In [0]:

viewing10m_df.printSchema()
viewing10m_df.show(10)

root
 |-- prog_code: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- event_date: date (nullable = true)
 |-- event_time: integer (nullable = true)

+--------------+------------+----------+----------+
|     prog_code|   device_id|event_date|event_time|
+--------------+------------+----------+----------+
|EP019203190027|000001013e19|2015-11-01|    213000|
|EP022138670017|000001130671|2015-11-01|    141748|
|EP012538160102|00000176441e|2015-11-01|     23000|
|EP013216350305|000001ad9907|2015-11-01|    154500|
|EP011583840016|000002ddaf56|2015-11-01|      1725|
|SH005705730000|000004bada08|2015-11-01|     60000|
|SH005705730000|0000058c5dc7|2015-11-01|     74128|
|EP021711670054|800000be033e|2015-11-01|     90000|
|EP013216350305|800003754a82|2015-11-01|    154500|
|SH005705730000|800003b41280|2015-11-01|     50000|
+--------------+------------+----------+----------+
only showing top 10 rows



# Reference data

In [0]:
%%time
# reference data is stored in parquet for your convinence.

ref_df = spark.read.parquet('dbfs:/refxml_new_parquet')

ref_df.printSchema()
print(f'ref_df contains {ref_df.count()} records!')
ref_df.show(10)


root
 |-- device_id: string (nullable = true)
 |-- dma: string (nullable = true)
 |-- dma_code: long (nullable = true)
 |-- household_id: long (nullable = true)
 |-- household_type: string (nullable = true)
 |-- system_type: string (nullable = true)
 |-- zipcode: long (nullable = true)

ref_df contains 1268071 records!
+------------+------+--------+------------+--------------+-----------+-------+
|   device_id|   dma|dma_code|household_id|household_type|system_type|zipcode|
+------------+------+--------+------------+--------------+-----------+-------+
|00000113498f|Toledo|     547|     1470605|        FWM-ID|          H|  43460|
|12bf0065bad0|Toledo|     547|     1492575|        FWM-ID|          H|  43460|
|000000797c1d|Toledo|     547|     1493317|        FWM-ID|          H|  43460|
|000002de361c|Toledo|     547|     1446566|        FWM-ID|          H|  43528|
|0000026360a2|Toledo|     547|     1467668|        FWM-ID|          H|  43528|
|00000071622f|Toledo|     547|     1519598|    

In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, count, dense_rank


#removing records with unknown dma
ref_df = ref_df.filter(col("dma") != "Unknown")


# drop columns that are not needed for the analysis
cols_to_drop_ref = ['household_type', 'system_type']
ref_df = ref_df.drop(*cols_to_drop_ref)


#remove duplicate device_id
ref_df = ref_df.dropDuplicates(['device_id'])

#drop records with multiple dma
windowHousehold = Window.partitionBy("household_id").orderBy("dma")
ref_df = ref_df.withColumn("dma_count", dense_rank().over(windowHousehold))
ref_df = ref_df.filter(f.col("dma_count") == 1).drop("dma_count")

#add column for number of household's devices
window_by_household_id = Window.partitionBy("household_id")
ref_df = ref_df.withColumn("num_of_devices", f.size(f.collect_set("device_id").over(window_by_household_id)))



In [0]:

#drop records with multiple zipcode

windowHousehold = Window.partitionBy("household_id")

# Count distinct zipcodes per household
ref_df = ref_df.withColumn("zipcode_count", f.approx_count_distinct("zipcode").over(windowHousehold))

# Keep only records from households with one unique zipcode
ref_df = ref_df.filter(f.col("zipcode_count") == 1).drop("zipcode_count")

#not neccessary as we are not using zipcode for analysis
ref_df = ref_df.drop("zipcode")




In [0]:
#remove records with houshold_id not in demo_df
all_households = demo_df.select("household_id").distinct()
ref_df = ref_df.join(all_households, "household_id", "inner")


In [0]:
#remove records with device_id that appear in viewing but not ref_df
all_devices = ref_df.select("device_id").distinct()
viewing10m_df = viewing10m_df.join(all_devices, "device_id", "inner")


Reference data after changes

In [0]:
ref_df.printSchema()
ref_df.show(10)

root
 |-- household_id: long (nullable = true)
 |-- device_id: string (nullable = true)
 |-- dma: string (nullable = true)
 |-- dma_code: long (nullable = true)
 |-- num_of_devices: integer (nullable = false)

+------------+------------+--------------------+--------+--------------+
|household_id|   device_id|                 dma|dma_code|num_of_devices|
+------------+------------+--------------------+--------+--------------+
|          26|0014f8ba5982|        Philadelphia|     504|             2|
|          26|001ac32a3365|        Philadelphia|     504|             2|
|         126|001ac32f2530|            New York|     501|             2|
|         126|001bd74a0d05|            New York|     501|             2|
|         130|44e08edd7bf8|            New York|     501|             2|
|         130|44e08ed47aed|            New York|     501|             2|
|         228|0021bea6d3a7|Wilkes Barre-Scra...|     577|             4|
|         228|0021be7b40f1|Wilkes Barre-Scra...|     577|   

# Condition of malicius records


#cond 1

In [0]:
#cond 1
from pyspark.sql.window import Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, count, lit

# Define the window specification by device_id
window_by_device_id = Window.partitionBy("device_id")

# Calculate the total views per device
device_avg = viewing10m_df \
    .withColumn("total_views", f.count("*").over(window_by_device_id)) \
    .withColumn("unique_dates", f.size(f.collect_set("event_date").over(window_by_device_id))).withColumn("avg", col("total_views")/col("unique_dates")).filter(col("avg") > 5).select("device_id")

prog_cond_1 = viewing10m_df.join(device_avg, "device_id", "inner").select("prog_code").distinct().withColumn("condition",  lit(1))


records_cond_1 = daily_prog_df.join(prog_cond_1, "prog_code", "inner")
                                                




#cond 2

In [0]:
from pyspark.sql.functions import col, lower, when, lit

# cond 2

device_cond_2 = ref_df.withColumn("with_z", when(lower(col("dma")).contains('z'), 1).otherwise(0)).filter(col("with_z") == 1).select("device_id").distinct()
prog_cond_2 = device_cond_2.join(viewing10m_df, "device_id", "inner").select("prog_code").distinct().withColumn("condition", lit(2))

records_cond_2 = daily_prog_df.join(prog_cond_2, "prog_code", "inner")



#cond 3

In [0]:
#cond 3

households_3 = demo_df.filter((demo_df.num_adults < 3 ) & (demo_df.net_worth > 8)).select("household_id")
device_cond_3 = ref_df.join(households_3, "household_id", "inner").select("device_id").distinct()

prog_cond_3 = viewing10m_df.join(device_cond_3, "device_id", "inner").select("prog_code").distinct().withColumn("condition",  lit(3))


records_cond_3 = daily_prog_df.join(prog_cond_3, "prog_code", "inner")



#cond 4

In [0]:
#cond 4
from pyspark.sql.functions import split, col, when
from pyspark.sql.functions import date_format, to_date

prog_cond_4 = daily_prog_df.drop("duration", "genre", "title") \
    .filter(col("day").isin(["Fri", "Sat"])) \
    .withColumn("friday", when(col("day") == "Fri", 1).otherwise(0)) \
    .withColumn("saturday", when(col("day") == "Sat", 1).otherwise(0)) \
    .withColumn("friday", when((col("friday") == 1) & (col("air_time") >= 180000), 1).otherwise(0)) \
    .withColumn("saturday", when((col("saturday") == 1) & (col("air_time") <= 190000), 1).otherwise(0)) \
    .filter((col("friday") == 1) | (col("saturday") == 1)) \
    .select("prog_code").distinct().withColumn("condition",  lit(4))


records_cond_4 = daily_prog_df.join(prog_cond_4, "prog_code", "inner")



#cond 5

In [0]:
#cond 5 
from pyspark.sql.functions import split, col,  when

household_5 = demo_df.filter(col("household_size") >= 8).select("household_id")

devices_5 = ref_df.join(household_5, "household_id", "inner").select("device_id")

prog_cond_5 = viewing10m_df.join(devices_5, "device_id", "inner").select("prog_code").distinct().withColumn("condition",  lit(5))

records_cond_5 = daily_prog_df.join(prog_cond_5, "prog_code", "inner")



#cond 6

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import col, lit, avg, count


# Define the window specification
window_by_household = Window.partitionBy("household_id")

# Calculate the average income
average_income = demo_df.agg(
    avg("income_int").alias("average_income")
).select("average_income").collect()[0][0]

# Filter households with income less than the average
household_ids_below_avg = demo_df.filter(col("income_int") < average_income) \
    .select("household_id") \
    .distinct() \


# Filter devices based on the household_id list and count condition
devices_6 = ref_df.withColumn("row_count", count("*").over(window_by_household)) \
    .filter(col("row_count") > 3) \
    .join(household_ids_below_avg, "household_id", "inner") \
    .select("device_id") \
    .distinct()

# Join with viewing data and add condition column
prog_cond_6 = viewing10m_df.join(devices_6, "device_id", "inner") \
    .select("prog_code") \
    .distinct() \
    .withColumn("condition", lit(6))

# Join with daily program data
records_cond_6 = daily_prog_df.join(prog_cond_6, "prog_code", "inner")

#cond 7

In [0]:
from pyspark.sql.functions import col, split, explode, lit

# Define your bad genres
bad_genres = ['Hydroplane racing', 'Biathlon', 'Snowmobile', 'Community', 'Agriculture', 'Music']

# Create a DataFrame from the bad_genres list for joining
bad_genres_df = spark.createDataFrame(bad_genres, StringType()).toDF("genre")

#find programs with bad genres
prog_cond_7 = daily_prog_df.withColumn("genre", explode(split(col("genre"), ","))) \
    .dropDuplicates() \
    .join(bad_genres_df, "genre", "inner") \
        .select("prog_code").distinct().withColumn("condition", lit(7))

records_cond_7 = daily_prog_df.join(prog_cond_7, "prog_code", "inner")

Number of records for each condition

In [0]:

# Print the number of records with condition 1
print(f"Number of records that meet condition 1: {records_cond_1.count()}")
# Print the number of records with condition 2
print(f"Number of records that meet condition 2: {records_cond_2.count()}")
# Print the number of records with condition 3
print(f"Number of records that meet condition 3: {records_cond_3.count()}")
# Print the number of records with condition 4
print(f"Number of records that meet condition 4: {records_cond_4.count()}")
# Print the number of records with condition 5
print(f"Number of records that meet condition 5: {records_cond_5.count()}")
# Print the number of records with condition 6
print(f"Number of records that meet condition 6: {records_cond_6.count()}")
# Print the number of records with condition 7
print(f"Number of records that meet condition 7: {records_cond_7.count()}")

Number of records that meet condition 1: 61612
Number of records that meet condition 2: 4234440
Number of records that meet condition 3: 3760167
Number of records that meet condition 4: 6863067
Number of records that meet condition 5: 2764214
Number of records that meet condition 6: 5472340
Number of records that meet condition 7: 831385


In [0]:
print(prog_cond_1.count())
print(prog_cond_2.count())
print(prog_cond_3.count())
print(prog_cond_4.count())
print(prog_cond_5.count())
print(prog_cond_6.count())
print(prog_cond_7.count())

205
130695
65183
182177
34848
121657
20783


#Find Malicious Records 


In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import col,  count


# Define the window specification
window_by_prog = Window.partitionBy("prog_code")

# Union all DataFrames
combined_df = prog_cond_1.union(prog_cond_2).union(prog_cond_3).union(prog_cond_4).union(prog_cond_5).union(prog_cond_6).union(prog_cond_7)

# Group by 'prog_code' and count occurrences
malicious_progs = combined_df.withColumn("num_of_cond", count("*").over(window_by_prog)).filter(col("num_of_cond") >= 4).select("prog_code").distinct()

malicious_record = daily_prog_df.join(malicious_progs, "prog_code", "inner")
top_50_prog_code = malicious_progs.orderBy(col("prog_code").desc()).limit(50)





In [0]:
malicious_record.write.mode("overwrite").parquet("dbfs:/project1_part1_malicious_206775181_206750192_315335315.parquet")

Top 50 malicious prog_code

In [0]:
display(top_50_prog_code)


prog_code
SP003272080000
SP003271470000
SP003271440000
SP003271100000
SP003266610000
SP003258260000
SP003258250000
SP003257780000
SP003257260000
SP003256900000
