In [1]:
import pandas as pd
import warnings

# importing helper functions from the manually created util file
from util import df_first_look, file_read, sessions_final_df_cleaning

# adding options to visualise all rows and columns in the project
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# ignoring warnings
warnings.filterwarnings("ignore")

# adding useful magic functions
%load_ext lab_black
%matplotlib inline
%config Completer.use_jedi = False

In [8]:
# Load every input table with a single call to file_read() (imported from the
# project's util module) and unpack the five returned objects in this order.
# NOTE(review): file_read() must keep returning exactly five values in this
# order; the "_df" names suggest pandas DataFrames — confirm in util.py.
(
    sessions_final_df,
    audience_df,
    tv_planning_df,
    tv_spots_df,
    tv_visits_match_df,
) = file_read()

### Descriptive Statistics to be Added For Each Table

### Target Channels 

According to the assignment rules, we need to include a specific list of channels:

● Search engines (SEO + SEA, Brand)

● Direct Type-in Traffic

From the list of unique channels, I have selected the following ones to be included in the project.

In [9]:
# Channels handed to the cleaning helper below.
# NOTE(review): presumably the helper keeps only sessions from these channels —
# confirm against sessions_final_df_cleaning in util.py.
channel_list = [
    "Direct App",
    "Bing CPC Brand",
    "Direct",
    "Organic Search Brand",
    "Google CPC Brand",
    "Google CPC Non Brand",
    "Organic Search Non Brand",
    "Bing CPC Non Brand",
    "Apple Search Ads",
    "Google Paid App Non Brand",
    "Display Brand",
]

# Row count before the helper runs, reported next to the result for comparison.
initial_session_count = len(sessions_final_df)
print("Initial sessions_final_df table count:", initial_session_count)
print("")

# The helper receives the current frame plus the channel list; its result
# replaces sessions_final_df.
sessions_final_df = sessions_final_df_cleaning(sessions_final_df, channel_list)

filtered_session_count = len(sessions_final_df)
print(
    "Final sessions_final_df table count after channel filtering:",
    filtered_session_count,
)

Initial sessions_final_df table count: 1058463

Final sessions_final_df table count after channel filtering: 628032


### TV Planning Date Filtering

In the sessions data set we focus only on 15 May 2020. Accordingly, we need to filter all other days out of the TV Planning data set.

In [11]:
# Keep only the TV planning rows whose block starts on 2020-05-15.
# The start timestamp is rendered as a "YYYY-MM-DD" string and compared to that day.
block_start_day = tv_planning_df["block_start_time"].dt.strftime("%Y-%m-%d")
is_target_day = block_start_day == "2020-05-15"

# .copy() makes the filtered frame independent of the frame it was sliced from.
tv_planning_df = tv_planning_df[is_target_day].copy()

### Joining TV Planning and TV Spots Data Frames

* The TV Planning data frame is our main data set; it provides the advertisement block hours.
* The TV Spots data frame contains information about each advertisement.

We need to join these tables and calculate when our advertisements start and end within each show's advertisement block.

In [None]:
# Left join on "spot_id": every row of tv_planning_df is kept and extended with
# the matching columns from tv_spots_df; rows without a match get missing values.
# NOTE(review): if spot_id is not unique in tv_spots_df, planning rows are duplicated here.
tv_show_spot_df = tv_planning_df.merge(tv_spots_df, on="spot_id", how="left")

In [None]:
# Parse the leading number out of the textual "duration" column.
# Only the first space-separated token is needed, so split once (n=1) and pick it
# with .str[0] instead of expanding every token into a temporary DataFrame.
# NOTE(review): presumably values look like "<number> <unit>" — confirm the unit
# is seconds, because a later cell converts duration_numeric with unit="s".
tv_show_spot_df["duration_numeric"] = pd.to_numeric(
    tv_show_spot_df["duration"].str.split(" ", n=1).str[0]
)

In [None]:
# Offset of each spot from the start of its block, converted to a Timedelta.
# NOTE(review): block_position is interpreted as a number of seconds here —
# confirm it is not an ordinal slot number within the block.
block_offset = pd.to_timedelta(tv_show_spot_df["block_position"], unit="s")
tv_show_spot_df["block_position_second"] = block_offset

# Ad start time = block start time shifted by that offset.
tv_show_spot_df["ads_start_time"] = tv_show_spot_df["block_start_time"] + block_offset

In [None]:
# Length of each ad as a Timedelta, built from duration_numeric read as seconds.
ad_length = pd.to_timedelta(tv_show_spot_df["duration_numeric"], unit="s")
tv_show_spot_df["duration_numeric_second"] = ad_length

# Ad end time = ad start time plus the ad's length.
tv_show_spot_df["ads_end_time"] = tv_show_spot_df["ads_start_time"] + ad_length

In [None]:
# Preview only the first rows: with display.max_rows raised to 500 in the setup
# cell, a bare frame would render up to 500 rows inline.
tv_show_spot_df.head(10)