In [1]:
import pandas as pd

from util import df_first_look

import warnings

# Raise pandas' display limits (rows, columns, total width) for rendered frames.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Presumably the Black code-formatter extension for JupyterLab — TODO confirm it
# is installed in the environment this notebook is run in.
%load_ext lab_black

%matplotlib inline
# NOTE(review): this ignores every warning category for the rest of the session;
# consider narrowing the filter so library deprecation warnings stay visible.
warnings.filterwarnings("ignore")
%config Completer.use_jedi = False

In [2]:
def file_read(
    sessions_path="case_may_2020_sessions_final.csv",
    tv_data_path="case_may_2020_TV_data.xlsx",
):
    """Load the sessions CSV and the four TV worksheets, dropping blank rows.

    Parameters
    ----------
    sessions_path : str or path-like, optional
        CSV file that is read into ``minute_df``.
    tv_data_path : str or path-like, optional
        Excel workbook containing the "Audience", "TV Planning", "TV spots"
        and "TV - visits match" sheets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(minute_df, audience_df, tv_planning_df, tv_spots_df,
        tv_visits_match_df)``, each with rows removed in which every column
        is missing.
    """
    minute_df = pd.read_csv(sessions_path)

    # Open the workbook once for all sheets; the context manager closes the
    # underlying file handle even if one of the sheets fails to parse.
    with pd.ExcelFile(tv_data_path) as tv_workbook:
        # Only the two named columns are read from the "Audience" sheet.
        audience_df = tv_workbook.parse("Audience", usecols=["tv_show", "reach"])
        tv_planning_df = tv_workbook.parse("TV Planning")
        tv_spots_df = tv_workbook.parse("TV spots")
        tv_visits_match_df = tv_workbook.parse("TV - visits match")

    loaded_frames = (
        minute_df,
        audience_df,
        tv_planning_df,
        tv_spots_df,
        tv_visits_match_df,
    )
    # how="all" removes only rows that are entirely empty; partial rows stay.
    return tuple(frame.dropna(axis=0, how="all") for frame in loaded_frames)

In [3]:
# Load every raw table once; the following cells work on these notebook-level frames.
(
    minute_df,
    audience_df,
    tv_planning_df,
    tv_spots_df,
    tv_visits_match_df,
) = file_read()

In [4]:
# Channels whose sessions are kept; rows from any other channel are dropped.
RELEVANT_CHANNELS = (
    "Direct App",
    "Bing CPC Brand",
    "Direct",
    "Organic Search Brand",
    "Google CPC Brand",
    "Google CPC Non Brand",
    "Organic Search Non Brand",
    "Bing CPC Non Brand",
    "Apple Search Ads",
    "Google Paid App Non Brand",
    "Display Brand",
)


def minute_df_cleaning(sessions_df=None, channels=RELEVANT_CHANNELS):
    """Keep sessions from the selected channels and drop the stray index column.

    Parameters
    ----------
    sessions_df : pandas.DataFrame, optional
        Frame with a ``"channel"`` column. When omitted, the notebook-level
        ``minute_df`` is used, which is what the original zero-argument call
        relied on.
    channels : iterable of str, optional
        Channel names to keep. Defaults to ``RELEVANT_CHANNELS``.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only rows whose ``"channel"`` is in
        ``channels``, without the ``"Unnamed: 0"`` column. The input frame is
        not modified.
    """
    if sessions_df is None:
        # Backward-compatible default: read the frame loaded earlier in the notebook.
        sessions_df = minute_df

    in_selected_channel = sessions_df["channel"].isin(list(channels))

    # errors="ignore" keeps this safe to call on a frame that has already been
    # cleaned (e.g. when the calling cell is executed a second time), where the
    # "Unnamed: 0" column no longer exists.
    return (
        sessions_df.loc[in_selected_channel]
        .drop(columns=["Unnamed: 0"], errors="ignore")
        .copy()
    )

In [5]:
# NOTE(review): this rebinds `minute_df` to its cleaned version, so the raw
# sessions are only recoverable by re-running the loading cell.
minute_df = minute_df_cleaning()

In [6]:
# Restrict the TV plan to blocks whose start falls on 15 May 2020.
tv_planning_df = tv_planning_df.loc[
    lambda plan: plan["block_start_time"].dt.normalize()
    == pd.Timestamp("2020-05-15")
].copy()

In [7]:
# Attach the spot attributes to each planned block; the left join keeps every planning row.
tv_show_spot_df = tv_planning_df.merge(tv_spots_df, on="spot_id", how="left")

In [8]:
# Parse the number in front of the first space of `duration` (e.g. "30 secs" -> 30).
# Taking the first token with `.str[0]` works row by row, so it avoids building
# an expanded DataFrame and does not depend on a column labelled 0 existing,
# which is not the case when the frame has no rows.
tv_show_spot_df["duration_numeric"] = pd.to_numeric(
    tv_show_spot_df["duration"].str.split(" ").str[0]
)

In [9]:
# Convert the in-block offset (seconds) to a Timedelta, then add it to the
# block start to get the moment the spot begins.
tv_show_spot_df = tv_show_spot_df.assign(
    block_position_second=lambda spots: pd.to_timedelta(
        spots["block_position"], unit="s"
    ),
    ads_start_time=lambda spots: (
        spots["block_start_time"] + spots["block_position_second"]
    ),
)

In [10]:
# Convert the spot length (seconds) to a Timedelta, then add it to the spot
# start to get the moment the spot ends.
tv_show_spot_df = tv_show_spot_df.assign(
    duration_numeric_second=lambda spots: pd.to_timedelta(
        spots["duration_numeric"], unit="s"
    ),
    ads_end_time=lambda spots: (
        spots["ads_start_time"] + spots["duration_numeric_second"]
    ),
)

In [11]:
# Display the merged planning/spot table including the derived start and end times.
tv_show_spot_df

Unnamed: 0,tvp_id,block_start_time,tv_show,block_size,block_position,spot_id,spot_name,duration,duration_numeric,block_position_second,ads_start_time,duration_numeric_second,ads_end_time
0,10001127236681063068_1553644644,2020-05-15 18:05:00,First Dates - Ein Tisch für zwei,420.0,345.0,4234.0,springsummer_branding,30 secs,30,0 days 00:05:45,2020-05-15 18:10:45,0 days 00:00:30,2020-05-15 18:11:15
1,10001127236681063068_1553644645,2020-05-15 18:10:00,SOKO Kitzbühel,300.0,270.0,6232.0,springsummer_outfitfashionA,15 secs,15,0 days 00:04:30,2020-05-15 18:14:30,0 days 00:00:15,2020-05-15 18:14:45
2,10001127236681063068_1553644646,2020-05-15 19:35:00,Alles was zählt,480.0,60.0,8331.0,springsummer_saleB,7 secs,7,0 days 00:01:00,2020-05-15 19:36:00,0 days 00:00:07,2020-05-15 19:36:07
3,10001127236681063068_15536446461,2020-05-15 19:40:00,Comeback oder weg?,300.0,180.0,7224.0,springsummer_saleA,7 secs,7,0 days 00:03:00,2020-05-15 19:43:00,0 days 00:00:07,2020-05-15 19:43:07
4,10001127236681063068_1553644647,2020-05-15 20:15:00,Jack Reacher,60.0,15.0,6232.0,springsummer_outfitfashionA,15 secs,15,0 days 00:00:15,2020-05-15 20:15:15,0 days 00:00:15,2020-05-15 20:15:30
