In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import glob
import os
import re
import time
from datetime import datetime, timedelta
from itertools import chain

import numpy as np
import pandas as pd
import xlsxwriter

import sys
sys.path.append('../utils')
from SharePoint import *
from hlpr import *
from static import *
from format_qa import *
from qa_by_media import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Media channel whose settings are read from `media_args`.
media = 'digital'

# SharePoint folder settings for the selected channel.
(SHAREPOINT_DATA_PATH,
 SHAREPOINT_MAPPING_PATH,
 SHAREPOINT_QA_PATH,
 SHAREPOINT_FLAT_PATH) = (
    media_args[media]['sharepoint'][key]
    for key in ('data', 'mapping', 'qa', 'flat')
)

# Redshift query / metric-mapping settings for the selected channel.
(REDSHIFT_METRICS_QUERY,
 REDSHIFT_METRICS_FILENAME,
 REDSHIFT_METRICS_SHEETNAME,
 REDSHIFT_QA_QUERY) = (
    media_args[media]['redshift'][key]
    for key in ('all_metrics', 'metric_filename', 'metric_sheetname', 'qa_qry')
)

**Connect to SharePoint**

In [3]:
# Open a SharePoint session for the site named below.
app = SharePoint('AmericanExpressUSGABM')

# download data
# NOTE(review): download_files/archive_files are not told which folder to use,
# so they presumably act on whatever list_contents listed last — confirm in
# the SharePoint helper before reordering these calls.
app.list_contents(SHAREPOINT_DATA_PATH)
app.download_files(DATA_PATH)
app.archive_files()

# download floodlight mapping
app.list_contents(SHAREPOINT_MAPPING_PATH)
app.download_files(ASSETS_PATH)

connected to: https://interpublic.sharepoint.com/sites/AmericanExpressUSGABM

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Digital/01Sandbox/DCMPull/DCMReport

downloading data to: C:\Users\carmelo.urena\Documents\Main\OneDrive_BAK\Amex\DigitalQA\SharePoint\sharepoint_api\data

archiving data in SharePoint

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Digital/02Mapping

downloading data to: C:\Users\carmelo.urena\Documents\Main\OneDrive_BAK\Amex\DigitalQA\SharePoint\sharepoint_api\assets



**Import Metrics**

In [4]:
# import metrics
def _id_to_str(value):
    """Render an ID as text without a float suffix.

    A spreadsheet column that contains blanks is read as float, so an ID such
    as 1234567 would otherwise become '1234567.0' and never match the text IDs
    it is compared against.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Locate the metric-mapping workbook and fail with a clear message when it is absent.
metric_files = glob.glob(os.path.join(ASSETS_PATH, f"*{REDSHIFT_METRICS_FILENAME}*"))
if not metric_files:
    raise FileNotFoundError(
        f"no file matching '*{REDSHIFT_METRICS_FILENAME}*' found in {ASSETS_PATH}"
    )
pth = metric_files[0]

# NOTE(review): the sheet name is hard-coded although REDSHIFT_METRICS_SHEETNAME
# exists in the config cell — confirm which one is intended.
df_metrics = pd.read_excel(pth, sheet_name='ConvMap')

# IDs are kept as strings because they are matched against text columns later on.
campaign_ids_list = [_id_to_str(c) for c in df_metrics['CampaignID'].dropna().unique()]
activity_list = [_id_to_str(a) for a in df_metrics['ActivityID'].dropna().unique()]
core_metrics = ['Impressions', 'Clicks', 'Media Cost', 'Video Plays', 'Video Views', 'Video Completions']

**Import Raw Data**

In [5]:
# import digital data
list_of_dfs = []

files = glob.glob(os.path.join(DATA_PATH, '*.csv'))
if not files:
    # pd.concat([]) would otherwise fail later with an unhelpful message
    raise FileNotFoundError(f"no CSV reports found in {DATA_PATH}")

for f in files:
    # NOTE(review): presumably the 10 skipped lines are the report preamble and
    # the dropped last row is a totals line — confirm against a sample export.
    df = pd.read_csv(f, skiprows=10).iloc[:-1, :]

    # filter campaigns; .copy() so the Date conversion writes to an independent frame
    df = df[df['Campaign ID'].isin(campaign_ids_list)].copy()
    df['Date'] = pd.to_datetime(df['Date'])

    # core metrics: rows without an activity, minus the last two columns
    df_core = df[df['Activity ID'] == '(not set)'].iloc[:, :-2]
    id_vars = [c for c in df_core.columns if c not in core_metrics]
    df_core = df_core.melt(id_vars=id_vars)
    df_core['metric'] = df_core['variable']

    # activity: rows for mapped activities, without the core metric columns
    df_act = df[df['Activity ID'].isin(activity_list)].drop(columns=core_metrics)

    # create floodlight metrics: every column except the last two identifies the row
    id_vars = list(df_act.columns[:-2])
    df_act = df_act.melt(id_vars=id_vars)
    if df_act.empty:
        # no matching activity rows in this file (or no activities mapped at all)
        df_act['metric'] = None  # dummy
    else:
        # vectorized concatenation instead of a row-wise apply
        df_act['metric'] = df_act['Activity Group'] + ' : ' + df_act['Activity'] + ': ' + df_act['variable']

    # combine core and activity
    df_combined = pd.concat([df_core, df_act], sort=False)

    # drop activity columns; their content now lives in `metric`
    drop_activity = [c for c in df_combined.columns if 'activity' in c.lower()]
    df_combined = df_combined.drop(columns=drop_activity)

    list_of_dfs.append(df_combined)

# combine digital data
df_ui = pd.concat(list_of_dfs)

**Get Parameters for DB**

In [6]:
# params
def _sql_in_list(values):
    """Join values for splicing between the quotes of a SQL IN ('...') list.

    Embedded single quotes are doubled so a value such as "Men's" cannot end
    the string literal early. Assumes the query template wraps the placeholder
    in single quotes — confirm against the template.
    """
    return "','".join(str(v).replace("'", "''") for v in values)


db_campaign_id = _sql_in_list(df_ui['Campaign ID'].map(str).unique())
db_placement_id = _sql_in_list(df_ui['Placement ID'].map(str).unique())

# date window covered by the UI export
db_start_date = df_ui['Date'].min().strftime('%Y-%m-%d')
db_end_date = df_ui['Date'].max().strftime('%Y-%m-%d')

# null metric names cannot be queried and would make the join raise
db_metrics = _sql_in_list(df_ui['metric'].dropna().unique())

**DB Query**

In [7]:
# get data from redshift
qry_txt = get_qry_text(REDSHIFT_QA_QUERY)

# Placeholder tokens in the query template and the text spliced in for each.
# Substitution is plain text replacement, applied in the order listed.
qry_params = {
    'load_metrics': db_metrics,
    'load_campaign_id': db_campaign_id,
    'load_placement_id': db_placement_id,
    'load_start_date': db_start_date,
    'load_end_date': db_end_date,
}
for placeholder, replacement in qry_params.items():
    qry_txt = qry_txt.replace(placeholder, replacement)

# `time` is imported explicitly in the imports cell rather than relied on to
# leak in through a wildcard import.
time.sleep(1)
df_qry_raw = run_qry(qry_txt)

In [8]:
# Work on a copy so the raw query result stays available for re-runs.
df_qry = df_qry_raw.copy()

# Split on the `source` column. Each subset is an explicit copy so that later
# cells can add or overwrite columns on it without chained-assignment ambiguity.
is_override = df_qry['source'] == 'override'
df_qry_dcm = df_qry[~is_override].copy()  # contains dcm & cadreon
df_qry_ss = df_qry[is_override].copy()

In [9]:
# df_temp = df_qry_dcm[df_qry_dcm['campaign_name']=='USA_NAT_PRO_GA_EN_NON_GAB_21_Q1_Programmatic_Enterprise_Dine Small']
# df_temp = df_temp[df_temp['source']=='dcm']
# df_temp[df_temp['metric']=='Media Cost'].to_excel('dcm_pro.xlsx', index=False)

In [23]:
# Preview the override rows; head() keeps the rendered output bounded.
df_qry_ss.head(10)

Unnamed: 0,week,campaign_name,campaign_id,placement_name,placement_id,site_name,site_id,source,metric,value
3,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295424116,Nextdoor,6285437,override,Media Cost,2537.2799
80,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295288296,Nextdoor,6285437,override,Video Completions,28865.0
87,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295288296,Nextdoor,6285437,override,Clicks,412.0
105,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295420585,Nextdoor,6285437,override,Clicks,530.9999
181,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295281027,Nextdoor,6285437,override,Clicks,1151.0
184,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295281027,Nextdoor,6285437,override,Video Completions,62752.9999
189,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295288296,Nextdoor,6285437,override,Video Plays,83570.0001
385,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295288296,Nextdoor,6285437,override,Impressions,83577.9999
445,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295284780,Nextdoor,6285437,override,Impressions,362362.0001
482,2021-02-15,USA_NAT_SOC_GA_EN_NON_GAB_21_Q1_Social_Enterpr...,25341126,GABM_GAB_Dine Small_na_PLC_BRD_AWR_REP_NextDoo...,295284780,Nextdoor,6285437,override,Media Cost,5073.0598


**Combine UI and DB**

In [10]:
# align ui column names
ui_col_rename = {
    'Date': 'week',
    'Campaign': 'campaign_name',
    'Campaign ID': 'campaign_id',
    'Placement': 'placement_name',
    'Placement ID': 'placement_id',
    'Site (DCM)': 'site_name',
    'Site ID (DCM)': 'site_id',
    'metric': 'metric',
    'value': 'value'
}

# filter cols in UI data, rename them and tag the rows as coming from the UI.
# Everything returns a new frame, so df_ui itself is left untouched.
df_ui_fltr = (
    df_ui[list(ui_col_rename)]
    .rename(columns=ui_col_rename)
    .assign(source='dcm')
)

# combine: database rows are relabelled 'redshift' on a copy, so df_qry_dcm
# keeps its original `source` values and this cell can be re-run safely.
df_qa = pd.concat([df_qry_dcm.assign(source='redshift'), df_ui_fltr], sort=False)

In [11]:
# normalize data
# every column whose name contains '_id' is coerced to int so both sources compare equal
id_cols = [c for c in df_qa.columns if '_id' in c]
for c in id_cols:
    df_qa[c] = df_qa[c].map(int)

# snap each date back to the Monday of its week (weekday 0 = Monday);
# vectorized, and missing dates stay NaT instead of raising
df_qa['week'] = pd.to_datetime(df_qa['week'])
df_qa['week'] = df_qa['week'] - pd.to_timedelta(df_qa['week'].dt.weekday, unit='D')

# remove trailing spaces
for c in ['campaign_name', 'placement_name', 'site_name', 'metric']:
    df_qa[c] = df_qa[c].str.rstrip()

**QA Pivots**

In [12]:
# Run metadata: refresh time, queried date window and the exact SQL that was sent.
db_params_dict = dict(
    last_updated=datetime.now(),
    start_date=db_start_date,
    end_date=db_end_date,
    sql_qry=qry_txt,
)

# One row per parameter, as a two-column (variable, value) table.
details = pd.DataFrame.from_dict(db_params_dict, orient='index').reset_index()
details.columns = ['variable', 'value']

# Collection of named frames; starts with the metadata and the combined raw data.
pvts_dict = {
    'details': details,
    'raw_data': df_qa,
}

In [13]:
# Distinct campaign/placement pairs seen in the rows tagged with source 'dcm'.
placement_cols = ['campaign_name', 'campaign_id', 'placement_name', 'placement_id']
ui_placements = df_qa.loc[df_qa['source'] == 'dcm', placement_cols].drop_duplicates()
pvts_dict['placement_details'] = ui_placements

In [14]:
# QA views: for each view, the pivot index columns and how many leading
# dimension columns to keep (here always the full index).
# A separate 'campaign_site' view is not defined: it would use the same index
# and cutoff as 'campaign'.
view_args = {
    view: {'index': dims, 'dim_cutoff': len(dims)}
    for view, dims in (
        ('campaign', ['campaign_name', 'site_name']),
        ('placement', ['campaign_name', 'placement_id', 'site_name']),
        ('week', ['campaign_name', 'week', 'site_name']),
    )
}

In [15]:
# df_qa['metric'].unique()

In [16]:
# Per-source parameters; 'core_metrics' lists the metrics in the order given here.
digital_param_fieds = dict(
    dcm=dict(
        core_metrics=[
            'Clicks',
            'Media Cost',
            'Impressions',
            'Video Plays',
            'Video Views',
            'Video Completions',
        ],
    ),
)

In [17]:
def _metric_comparison(pvt, metric):
    """Build the side-by-side comparison columns for one metric.

    `pvt` is a pivot whose columns are a (metric, source) MultiIndex. Returns a
    frame with one '<metric>_<source>' column per source plus '<metric>_%_diff'
    (second source column relative to the first), or None when fewer than two
    sources report the metric and there is nothing to compare.
    """
    cmp_df = pvt[metric].copy()
    cmp_df.columns = [f"{metric}_{src}" for src in cmp_df.columns]
    if cmp_df.shape[1] < 2:
        return None

    first_col, second_col = cmp_df.columns[:2]

    # A value missing on one side only is treated as 0 so the gap shows up in
    # the diff; rows missing on both sides stay NaN.
    both_missing = cmp_df[first_col].isna() & cmp_df[second_col].isna()
    for col in (first_col, second_col):
        cmp_df[col] = cmp_df[col].fillna(0).mask(both_missing)

    pct_diff = cmp_df[second_col] / cmp_df[first_col] - 1
    # a non-zero value over 0 gives +inf; report it as 1 (i.e. +100%)
    cmp_df[f"{metric}_%_diff"] = pct_diff.mask(pct_diff == np.inf, 1)
    return cmp_df


# core metrics are shown first, in the configured order
core_first = digital_param_fieds['dcm']['core_metrics']

for k, view in view_args.items():
    index_cols = view['index']
    pvt = df_qa.pivot_table(index=index_cols, columns=['metric', 'source'], values='value', aggfunc='sum')

    # Metric names are read from the pivot itself, before reset_index adds the
    # dimension names to the column levels. Core metrics absent from the data
    # are skipped instead of raising KeyError.
    available = list(pvt.columns.get_level_values(0).unique())
    metrics = [m for m in core_first if m in available] + [m for m in available if m not in core_first]

    comparisons = [c for c in (_metric_comparison(pvt, m) for m in metrics) if c is not None]
    if comparisons:
        df_pvt_qa = pd.concat(comparisons, axis=1).reset_index()
    else:
        # nothing comparable across sources: keep just the dimension columns
        df_pvt_qa = pvt.index.to_frame(index=False)

    # keep only the first `dim_cutoff` dimension columns
    df_pvt_qa = df_pvt_qa.drop(columns=index_cols[view['dim_cutoff']:])

    pvts_dict[k] = df_pvt_qa
    

In [18]:
# List the names of the frames collected in pvts_dict so far.
pvts_dict.keys()

dict_keys(['details', 'raw_data', 'placement_details', 'campaign', 'placement', 'week'])

**Export**

In [19]:
# Output workbook name, stamped with the `dt` attribute of the SharePoint session.
qa_filename = f"QA_Digital_{app.dt}.xlsx"

# The context manager saves and closes the workbook on exit, including when a
# sheet fails to write. ExcelWriter.save() is deprecated in pandas 1.5 and
# removed in 2.0.
with pd.ExcelWriter(os.path.join(OUTPUTS_PATH, qa_filename), engine='xlsxwriter') as writer:
    # one sheet per collected frame, named after its key
    for sheet_name, frame in pvts_dict.items():
        frame.to_excel(writer, sheet_name=str(sheet_name), index=False)

**Format**

In [20]:
# Run format_qa on the exported workbook for the selected media channel.
# format_qa is not defined in this notebook; presumably it is provided by
# `from format_qa import *` in the imports cell — confirm.
format_qa(media, os.path.join(OUTPUTS_PATH, qa_filename))