In [21]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import xlsxwriter
from itertools import chain
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import glob
import os
import re

import sys
sys.path.append('../utils')
from SharePoint import *
from hlpr import *
from static import *
from format_qa import *
from qa_by_media import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
media = 'digital'

SHAREPOINT_DATA_PATH = media_args[media]['sharepoint']['data']
SHAREPOINT_MAPPING_PATH = media_args[media]['sharepoint']['mapping']
SHAREPOINT_QA_PATH = media_args[media]['sharepoint']['qa']
SHAREPOINT_FLAT_PATH = media_args[media]['sharepoint']['flat']
REDSHIFT_METRICS_QUERY = media_args[media]['redshift']['all_metrics']
REDSHIFT_METRICS_FILENAME = media_args[media]['redshift']['metric_filename']
REDSHIFT_METRICS_SHEETNAME = media_args[media]['redshift']['metric_sheetname']
REDSHIFT_QA_QUERY = media_args[media]['redshift']['qa_qry']

**Connect to SharePoint**

In [12]:
app = SharePoint('AmericanExpressUSGABM')

# download data
app.list_contents(SHAREPOINT_DATA_PATH)
# app.download_files(DATA_PATH)
# app.archive_files()

ancillary_data(app, media)

# download floodlight mapping
# app.list_contents(SHAREPOINT_MAPPING_PATH)
# app.download_files(ASSETS_PATH)

connected to: https://interpublic.sharepoint.com/sites/AmericanExpressUSGABM

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Digital/01Sandbox/DCMPull/DCMReport

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Digital/01Sandbox/SiteServed

downloading data to: C:\Users\carmelo.urena\Documents\Main\OneDrive_BAK\Amex\DigitalQA\SharePoint\sharepoint_api\data



**Import Metrics**

In [15]:
# import metrics
pth = glob.glob(os.path.join(ASSETS_PATH, f"*{REDSHIFT_METRICS_FILENAME}*"))[0]
df_metrics = pd.read_excel(pth, sheet_name='ConvMap')

campaign_name_list = [c for c in df_metrics['Campaign'].unique()] 
campaign_ids_list = [str(c) for c in df_metrics['CampaignID'].unique()]
activity_list = [str(a) for a in df_metrics['ActivityID'].dropna().unique()]
core_metrics = ['Impressions', 'Clicks', 'Media Cost', 'Video Plays', 'Video Views', 'Video Completions']

**Import Raw Data**

In [41]:
# import digital data
list_of_dfs = []

files = glob.glob(os.path.join(DATA_PATH, '*.csv'))
for f in files:
    df = pd.read_csv(f, skiprows=10).iloc[:-1, :]
    df = df[df['Campaign ID'].map(str).isin(campaign_ids_list)] # filter campaigns
    df['Date'] = pd.to_datetime(df['Date'])
    
# core metrics
    df_core = df[df['Activity ID']=='(not set)'].iloc[:, :-2]
    id_vars = [c for c in df_core.columns if not(c in core_metrics)]
    df_core = df_core.melt(id_vars=id_vars)
    df_core['metric'] = df_core['variable'] 
    
# activity    
    df_act = df[df['Activity ID'].isin(activity_list)]
    df_act.drop(columns=core_metrics, inplace=True) # remove core metrics

    # create floodlight metrics
    id_vars = df_act.columns[:-2]
    df_act = df_act.melt(id_vars=id_vars)
    if len(activity_list) > 0:
        df_act['metric'] = df_act.apply(lambda x: x['Activity Group'] + ' : ' + x['Activity'] + ': ' + x['variable'], axis=1)
    else:
        df_act['metric'] = None # dummy
        
# combine core and activity
    df_combined = pd.concat([df_core, df_act], sort=False)
    
    # drop activity
    drop_activity = [c for c in df_combined.columns if 'activity' in c.lower()]
    df_combined = df_combined.drop(columns=drop_activity)

    list_of_dfs.append(df_combined)

# combine digital data
df_ui = pd.concat(list_of_dfs)

ValueError: No objects to concatenate

In [37]:
activity_list

['4882628', '10740700', '10728117', '10728120', '10669167']

**Get Parameters for DB**

In [6]:
# params
db_campaign_id = "','".join(list(df_ui['Campaign ID'].map(str).unique()))
db_placement_id = "','".join(list(df_ui['Placement ID'].map(str).unique())) 
db_start_date = datetime.strftime(df_ui['Date'].min(), '%Y-%m-%d')
db_end_date = datetime.strftime(df_ui['Date'].max(), '%Y-%m-%d')
db_metrics = "','".join(list(df_ui['metric'].unique()))

**DB Query**

In [7]:
# get data from redshift
qry_txt = get_qry_text(REDSHIFT_QA_QUERY)
qry_txt = qry_txt.replace('load_metrics', db_metrics) \
                 .replace('load_campaign_id', db_campaign_id) \
                 .replace('load_placement_id', db_placement_id) \
                 .replace('load_start_date', db_start_date) \
                 .replace('load_end_date', db_end_date)

time.sleep(1)
df_qry_raw = run_qry(qry_txt)

In [8]:
df_qry = df_qry_raw.copy()
df_qry_dcm = df_qry[df_qry['source']!='override'] # contains dcm & cadreon
df_qry_ss = df_qry[df_qry['source']=='override']

# df_qry_dcm.drop(columns=['source'], inplace=True)
# df_qry_ss.drop(columns=['source'], inplace=True)

In [9]:
# df_temp = df_qry_dcm[df_qry_dcm['campaign_name']=='USA_NAT_PRO_GA_EN_NON_GAB_21_Q1_Programmatic_Enterprise_Dine Small']
# df_temp = df_temp[df_temp['source']=='dcm']
# df_temp[df_temp['metric']=='Media Cost'].to_excel('dcm_pro.xlsx', index=False)

In [10]:
# df_qry_ss

**Combine SS Templated to DB**

In [11]:
ss_metric_rename = {
    'clicks': 'Clicks',
    'spend': 'Media Cost',
    'impressions': 'Impressions',
    'video plays': 'Video Plays',
    'video completions': 'Video Completions'
}

In [12]:
# process SS templates
files = glob.glob(os.path.join('../data', '*xlsx'))

df_ss_template = process_ss_templates(files)
df_ss_template = df_ss_template[df_ss_template['campaign name'].isin(campaign_name_list)]

# align columns to redshift
df_ss_template['week'] = df_ss_template['date'].apply(lambda x: x - timedelta(days=x.weekday()))
df_ss_template.rename(columns={'publisher (site)':'site_name'}, inplace=True)
df_ss_template.rename(columns={c: c.replace(' ', '_') for c in df_ss_template.columns}, inplace=True)

df_ss_template['metric'] = df_ss_template['metric'].apply(lambda x: ss_metric_rename[x])

# add source
df_ss_template['source'] = 'ui'
df_qry_ss['source'] = 'redshift'

In [13]:
df_qa_ss = pd.concat([df_qry_ss, df_ss_template], sort=True)

In [14]:
df_qa_ss.drop(columns=['date', 'campaign_id', 'site_id'], inplace=True)

In [15]:
df_qa_ss['site_served'] = True

**Combined DCM UI and DB**

In [16]:
# align ui column names
ui_col_rename = {
    'Date': 'week',
    'Campaign': 'campaign_name',
    'Campaign ID': 'campaign_id',
    'Placement': 'placement_name',
    'Placement ID': 'placement_id',
    'Site (DCM)': 'site_name',
    'Site ID (DCM)': 'site_id',
#     'variable': 'sa360_col_name',
    'metric': 'metric',
    'value': 'value'
}

# filter cols in UI data
df_ui_fltr = df_ui[ui_col_rename.keys()]
df_ui_fltr.rename(columns=ui_col_rename, inplace=True)

# combine
df_qry_dcm['source'] = 'redshift'
df_ui_fltr['source'] = 'ui'

df_qa = pd.concat([df_qry_dcm, df_ui_fltr], sort=False) 

In [17]:
# normalize data
id_cols = [c for c in df_qa.columns if '_id' in c]
for c in id_cols:
    df_qa[c] = df_qa[c].map(int)
    
df_qa['week'] = pd.to_datetime(df_qa['week'])
df_qa['week'] = df_qa['week'].apply(lambda x: x - timedelta(days=x.weekday()))

# remove trailing spaces
df_qa['campaign_name'] = df_qa['campaign_name'].str.rstrip()
df_qa['placement_name'] = df_qa['placement_name'].str.rstrip()
df_qa['site_name'] = df_qa['site_name'].str.rstrip()
df_qa['metric'] = df_qa['metric'].str.rstrip()

In [18]:
df_qa['site_served'] = False

**Combine All**

In [19]:
df_qa = pd.concat([df_qa, df_qa_ss])

**QA Pivots**

In [20]:
pvts_dict = dict()

db_params_dict = {'last_updated':datetime.now(), 'start_date': db_start_date, 'end_date': db_end_date, 'sql_qry': qry_txt}
details = pd.DataFrame.from_dict(db_params_dict, orient='index').reset_index()
details.columns = ['variable', 'value']

pvts_dict['details'] = details
pvts_dict['raw_data'] = df_qa

In [21]:
# df_qa

In [22]:
# ui_placements = df_qa[df_qa['source']=='dcm'][['campaign_name', 'campaign_id', 'placement_name', 'placement_id']].drop_duplicates()
# pvts_dict['placement_details'] = ui_placements

In [9]:
# df_qa.head()

In [24]:
view_args = {
    'campaign': {'index':['site_served', 'campaign_name', 'site_name'],
                   'dim_cutoff': 3},
        
    'placement': {'index':['site_served', 'campaign_name', 'placement_id', 'site_name'], 
                 'dim_cutoff': 4},
    
    
    'week': {'index':['site_served', 'campaign_name', 'week', 'site_name'],
             'dim_cutoff': 4}
}

In [25]:
# df_qa['metric'].unique()

In [26]:
digital_param_fieds = {
    'dcm': {
        'core_metrics': ['Clicks', 'Media Cost', 'Impressions', 'Video Plays', 'Video Views', 'Video Completions']
    }
}

In [27]:
for k in view_args.keys():
    df_pvt =  df_qa.pivot_table(index=view_args[k]['index'], columns=['metric', 'source'], values='value', aggfunc='sum').reset_index()
    
    list_of_metric_diff = []
    metrics = df_pvt.columns.levels[0][:list(df_pvt.columns.levels[0]).index(view_args[k]['index'][-1])]
    
    # move core metrics to the beginning
    core_metrics = digital_param_fieds['dcm']['core_metrics']
    metrics = core_metrics + [m for m in metrics if not(m in core_metrics)]
    
    for m in metrics:
        df_temp = df_pvt.copy()
        df_temp = df_temp[m]
        df_temp.columns = [f"{m}_{c}" for c in df_temp.columns]
        
        # fill na based on condition
        if len(df_temp.columns) < 2:
            continue
        col0 = df_temp.columns[0]
        col1 = df_temp.columns[1]
        df_temp[col0] = df_temp.apply(lambda x: 0 if np.isnan(x[col0]) and not(np.isnan(x[col1])) else x[col0], axis=1)
        df_temp[col1] = df_temp.apply(lambda x: 0 if np.isnan(x[col1]) and not(np.isnan(x[col0])) else x[col1], axis=1)
        
        df_temp[f"{m}_%_diff"] = (df_temp.iloc[:,1]/df_temp.iloc[:,0])-1
        df_temp[f"{m}_%_diff"] = df_temp[f"{m}_%_diff"].apply(lambda x: 1 if x == float('inf') else x)
        
        list_of_metric_diff.append(df_temp)
        
    metric_diffs = pd.concat(list_of_metric_diff, axis=1)
    df_pvt_qa = pd.concat([df_pvt.iloc[:, :view_args[k]['dim_cutoff']], metric_diffs], axis=1)
    df_pvt_qa.columns = [c[0] if type(c) is tuple else c for c in df_pvt_qa.columns]
    
    pvts_dict[k] = df_pvt_qa
    

In [28]:
pvts_dict.keys()

dict_keys(['details', 'raw_data', 'campaign', 'placement', 'week'])

**Export**

In [29]:
qa_filename = f"QA_Digital_{app.dt}.xlsx"
writer = pd.ExcelWriter(os.path.join(OUTPUTS_PATH, qa_filename), engine='xlsxwriter')

for k in pvts_dict.keys():
    pvts_dict[k].to_excel(writer, sheet_name=f"{k}", index=False)
        
writer.save()

**Format**

In [30]:
# media = 'digital'
# qa_filename = 'QA_Digital_20210224.xlsx'

In [31]:
format_qa(media, os.path.join(OUTPUTS_PATH, qa_filename))

**Final Archiver**

In [20]:
def archive_data(medias):
    app = SharePoint('AmericanExpressUSGABM')
    
    print(f"\n archiving data\n{'='*50}")
          
    for media in medias:
        SHAREPOINT_DATA_PATH = media_args[media]['sharepoint']['data']
        app.list_contents(SHAREPOINT_DATA_PATH)
        app.download_files(DATA_PATH)
        app.archive_files()
        
        if media == 'digital':
            app.list_contents(media_args[media]['sharepoint']['ancillary_data'])
            app.download_files(DATA_PATH)
            app.archive_files()
          
    delete_local_data(False)
        
    return None

In [21]:
medias = ['search', 'social', 'digital']

In [22]:
archive_data(medias)

connected to: https://interpublic.sharepoint.com/sites/AmericanExpressUSGABM


 archiving data
listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Search/01Sandbox/SA360_Files/Gmail/National

downloading data to: C:\Users\carmelo.urena\Documents\Main\OneDrive_BAK\Amex\DigitalQA\SharePoint\sharepoint_api\data

archiving data in SharePoint

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Social/01Sandbox/National

downloading data to: C:\Users\carmelo.urena\Documents\Main\OneDrive_BAK\Amex\DigitalQA\SharePoint\sharepoint_api\data

archiving data in SharePoint

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Digital/01Sandbox/DCMPull/DCMReport

downloading data to: C:\Users\carmelo.urena\Documents\Main\OneDrive_BAK\Amex\DigitalQA\SharePoint\sharepoint_api\data

archiving data in SharePoint

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics

In [32]:
df_log = pd.read_csv(os.path.join(OUTPUTS_PATH, 'log.csv'))