In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import xlsxwriter
from itertools import chain
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import glob
import os
import re

import sys
sys.path.append('../utils')
from SharePoint import *
from hlpr import *
from static import *
from format_qa import *
from qa_by_media import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
media = 'social'

# Resolve the per-media settings once, then split them by backing system.
_media_cfg = media_args[media]
_sharepoint_cfg = _media_cfg['sharepoint']
_redshift_cfg = _media_cfg['redshift']

# SharePoint folders (the QA and flat-file folders are currently not used)
SHAREPOINT_DATA_PATH = _sharepoint_cfg['data']
SHAREPOINT_MAPPING_PATH = _sharepoint_cfg['mapping']
# SHAREPOINT_QA_PATH = _sharepoint_cfg['qa']
# SHAREPOINT_FLAT_PATH = _sharepoint_cfg['flat']

# Redshift settings (the all-metrics query is currently not used)
# REDSHIFT_METRICS_QUERY = _redshift_cfg['all_metrics']
REDSHIFT_METRICS_FILENAME = _redshift_cfg['metric_filename']
REDSHIFT_METRICS_SHEETNAME = _redshift_cfg['metric_sheetname']
REDSHIFT_QA_QUERY = _redshift_cfg['qa_qry']

**Connect to SharePoint**

In [3]:
# Open a client for the 'AmericanExpressUSGABM' SharePoint site.
app = SharePoint('AmericanExpressUSGABM')

# download data
# Only the folder listing is active; the download and archive steps are disabled.
app.list_contents(SHAREPOINT_DATA_PATH)
# app.download_files(DATA_PATH)
# app.archive_files()

# download floodlight mapping
# (fully disabled: neither the listing nor the download of the mapping folder runs)
# app.list_contents(SHAREPOINT_MAPPING_PATH)
# app.download_files(ASSETS_PATH)

connected to: https://interpublic.sharepoint.com/sites/AmericanExpressUSGABM

listing files from: Measurement%20%20Analytics%20Folder/GABM/Global%20Analytics/DigitalQA/Social/01Sandbox/National



In [4]:
# Inspect the file paths currently held by the SharePoint client.
app.file_paths

[]

**Import Raw Data**

In [5]:
def social_qa(redshift_metrics_filename, redshift_qa_query):
    '''
    Import each social platform's local data, query Redshift for the same slice
    and build the platform-vs-Redshift QA views.
        Parameters:
            redshift_metrics_filename (str): fragment of the metrics workbook filename;
                the first file under ASSETS_PATH matching `*<fragment>*` is read
                (sheet 'metric_map')
            redshift_qa_query: identifier passed to `get_qry_text` to obtain the SQL
                template containing the `load_*` placeholders
        Returns:
            social_dict (dict): keyed by platform abbreviation ('fb', 'li', 'pi', 'tw');
                only platforms with at least one matching file under DATA_PATH are
                present. Each value is the dict returned by `import_social` with the
                keys 'qry', 'qry_results' and 'qa_results' added.
        Raises:
            IndexError: if no metrics workbook matches `redshift_metrics_filename`
    '''
    # `time` is not imported by the notebook's import cell; importing it here keeps
    # the function from depending on a name leaking in through a star-import.
    import time

    print('starting qa\n')
    time.sleep(1)

    # import metrics
    # ========================================================================================================================
    metric_paths = glob.glob(os.path.join(ASSETS_PATH, f"*{redshift_metrics_filename}*"))
    if not metric_paths:
        # same exception type the bare `[0]` lookup produced, but with an actionable message
        raise IndexError(
            f"no metrics workbook matching '*{redshift_metrics_filename}*' found in {ASSETS_PATH}"
        )
    df_metrics = pd.read_excel(metric_paths[0], sheet_name='metric_map')

    # import and qa loop
    # ========================================================================================================================
    platforms = ['fb', 'li', 'pi', 'tw']
    social_dict = dict()

    for p in platforms:
        pth = glob.glob(os.path.join(DATA_PATH, f"{p}*")) # starts with platform abbreviation

        # nothing exported for this platform: leave it out of the results
        if len(pth) == 0:
            continue

        # import data
        social_dict[p] = import_social(p, pth, df_metrics)
        qry_params = social_dict[p]['params']

        # query database: fill the template placeholders with this platform's parameters
        qry_txt = get_qry_text(redshift_qa_query)
        qry_txt = qry_txt.replace('load_metrics', qry_params['metrics']) \
                         .replace('load_campaign_name', qry_params['campaign']) \
                         .replace('load_start_date', qry_params['start']) \
                         .replace('load_end_date', qry_params['end']) \
                         .replace('load_adset_id', qry_params['adset'])

        # for 'li' the adset filter is applied to `universal_adset` instead of `universal_adset_id`
        if p == 'li':
            qry_txt = qry_txt.replace('AND universal_adset_id IN', 'AND universal_adset IN')

        print(f"starting query for: {p}")
        start_time = time.time()
        df_qry_raw = run_qry(qry_txt)
        print(f"query completed: {time.time() - start_time}")

        social_dict[p]['qry'] = qry_txt
        social_dict[p]['qry_results'] = df_qry_raw

        # combine to create qa file
        social_dict[p]['qa_results'] = create_social_qa(p, social_dict[p]['pro_data'], social_dict[p]['qry_results'], df_metrics)

    return social_dict

In [6]:
# Run the full QA; only platforms with matching files under DATA_PATH are queried.
qa_results = social_qa(REDSHIFT_METRICS_FILENAME, REDSHIFT_QA_QUERY)

starting qa

starting query for: tw
query completed: 50.12732124328613


In [7]:
# Platforms that were processed (platforms skipped for lack of data files are absent).
qa_results.keys()

dict_keys(['tw'])

In [8]:
# Artifacts stored per platform: the import_social output plus the
# 'qry', 'qry_results' and 'qa_results' entries added by social_qa.
qa_results['tw'].keys()

dict_keys(['pre_data', 'params', 'pro_data', 'qry', 'qry_results', 'qa_results'])

In [12]:
# QA views returned by create_social_qa for this platform.
qa_results['tw']['qa_results'].keys()

dict_keys(['campaign', 'adset', 'week'])

In [13]:
# Campaign-level view: platform value, Redshift value and % diff per metric.
qa_results['tw']['qa_results']['campaign']

Unnamed: 0,campaign_name,Link clicks_platform,Link clicks_redshift,Link clicks_%_diff,Spend_platform,Spend_redshift,Spend_%_diff,Impressions_platform,Impressions_redshift,Impressions_%_diff
0,21_Q1_USA_GA_TW_LINK_Dine Small - Takeout Tuesday,1555.0,1555.0,0.0,9360.0,9360.0,0.0,785223.0,785223.0,0.0


### Create Social QA

In [53]:
# Distinct `universal_adset` values returned by the Redshift query.
qa_results['tw']['qry_results']['universal_adset'].unique()

array(['In Feed_na_Dine Small_A25-54_English_Taco Groundhog Day-VID-:15-Groundhog Day Unbranded_294385184',
       'In Feed_na_Dine Small_A25-54_English_Chicken Groundhog Day-VID-:15-Groundhog Day Unbranded_294385184'],
      dtype=object)

In [17]:
# params
# Rebuild the create_social_qa inputs for a single platform from the stored
# results, so the function can be re-run without querying Redshift again.
# NOTE: `[0]` raises IndexError when no workbook matches the filename fragment.
pth = glob.glob(os.path.join(ASSETS_PATH, f"*{REDSHIFT_METRICS_FILENAME}*"))[0]
df_metrics = pd.read_excel(pth, sheet_name='metric_map')

platform = 'tw'
# positional order matches create_social_qa(platform, df_ui, df_rs, df_metrics)
params = [platform, qa_results[platform]['pro_data'], qa_results[platform]['qry_results'], df_metrics]

In [44]:
def create_social_qa(platform, df_ui, df_rs, df_metrics):
    '''
    Combine platform and Redshift data to calculate diff across metrics.
        Parameters:
            platform (str): abbreviated platform name: 'fb', 'li', 'pi', 'tw'
            df_ui (df): platform dataframe in long format; every column except the
                last is used as a grouping field and the 'value' column is summed
            df_rs (df): redshift dataframe (not modified; a copy is used internally)
            df_metrics (df): platform metrics and redshift equivalent; needs the
                columns 'platform', 'platform_metric', 'redshift_universal_metric'
        Returns:
            pvts_dict (dict): dictionary of metric differences keyed by view
                ('campaign', 'adset', 'week'). Each value is a dataframe holding the
                view's dimension columns followed, per metric, by
                '<metric>_platform', '<metric>_redshift' and '<metric>_%_diff'.
                Metrics that exist for only one source are left out.
    '''

    pvts_dict = dict()

    # rename redshift columns to align with platform data
    rs_rename_cols = {
        'universal_campaign':'campaign_name',
        'universal_adset': 'adset',
        'universal_campaign_id': 'campaign_id',
        'universal_adset_id': 'adset_id',
        'platform_metric': 'variable'
                    }

    # add platform to redshift dataframe
    # (on a copy: writing the column into the caller's frame would silently alter
    # the stored query results)
    df_rs = df_rs.copy()
    df_rs['platform'] = platform

    # rename columns for mapping
    df_metrics_fltr = (
        df_metrics.loc[df_metrics['platform'] == platform, ['platform_metric', 'redshift_universal_metric']]
        .drop_duplicates()
        .rename(columns={'redshift_universal_metric': 'universal_metric'})
    )

    # add the twitter conversions to the filter
    if platform == 'tw':
        pth = glob.glob(os.path.join(ASSETS_PATH, "*Social_Metrics*"))[0]
        # kept in its own name so the `df_metrics` argument is not overwritten
        df_tw_conversions = pd.read_excel(pth, sheet_name='tw_conversions')
        floodlight = list(df_tw_conversions['floodlight_name']) # filter dcm data by these floodlights

        # floodlight names map to themselves (platform name == redshift name)
        df_tw_floodlights = pd.DataFrame({'platform_metric': floodlight, 'universal_metric': floodlight})

        df_metrics_fltr = pd.concat([df_metrics_fltr, df_tw_floodlights])

    # map variables to platform name (merge runs on the columns both frames share)
    df_rs = df_rs.merge(df_metrics_fltr, how='left')
    df_rs = df_rs.rename(columns=rs_rename_cols)

    # group data: every platform column except the last one is a grouping field
    group_fields = list(df_ui.columns[:-1])
    if platform == 'li':
        group_fields = [f for f in group_fields if '_id' not in f.lower()] # remove id from li qa
    df_rs = df_rs.groupby(group_fields)['value'].sum().to_frame().reset_index()
    df_ui = df_ui.groupby(group_fields)['value'].sum().to_frame().reset_index()

    # add source
    df_rs['source'] = 'redshift'
    df_ui['source'] = 'platform'

    # combine rs and ui for qa
    df_qa = pd.concat([df_rs, df_ui])
    df_qa['value'] = np.round(df_qa['value'])

    # set up views
    view_args = {
        'campaign': {
            'index':['campaign_name'],
            'dim_cutoff': 1
                    },
        'adset': {
            'index':['campaign_name', 'adset'],
            'dim_cutoff': 2
                 },
        'week': {
            'index':['campaign_name', 'adset', 'week'],
            'dim_cutoff': 3
                 }
                }

    for k, view in view_args.items():
        index_cols = view['index']
        df_pvt = df_qa.pivot_table(index=index_cols, columns=['variable', 'source'], values='value', aggfunc='sum').reset_index()

        # metrics are the top-level column labels that are not dimensions; read them
        # from the columns themselves instead of relying on the internal level order
        available = [m for m in df_pvt.columns.get_level_values(0).unique() if m not in index_cols]

        # move core metrics to the beginning (core metrics with no data are skipped
        # rather than raising a KeyError)
        core_metrics = [m for m in social_param_fields[platform]['core_metrics'] if m in available]
        metrics = core_metrics + [m for m in available if m not in core_metrics]

        # dimension columns come first; flatten their (name, '') labels to plain names
        dims = df_pvt.iloc[:, :view['dim_cutoff']].copy()
        dims.columns = list(index_cols)
        frames = [dims]

        for m in metrics:
            by_source = df_pvt[m]

            # a diff needs both sides; metrics seen in a single source are left out
            if not {'platform', 'redshift'}.issubset(by_source.columns):
                continue

            ui_vals = by_source['platform']
            rs_vals = by_source['redshift']

            # fill na based on condition: a value missing on one side only counts
            # as 0; rows missing on both sides stay NaN
            ui_filled = ui_vals.mask(ui_vals.isna() & rs_vals.notna(), 0)
            rs_filled = rs_vals.mask(rs_vals.isna() & ui_vals.notna(), 0)

            # platform relative to redshift; a positive platform value against a
            # redshift 0 (+inf) is reported as 1, while 0 vs 0 stays NaN
            pct_diff = (ui_filled / rs_filled - 1).replace(np.inf, 1)

            frames.append(pd.DataFrame({
                f"{m}_platform": ui_filled,
                f"{m}_redshift": rs_filled,
                f"{m}_%_diff": pct_diff,
            }))

        # with no comparable metric this is just the dimension columns
        pvts_dict[k] = pd.concat(frames, axis=1)

    return pvts_dict


In [45]:
# Re-run the QA builder on the stored inputs: params = [platform, df_ui, df_rs, df_metrics].
results = create_social_qa(params[0], params[1], params[2], params[3])

In [46]:
# Views returned by create_social_qa.
results.keys()

dict_keys(['campaign', 'adset', 'week'])

In [50]:
# Campaign-level view: platform value, Redshift value and % diff per metric.
results['campaign']

Unnamed: 0,campaign_name,Link clicks_platform,Link clicks_redshift,Link clicks_%_diff,Spend_platform,Spend_redshift,Spend_%_diff,Impressions_platform,Impressions_redshift,Impressions_%_diff,Resy : Resy Takeout: Click-through Conversion Events + Cross-Environment_platform,Resy : Resy Takeout: Click-through Conversion Events + Cross-Environment_redshift,Resy : Resy Takeout: Click-through Conversion Events + Cross-Environment_%_diff,Resy : Resy Takeout: View-through Conversion Events + Cross-Environment_platform,Resy : Resy Takeout: View-through Conversion Events + Cross-Environment_redshift,Resy : Resy Takeout: View-through Conversion Events + Cross-Environment_%_diff
0,21_Q1_USA_GA_TW_LINK_Dine Small - Takeout Tuesday,1555.0,1555.0,0.0,9360.0,9360.0,0.0,785223.0,785223.0,0.0,0.0,0.0,,0.0,0.0,
