In [50]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import xlsxwriter
from itertools import chain
from datetime import datetime, timedelta
import pandas as pd
import glob
import os

import sys
sys.path.append('../utils')
from SharePoint import *
from hlpr import *
from static import *

import warnings
warnings.filterwarnings('ignore')

**Functions**

In [51]:
def latest_search_files(folder_path):
    '''
    Return the most recently created file of every report group in a folder.

    Files directly inside ``folder_path`` are grouped by a key taken from their
    base name (the text before the first '.'): everything that precedes the
    first case-insensitive occurrence of 'nat', minus the single separator
    character right before it. For each group the path with the greatest
    ``os.path.getctime`` is kept.

    A name that does not contain 'nat' forms its own group under its full base
    name. Previously the "not found" result of ``str.find`` (-1) was used as a
    slice bound, which chopped the last two characters off such names and could
    merge unrelated files into one group.

    Parameters
    ----------
    folder_path : str
        Folder whose direct children are inspected.

    Returns
    -------
    list of str
        One path per group, in first-seen group order.
    '''
    groups = {}

    for file_path in glob.glob(os.path.join(folder_path, '*')):
        fullname = os.path.basename(file_path).split('.')[0]
        cutoff = fullname.lower().find('nat')

        if cutoff < 0:
            # No 'nat' marker: keep the whole name instead of slicing with -1.
            key = fullname
        else:
            # Drop the separator before 'nat'; clamp so a match at index 0
            # yields an empty prefix rather than a negative slice bound.
            key = fullname[:max(cutoff - 1, 0)]

        groups.setdefault(key, []).append(file_path)

    return [max(paths, key=os.path.getctime) for paths in groups.values()]


def _flatfile_archive_date(path):
    '''Return the integer date suffix of a flat-file name, or -1 when it has none.'''
    suffix = os.path.basename(path).split('_')[-1].split('.')[0]
    try:
        return int(suffix)
    except ValueError:
        return -1


def download_flatfile(app, media):
    '''
    Download the existing flat files from SharePoint and load those of one media.

    Everything listed under SHAREPOINT_FLAT_PATH is downloaded into DATA_PATH,
    then the CSV files whose names start with the platform code (social) or
    with the media name (any other media) are read with pandas. When several
    matching files exist, the one with the greatest numeric date suffix is
    loaded, mirroring how ``last_archive_date`` identifies the current file.

    Parameters
    ----------
    app : SharePoint
        Connected client exposing ``list_contents``, ``download_files`` and
        the ``file_paths`` / ``folder_paths`` listings.
    media : str
        'social', 'search' or 'digital'.

    Returns
    -------
    dict
        ``{'social': {'fb', 'li', 'pi', 'tw' -> DataFrame or None},
        'search': DataFrame or None, 'digital': DataFrame or None}``; only the
        entry of the requested media is filled. A social platform without a
        downloaded file stays None.

    Raises
    ------
    FileNotFoundError
        If ``media`` is not 'social' and no matching file was downloaded.
    '''
    # TODO: build this mapping dynamically instead of hard-coding the media types.
    media_dicts = {'social':{'fb':None,'li':None,'pi':None,'tw':None},
                   'search':None,
                   'digital':None}

    # Start from an empty listing, as download_archives does before each
    # list_contents call, so that paths left over from a previous listing are
    # not downloaded into DATA_PATH next to the flat files.
    # NOTE(review): assumes list_contents fills these two attributes - confirm.
    app.folder_paths, app.file_paths = [], []
    app.list_contents(SHAREPOINT_FLAT_PATH)

    # download_files takes only a destination, so one call per listing is
    # enough (the previous loop repeated the same call once per listed file).
    if app.file_paths:
        app.download_files(DATA_PATH)

    # store results in dictionary
    if media=='social':
        for p in media_dicts[media]:
            path = glob.glob(os.path.join(DATA_PATH, f"{p}*"))
            if path:
                media_dicts[media][p] = pd.read_csv(max(path, key=_flatfile_archive_date))
    else:
        path = glob.glob(os.path.join(DATA_PATH, f"{media}*"))
        if not path:
            raise FileNotFoundError(
                f"No '{media}' flat file was downloaded to {DATA_PATH}")
        media_dicts[media] = pd.read_csv(max(path, key=_flatfile_archive_date))

    return media_dicts

def last_archive_date(app):
    '''
    Get the date of the current flat file.

    Each path in ``app.file_paths`` is expected to end in ``_<digits>.<ext>``.
    The suffix after the last underscore (without extension) is parsed as an
    integer; paths whose suffix is not numeric are ignored.

    Parameters
    ----------
    app : object
        Anything exposing ``file_paths``, an iterable of path strings.

    Returns
    -------
    str
        The suffix, exactly as written in the file name, with the greatest
        integer value.

    Raises
    ------
    ValueError
        If no path carries a numeric date suffix.
    '''
    dated = {}
    for file_path in app.file_paths:
        try:
            suffix = os.path.split(file_path)[-1].split('_')[-1].split('.')[0]
            # Keyed by the integer value so max() compares numerically while
            # the original text (e.g. leading zeros) is what gets returned.
            dated[int(suffix)] = suffix
        except (TypeError, ValueError):
            # Not a dated flat file (or not a path at all): skip it.
            continue

    if not dated:
        raise ValueError(
            'No flat file with a numeric date suffix was found in app.file_paths')

    return dated[max(dated)]

def download_archives(app, last_archive_date_=0):
    '''
    Download every archive folder newer than a given date into DATA_PATH.

    The sub-folders of SHAREPOINT_DATA_PATH are expected to be named after an
    integer archive date. Each folder whose date is strictly greater than
    ``last_archive_date_`` is listed and, when it contains files, downloaded
    into ``DATA_PATH/<archive date>``. Folders with a non-numeric name are
    skipped. A folder that fails to list or download is reported and skipped
    so the remaining folders are still processed.

    Parameters
    ----------
    app : SharePoint
        Connected client exposing ``list_contents``, ``download_files`` and
        the ``file_paths`` / ``folder_paths`` listings.
    last_archive_date_ : int or str, default 0
        Only folders dated after this value are downloaded.

    Returns
    -------
    bool
        True when DATA_PATH holds at least one entry afterwards, otherwise
        False (after printing 'FlatFile is up-to-date').

    Raises
    ------
    ValueError, TypeError
        If ``last_archive_date_`` cannot be converted to an integer.
    '''
    # Fail loudly on an unusable cutoff instead of silently skipping every folder.
    cutoff = int(last_archive_date_)

    # get list of data folders and files
    app.list_contents(SHAREPOINT_DATA_PATH)

    # Iterate over a snapshot: the listing attributes are reset inside the loop.
    for folder in list(app.folder_paths):
        archive_date = os.path.split(folder)[-1]
        try:
            is_new = int(archive_date) > cutoff
        except ValueError:
            # Folder name is not an archive date.
            continue
        if not is_new:
            continue

        try:
            app.folder_paths, app.file_paths = [], []
            app.list_contents(folder)
            if len(app.file_paths)>0:
                data_subpath = os.path.join(DATA_PATH, archive_date)
                os.makedirs(data_subpath, exist_ok=True)
                app.download_files(data_subpath)
        except Exception as exc:
            # Stay best-effort, but do not hide the failure.
            print(f"Skipping archive {archive_date}: {exc}")

    # NOTE(review): this checks for any content in DATA_PATH, not only for
    # folders downloaded by this call - confirm DATA_PATH is emptied between runs.
    if not os.path.isdir(DATA_PATH) or len(os.listdir(DATA_PATH))==0:
        print('FlatFile is up-to-date')
        return False
    return True
        
def create_flatfile(media, redshift_metrics_filename, last_flatfile=None):
    '''
    Build the flat file of the requested media.

    Prints which media is being processed, then hands over to the social or
    the search builder. Any other media value is only announced; nothing is
    built for it. Always returns None.
    '''
    print(f"\nCreating {media} flat file")

    if media == 'search':
        search_flatfile(redshift_metrics_filename, last_flatfile)
        return None

    if media == 'social':
        social_flatfile(redshift_metrics_filename, last_flatfile)

    return None



def social_flatfile(redshift_metrics_filename, last_flatfile=None):
    '''
    Create social flat files, one CSV per platform.

    For every archive-date folder in DATA_PATH and every platform
    ('fb', 'li', 'pi', 'tw'), the most recently created file whose name starts
    with the platform code is imported through the hlpr function
    ``import_social`` and tagged with the folder's archive date. Per platform
    the rows are summed over all non-value columns, only the rows of the
    latest archive date per flat-file group are kept, the platform's date
    column is renamed to 'date', and the result is written to
    ``OUTPUTS_PATH/<platform>_flatfile_<max archive date>.csv``.

    Parameters
    ----------
    redshift_metrics_filename : str
        Name fragment of the metrics workbook in ASSETS_PATH; its
        'metric_map' sheet is passed to ``import_social``.
    last_flatfile : dict, optional
        Result of ``download_flatfile``; when given, the previous flat file of
        each platform (if one exists) is prepended and exact duplicate rows
        are dropped.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If no metrics workbook matches ``redshift_metrics_filename``.
    '''
    col_order = ['archive_date', 'week', 'date', 'platform', 'campaign_name', 'campaign_id', 'adset', 'adset_id', 'variable', 'value']
    platforms = ('fb', 'li', 'pi', 'tw')

    # params
    metric_files = glob.glob(os.path.join(ASSETS_PATH, f"*{redshift_metrics_filename}*"))
    if not metric_files:
        raise FileNotFoundError(
            f"No metrics file matching '*{redshift_metrics_filename}*' in {ASSETS_PATH}")
    df_metrics = pd.read_excel(metric_files[0], sheet_name='metric_map')

    # store contents
    platform_frames = {p: [] for p in platforms}

    # loop 1: load data to dictionary
    for g in glob.glob(os.path.join(DATA_PATH, '*')):
        try:
            # Archive folders are named after their integer archive date.
            archive_date = int(os.path.basename(g))
        except ValueError:
            # Anything else in DATA_PATH is skipped, as search_flatfile does.
            continue

        for p in platforms:
            path = glob.glob(os.path.join(g, f"{p}*"))
            if not path:
                continue
            latest_file_path = max(path, key=os.path.getctime)
            results = import_social(p, [latest_file_path], df_metrics) # using hlpr function
            df_pre = results['pre_data']
            df_pre['archive_date'] = archive_date
            platform_frames[p].append(df_pre)

    # given the platform, combine all data
    platform_dict = {p: pd.concat(frames, sort=False)
                     for p, frames in platform_frames.items() if len(frames)>0}

    # The two-second sleep that used to sit here was dropped: it separated two
    # in-memory steps and relied on `time`, which this notebook never imports.

    # loop 2: create flat file with new data
    for p in platform_dict.keys():
        try:
            # group dimensions
            df = platform_dict[p].copy()
            group_cols = [c for c in df.columns if c.lower()!='value']
            df_grp = df.groupby(group_cols)['value'].sum().to_frame().reset_index()

            # get latest based on id and archive date
            # Looked up under 'social' explicitly so the result does not depend
            # on whatever the notebook-level `media` variable currently holds.
            flatfile_group_cols = media_args['social']['flatfile']['group_columns'][p]
            max_date_map = df_grp.groupby(flatfile_group_cols)['archive_date'].max().to_frame().reset_index()
            max_archive_date = max(max_date_map['archive_date'])
            # Merging on the shared columns keeps only rows of the latest archive date per group.
            flatfile = max_date_map.merge(df_grp, how='left')

            # rename date field
            date_field = social_param_fields[p]['date']
            flatfile = flatfile.rename(columns={date_field:'date'})

            # export
            filename = f"{p}_flatfile_{max_archive_date}.csv"

            if last_flatfile is not None:
                # add previous flatfile; download_flatfile leaves the entry as
                # None when the platform has no flat file yet
                flatfile_pre = last_flatfile['social'][p]
                if flatfile_pre is not None:
                    flatfile = pd.concat([flatfile_pre.copy(), flatfile], sort=False)
                    flatfile = flatfile.drop_duplicates()

            flatfile = flatfile[col_order]
            flatfile.to_csv(os.path.join(OUTPUTS_PATH, filename), index=False)

        except TypeError as e:
            print(f"        Skipping {p} due to TypeError: {e}")

    return None

def search_flatfile(redshift_metrics_filename, last_flatfile=None):
    '''
    Create search flatfile.

    For every archive-date folder in DATA_PATH, the latest file of each report
    group (see ``latest_search_files``) is read from Excel, melted to long
    format and tagged with the folder's archive date. Only the rows of the
    latest archive date per FLATFILE_GROUP_COLS group are kept, and the result
    is written to ``OUTPUTS_PATH/search_flatfile_<max archive date>.csv``.

    Parameters
    ----------
    redshift_metrics_filename : str
        Accepted for signature parity with ``social_flatfile``. It is not used:
        the metrics sheet this function used to load was never read afterwards,
        so that load was removed.
    last_flatfile : dict, optional
        Result of ``download_flatfile``; when given, ``last_flatfile['search']``
        is prepended and exact duplicate rows are dropped.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If DATA_PATH holds no search archive file to build from.
    '''
    list_of_dfs = []
    for g in glob.glob(os.path.join(DATA_PATH, '*')): # loop in folders
        try:
            archive_date = int(os.path.split(g)[-1])
        except ValueError:
            # Not an archive-date folder.
            continue
        for p in latest_search_files(g):
            df = pd.read_excel(p)
            # NOTE(review): presumably the first 11 columns are the dimensions
            # and the rest are metrics - confirm against the report layout.
            id_vars = list(df.columns[:11])
            df_trans = df.melt(id_vars=id_vars)
            df_trans['archive_date'] = archive_date
            list_of_dfs.append(df_trans)

    if not list_of_dfs:
        # pd.concat would fail with a less helpful "No objects to concatenate".
        raise ValueError(f"No search archive files found under {DATA_PATH}")

    new_data = pd.concat(list_of_dfs).reset_index(drop=True)
    new_data['From'] = pd.to_datetime(new_data['From'])

    max_date_map = new_data.groupby(FLATFILE_GROUP_COLS)['archive_date'].max().to_frame().reset_index()
    max_archive_date = max(max_date_map['archive_date'])
    # Merging on the shared columns keeps only rows of the latest archive date per group.
    flatfile = max_date_map.merge(new_data, how='left')

    if last_flatfile is not None:
        # add previous flatfile
        # NOTE(review): download_flatfile loads it with pd.read_csv, so its
        # 'From' values are likely strings while the new rows hold datetimes;
        # confirm duplicates across old and new rows are still detected.
        previous = last_flatfile['search'].copy()
        flatfile = pd.concat([previous, flatfile], sort=False)
        flatfile = flatfile.drop_duplicates()

    # Named after this builder's media rather than the notebook-level `media` variable.
    filename = f"search_flatfile_{max_archive_date}.csv"
    flatfile.to_csv(os.path.join(OUTPUTS_PATH, filename), index=False)

In [52]:
# Media processed by this run; every setting below is read from its media_args entry.
media = 'search'

# SharePoint folders
_sharepoint_cfg = media_args[media]['sharepoint']
SHAREPOINT_DATA_PATH = _sharepoint_cfg['data']
SHAREPOINT_MAPPING_PATH = _sharepoint_cfg['mapping']
SHAREPOINT_QA_PATH = _sharepoint_cfg['qa']
SHAREPOINT_FLAT_PATH = _sharepoint_cfg['flat']

# Redshift queries and metric workbook
_redshift_cfg = media_args[media]['redshift']
REDSHIFT_METRICS_QUERY = _redshift_cfg['all_metrics']
REDSHIFT_METRICS_FILENAME = _redshift_cfg['metric_filename']
REDSHIFT_METRICS_SHEETNAME = _redshift_cfg['metric_sheetname']
REDSHIFT_QA_QUERY = _redshift_cfg['qa_qry']

# Flat-file settings
_flatfile_cfg = media_args[media]['flatfile']
FLATFILE_GROUP_COLS = _flatfile_cfg['group_columns']
FLATFILE_FINAL_FILENAME = _flatfile_cfg['final_filename']

# Keep the notebook namespace identical to the one produced by direct lookups.
del _sharepoint_cfg, _redshift_cfg, _flatfile_cfg

**Connect to SharePoint**

In [53]:
# Connect to the 'AmericanExpressUSGABM' SharePoint site; `app` is the client
# handed to the download helpers (download_archives, download_flatfile, ...).
app = SharePoint('AmericanExpressUSGABM')
# Nothing is listed here: the next cell calls app.list_contents itself.
# app.list_contents(SHAREPOINT_DATA_PATH)

Connected to SharePoint: https://interpublic.sharepoint.com/sites/AmericanExpressUSGABM


In [54]:
# Before loading from the archive, check whether 04FlatFiles is empty.
# - Empty: download every archive folder locally and build the flat file from scratch.
# - Not empty: download only the archives newer than the current flat file and
#   append them to it.

# Reset the listing first so that re-running this cell does not see paths left
# over from a previous run (download_archives resets it the same way).
# NOTE(review): assumes list_contents fills these two attributes - confirm.
app.folder_paths, app.file_paths = [], []
app.list_contents(SHAREPOINT_FLAT_PATH) # list folders and files

if len(app.file_paths)==0:
    # Only build when something was actually downloaded; with no archive data
    # the flat-file builders have nothing to work from.
    if download_archives(app):
        create_flatfile(media, REDSHIFT_METRICS_FILENAME)
    # TODO: upload the new flat files to SharePoint

else:
    last_archive_date_ = last_archive_date(app)
    if download_archives(app, last_archive_date_):
        last_flatfile = download_flatfile(app, media) # dictionary of flat files in 04flatfile
        create_flatfile(media, REDSHIFT_METRICS_FILENAME, last_flatfile)

delete_local_data(False)


Creating search flat file

Deleting contents in data folder


In [37]:
# last_flatfile