In [75]:
import pandas as pd
import numpy as np
import chardet #Detect encoding type
import os
import copy
import re

In [76]:
from googleapiclient import discovery, http
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from jira import JIRA

In [77]:
import sys
# Make the local Notebooks project folder importable (presumably where the
# `authentication` package imported below lives - confirm).
# The import system does not expand '~', so resolve it explicitly. The previous
# literal also contained invisible Unicode directional-isolate characters
# (copy/paste residue) that made the path point at a non-existent location.
sys.path.append(os.path.expanduser('~/PycharmProjects/Notebooks'))
from authentication.CYcreds import google, slack, jira

In [78]:
#Drop columns with missing values
def missing_values(df, percentage):
    """Return ``df`` without the columns whose null share exceeds ``percentage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is not modified.
    percentage : float
        Threshold on a 0-100 scale. A column is dropped only when its
        percentage of null values is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the surviving columns.
    """
    # Percentage of null cells per column, in column order.
    null_share = df.isnull().sum() * 100 / len(df)

    sparse_columns = [
        name for name, share in zip(df.columns, null_share) if share > percentage
    ]
    return df.drop(sparse_columns, axis=1)


In [79]:
#Convert to datetime
def convert_string_to_date(df_colname):
    """Parse ``YYYY-MM-DD`` values into pandas datetimes.

    ``df_colname`` is passed straight to ``pd.to_datetime`` with the fixed
    format ``'%Y-%m-%d'``; the parsed result is returned unchanged.
    """
    parsed = pd.to_datetime(df_colname, format='%Y-%m-%d')
    return parsed
   

In [80]:
def search(values, searchFor):
    """Return the first key of ``values`` having an item that contains ``searchFor``.

    ``values`` is iterated key by key; for each key, every item of
    ``values[key]`` is tested with ``searchFor in item``. The first key with a
    hit is returned, or ``None`` when no item matches.
    """
    for key in values:
        if any(searchFor in item for item in values[key]):
            return key
    return None

In [81]:
# Read the raw ads export; the first column is parsed as dates while loading.
ads_fp = r'Data/PathToPurchase/ads.csv'
ads_df = pd.read_csv(
    ads_fp,
    sep=',',
    header=0,
    parse_dates=[0],
)

# Quick sanity checks. In a notebook only the last expression (the shape) is rendered.
ads_df.head()
ads_df.dtypes
ads_df.shape


(19728, 15)

In [82]:
# Canonical paid-media vendor name -> list of aliases accepted for that vendor.
# (The literal previously listed 'GPS World' twice; a repeated key is silently
# overwritten, so the duplicate entry has been removed.)
paidmedia_vendors_dict = {
    'Farm Equipment Showcase': ['farmequipmentshowcase', 'farmequipment'],
    'GPS World': ['gpsworld'],
    'Google': ['google'],
    'Netline': ['netline'],
    'PR': ['pr'],
    'Twitter': ['twitter'],
    'xyHt': ['xyht'],
    'Inside Unmanned Systems': ['insideunmannedsystems'],
    'LinkedIn': ['linkedin'],
    'Facebook': ['facebook'],
    'eEffective': ['eeffective'],
    'CIO Review': ['cioreview'],
    'AUVSI': ['auvsi'],
    'Google Keyword': ['googlekeyword'],
    'SAE': ['sae'],
    'PrecisionAG': ['precisionag'],
    'Robotics Tomorrow': ['roboticstomorrow'],
    'AREMA': ['arema'],
    'Interdrone': ['interdrone'],
    'Intergeo': ['intergeo'],
    'Unmanned Systems Technology': ['unmannedsystemstechnology'],
    'IEEE': ['ieee', 'ieee-manufacturingtechnology', 'ieee-roboticsnews', 'ieee-carsthatthink'],
    'Rail Group News': ['railgroupnews'],
    'Rail Track Structure': ['railtrackstructure'],
    'YahooBing': ['yahoobing', 'bing', 'yahoo'],
    'Targeted Digital': ['targeteddigital'],
    'ECN': ['ecn', 'electroniccomponentnews'],
    'RTS': ['rts', 'RTandS', 'rtands'],
    'GDN Ad': ['GDNResponsiveAds', 'gdnresponsiveads', 'gdnads', 'gdnad'],
    'Wards': ['wards'],
    'Wards Auto': ['wardsauto'],
    'Bulletin Media': ['bulletinmedia'],
    'Madison Logic': ['madisonlogic'],
    'Smart Brief': ['smartbrief'],
    'Instagram': ['instagram'],
    'Auto Tech Council': ['autotechcouncil'],
    'Xponential': ['xponential'],
    'Inside GNSS': ['insidegnss'],
    'Railway Gazette': ['RailwayGazette', 'railwaygazette'],
    'Automotive Engineering': ['Automotive Engineering', 'automotiveengineering', 'AutomotiveEngineering'],
}

# One row per canonical vendor, one column per alias slot; shorter alias lists
# are padded with missing values.
paidmedia_vendors_df = pd.DataFrame.from_dict(paidmedia_vendors_dict, orient='index')


def getvendor(df):
    """Return the canonical vendor name(s) that list ``df`` as an alias.

    Parameters
    ----------
    df : str
        Despite its name this is a single vendor string, not a DataFrame.
        It is compared for exact equality (case-sensitive) with every alias
        in ``paidmedia_vendors_df``.

    Returns
    -------
    numpy.ndarray
        Index labels of ``paidmedia_vendors_df`` whose row contains ``df``.
        Empty when nothing matches or when ``df`` is not a string (for
        example NaN from a missing cell), so callers can still join the result.
    """
    # A missing/non-text value cannot be an alias; report "no match" rather
    # than raising.
    if not isinstance(df, str):
        return paidmedia_vendors_df.index[:0].values

    # Exact comparison against every alias slot; padded (missing) slots never match.
    row_matches = paidmedia_vendors_df.eq(df).any(axis=1)
    return paidmedia_vendors_df.index[row_matches.values].values
    

In [83]:
# Canonical paid-media medium name -> list of aliases accepted for that medium.
paidmedia_medium_dict = {
    'SEM': ['cpc', 'sem'],
    'Retargeting': ['retargeting'],
    'Newsletter': ['enewsletter', 'newsletter'],
    'Email': ['eblast', 'email'],
    'Social': ['twitter', 'linkedin', 'instagram', 'facebook', 'social'],
    'Online Ad': ['geofence', 'roadblock', 'highimpact', 'digital', 'digitaledition', 'onlinead'],
    'Print Ad': ['print', 'native', 'content', 'printad'],
    'Lead Generation': ['leadgeneration'],
    'Event': ['event'],
    'PR': ['pr'],
    'Webinar': ['webinar'],
    'Sponsored Ad': ['Sponsored Ad', 'sponsoredad', 'SponsoredAd', 'sponsored ad'],
    'GDN Ad': ['gdnresponsiveads', 'GDN Responsive Ads', 'GDNResponsiveAds'],
}

# One row per canonical medium, one column per alias slot; shorter alias lists
# are padded with missing values.
paidmedia_medium_df = pd.DataFrame.from_dict(paidmedia_medium_dict, orient='index')


def getmedium(df):
    """Return the canonical medium name(s) that list ``df`` as an alias.

    Parameters
    ----------
    df : str
        Despite its name this is a single medium/tactic string, not a
        DataFrame. It is compared for exact equality (case-sensitive) with
        every alias in ``paidmedia_medium_df``.

    Returns
    -------
    numpy.ndarray
        Index labels of ``paidmedia_medium_df`` whose row contains ``df``.
        Empty when nothing matches or when ``df`` is not a string (for
        example NaN from a missing cell), so callers can still join the result.
    """
    # A missing/non-text value cannot be an alias; report "no match" rather
    # than raising.
    if not isinstance(df, str):
        return paidmedia_medium_df.index[:0].values

    # Exact comparison against every alias slot; padded (missing) slots never match.
    row_matches = paidmedia_medium_df.eq(df).any(axis=1)
    return paidmedia_medium_df.index[row_matches.values].values
    

In [84]:
# Per-column count of missing values, restricted to the columns that have any.
# (Inspection only: not the last expression of the cell, so nothing is rendered.)
ads_null_columns = ads_df.columns[ads_df.isnull().any()]
ads_df[ads_null_columns].isnull().sum()

# Keep only the columns that are non-null in at least 10% of the rows, i.e.
# drop columns that are more than 90% empty. The explicit copy makes
# ads_valid_df independent of ads_df, so the column assignments below (and in
# later cells) modify a frame of their own rather than a derived view.
ads_valid_df = ads_df.dropna(thresh=len(ads_df) * .1, axis=1).copy()

#Date Transformation
# Uses the original header 'Day of Date', so this must run before the column
# names are normalised below.
ads_valid_df['creative_run_date'] = convert_string_to_date(ads_valid_df['Day of Date'])

#Remove columns leading and trailing white spaces, lower casing all names, and replacing any remaining white spaces with underscores
ads_valid_df.columns = (
    ads_valid_df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
)




In [85]:
# Normalise the raw vendor label (lower-case, no whitespace or underscores) so
# it can be matched against the alias table. regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the labels untouched and break the lookup without raising.
ads_valid_df['mapped_vendor'] = (
    ads_valid_df['vendor'].str.lower().str.strip().str.replace(r'\s|_', '', regex=True)
)
# getvendor returns an array of canonical names; join them into one string
# (empty string when nothing matched).
ads_valid_df['mapped_vendor'] = ads_valid_df['mapped_vendor'].apply(getvendor).apply(lambda x: ' '.join(x))
ads_valid_df['mapped_vendor'].head()

#Check if mapped_vendor is empty (rows whose vendor could not be mapped)
ads_valid_df[ads_valid_df['mapped_vendor'].apply(len) == 0]

# Same normalisation and lookup for the tactic -> medium mapping.
ads_valid_df['mapped_medium'] = (
    ads_valid_df['tactic'].str.lower().str.strip().str.replace(r'\s|_', '', regex=True)
)
ads_valid_df['mapped_medium'] = ads_valid_df['mapped_medium'].apply(getmedium).apply(lambda x: ' '.join(x))
ads_valid_df['mapped_medium'].unique()

#Check if mapped_medium is empty (rows whose tactic could not be mapped)
ads_valid_df[ads_valid_df['mapped_medium'].apply(len) == 0]

Unnamed: 0,day_of_date,vendor,tactic,placement,creative_name,creative,impressions,clicks,spend,add_to_cart,lead_submission,proceed_to_checkout,whitepaper_download,conversions_adjusted,total_conversions,creative_run_date,mapped_vendor,mapped_medium


In [86]:
#Combine vendor and tactic to create a new column
# When the mapped vendor and mapped medium are the same label, use it once;
# otherwise join them as "<vendor>_<medium>". Spaces become underscores.
ads_valid_df['AdsVendorMedium'] = np.where(
    ads_valid_df['mapped_vendor'] == ads_valid_df['mapped_medium'],
    ads_valid_df['mapped_vendor'].str.replace(' ', '_', regex=False),
    ads_valid_df['mapped_vendor'].str.replace(' ', '_', regex=False)
    + '_'
    + ads_valid_df['mapped_medium'].str.replace(' ', '_', regex=False),
)

# Same idea on the raw labels: lower-case them and remove whitespace/underscores.
# regex=True is explicit because pandas >= 2.0 defaults to literal matching,
# which would leave the pattern r'\s|_' unmatched.
ads_valid_df['LC_AdsVendorMedium'] = np.where(
    ads_valid_df['vendor'] == ads_valid_df['tactic'],
    ads_valid_df['vendor'].str.lower().str.replace(r'\s|_', '', regex=True),
    ads_valid_df['vendor'].str.lower().str.replace(r'\s|_', '', regex=True)
    + '_'
    + ads_valid_df['tactic'].str.lower().str.replace(r'\s|_', '', regex=True),
)

#Replace dollar sign and comma in spend, impressions, clicks columns
replacements = {
   'spend': {r'(\$|,)': ''},
   'impressions' : {r'(,)': ''},
   'clicks' : {r'(,)': ''}
}
# Apply each pattern to its own column only. The previous
# `ads_valid_df.replace(replacements['spend'], ...)` form ran the pattern over
# every column of the frame, stripping '$' and ',' from unrelated text columns.
for column_name, pattern_map in replacements.items():
    ads_valid_df[column_name] = ads_valid_df[column_name].replace(pattern_map, regex=True)

ads_valid_df = ads_valid_df.astype(dtype= {"impressions":"int64","clicks":"int64","spend":"float64"})

ads_valid_df.columns

Index(['day_of_date', 'vendor', 'tactic', 'placement', 'creative_name',
       'creative', 'impressions', 'clicks', 'spend', 'add_to_cart',
       'lead_submission', 'proceed_to_checkout', 'whitepaper_download',
       'conversions_adjusted', 'total_conversions', 'creative_run_date',
       'mapped_vendor', 'mapped_medium', 'AdsVendorMedium',
       'LC_AdsVendorMedium'],
      dtype='object')

In [87]:
# Daily totals per run date and vendor/medium key. The metric columns are
# selected with a list: selecting with a bare tuple of names is rejected by
# current pandas releases.
ads_groupby_creative = (
    ads_valid_df
    .groupby(['creative_run_date', 'AdsVendorMedium', 'LC_AdsVendorMedium'], as_index=False)[
        [
            'spend',
            'add_to_cart',
            'lead_submission',
            'whitepaper_download',
            'conversions_adjusted',
            'total_conversions',
            'impressions',
            'clicks',
        ]
    ]
    .sum()
)

#get the first day of the month
# Casting to datetime64[M] truncates each date to its month.
ads_groupby_creative["creative_run_month"] = ads_groupby_creative['creative_run_date'].values.astype('datetime64[M]')

ads_groupby_creative.head()

Unnamed: 0,creative_run_date,AdsVendorMedium,LC_AdsVendorMedium,spend,add_to_cart,lead_submission,whitepaper_download,conversions_adjusted,total_conversions,impressions,clicks,creative_run_month
0,2018-01-01,Farm_Equipment_Showcase_Print_Ad,farmequipmentshowcase_printad,1997.0,0,0.0,0,0,0,0,0,2018-01-01
1,2018-01-01,Google_SEM,google_sem,283.0,0,1.0,0,1,1,7726,88,2018-01-01
2,2018-01-01,Inside_GNSS_Print_Ad,insidegnss_printad,0.0,0,0.0,0,0,0,0,0,2018-01-01
3,2018-01-01,Railway_Gazette_Online_Ad,railwaygazette_onlinead,0.0,0,0.0,0,0,0,10,0,2018-01-01
4,2018-01-01,Twitter_Social,twitter_social,49.0,0,0.0,0,0,0,5322,32,2018-01-01


In [88]:
#Get the ad by month
# Sum every metric per month and vendor/medium key. All aggregations are given
# as the string 'sum' (the builtin `sum` was mixed in before, which recent
# pandas versions warn about), and the columns are renamed by name rather than
# by position so the labels cannot drift if the column order changes.
ads_conversions_by_month_df = (
    ads_groupby_creative
    .groupby(['creative_run_month', 'AdsVendorMedium', 'LC_AdsVendorMedium'])
    .agg({
        'spend': 'sum',
        'add_to_cart': 'sum',
        'lead_submission': 'sum',
        'whitepaper_download': 'sum',
        'conversions_adjusted': 'sum',
        'total_conversions': 'sum',
        'impressions': 'sum',
        'clicks': 'sum',
    })
    .reset_index()
    .rename(columns={
        'creative_run_month': 'Ads Month',
        'AdsVendorMedium': 'Ads Vendor Medium',
        'LC_AdsVendorMedium': 'LC Ads Vendor Medium',
        'spend': 'Ads Spend',
        'add_to_cart': 'Add To Cart',
        'lead_submission': 'Lead Submission',
        'whitepaper_download': 'Whitepaper Download',
        'conversions_adjusted': 'Conversions Adjusted',
        'total_conversions': 'Total Conversions',
        'impressions': 'Impressions',
        'clicks': 'Clicks',
    })
)
ads_conversions_by_month_df.head()

Unnamed: 0,Ads Month,Ads Vendor Medium,LC Ads Vendor Medium,Ads Spend,Add To Cart,Lead Submission,Whitepaper Download,Conversions Adjusted,Total Conversions,Impressions,Clicks
0,2018-01-01,Bulletin_Media_Newsletter,bulletinmedia_enewsletter,0.0,0,0.0,0,0,0,152,0
1,2018-01-01,Farm_Equipment_Showcase_Print_Ad,farmequipmentshowcase_printad,1997.0,0,0.0,0,0,0,0,7291
2,2018-01-01,Google_GDN_Ad,google_gdnresponsiveads,1000.0,0,3.0,0,3,3,311582,1004
3,2018-01-01,Google_SEM,google_sem,9728.0,0,41.0,14,52,72,222372,3154
4,2018-01-01,IEEE_Newsletter,ieee_enewsletter,0.0,0,0.0,0,0,0,0,1


In [89]:
# Persist the monthly roll-up as CSV. The output folder is created first so the
# write does not fail on a fresh checkout where it does not exist yet.
ads_by_month_fp = 'Data/PathToPurchase/DataFrameFiles/AdsByMonth.csv'
os.makedirs(os.path.dirname(ads_by_month_fp), exist_ok=True)
ads_conversions_by_month_df.to_csv(ads_by_month_fp, sep=',', encoding='utf-8', index=False)

In [90]:
# Drop the intermediate frames built above; the names are unbound in the same
# left-to-right order as before.
del ads_valid_df, ads_groupby_creative, ads_conversions_by_month_df