In [1]:
### import libraries
import argparse
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import requests

In [16]:
### credentials
# The token sent as `access_token` with every ads_archive request.
# It is read from the FB_ACCESS_TOKEN environment variable so that a real
# token never has to be saved inside the notebook. When the variable is unset
# or empty, the original placeholder string is used instead.
ACCESS_TOKEN = os.environ.get("FB_ACCESS_TOKEN") or "INPUT TOKEN HERE"

In [3]:
### functions

def calculate_impressions_midpoint(data):
    """Return the midpoint of an ad's impressions range as an integer.

    ``data`` is indexed as ``data["impressions"]["upper_bound"]`` and
    ``data["impressions"]["lower_bound"]``; both values are passed through
    ``int()`` before being averaged. The average is rounded with the built-in
    ``round()``, which sends an exact ``.5`` to the nearest even integer.
    """
    bounds = data["impressions"]
    high = int(bounds["upper_bound"])
    low = int(bounds["lower_bound"])
    midpoint = (high + low) / 2
    return round(midpoint)

def calculate_impressions_by_region(data, impressions):
    """Split an impressions total across the regions an ad was delivered to.

    Each entry of ``data["delivery_by_region"]`` supplies a ``"region"`` name
    and a ``"percentage"`` that is converted with ``float()`` and multiplied
    by ``impressions``. Returns a dict mapping region name to the rounded
    product (built-in ``round()``); if a region appears more than once, the
    last entry wins.
    """
    return {
        entry["region"]: round(impressions * float(entry["percentage"]))
        for entry in data["delivery_by_region"]
    }

def calculate_impressions_by_gender(data, impressions):
    """Total an ad's impressions per gender.

    The returned dict always has the keys "male", "female" and "unknown",
    each starting at 0. Every entry of ``data["demographic_distribution"]``
    adds ``round(impressions * float(entry["percentage"]))`` to the total for
    ``entry["gender"]``. Rounding uses the built-in ``round()`` (nearest
    integer, exact halves go to the even neighbour). A gender outside the
    three known keys raises ``KeyError``.
    """
    totals = dict.fromkeys(("male", "female", "unknown"), 0)

    for entry in data["demographic_distribution"]:
        gender = entry["gender"]
        # Percentages are converted with float() before scaling the total.
        share = float(entry["percentage"])
        totals[gender] = totals[gender] + round(impressions * share)

    return totals

def calculate_impressions_by_age(data, impressions):
    """Total an ad's impressions per age bracket.

    Every entry of ``data["demographic_distribution"]`` adds
    ``round(impressions * float(entry["percentage"]))`` to the total for
    ``entry["age"]``. Rounding uses the built-in ``round()`` (nearest integer,
    exact halves go to the even neighbour).

    Returns a ``defaultdict(int)``, so an age bracket that never appeared in
    the data reads as 0 instead of raising ``KeyError``.
    """
    # int() with no arguments returns 0, which makes it the default total.
    totals = defaultdict(int)

    for entry in data["demographic_distribution"]:
        bracket, share = entry["age"], float(entry["percentage"])
        totals[bracket] += round(impressions * share)

    return totals

In [25]:
### list of page IDs
# Each ID is kept as a string; the download loop sends them one at a time as
# the `search_page_ids` request parameter.
idList = [
    '106039214814684',
    '102281724942742',
    '738063612887865',
    '591566840920364',
    '49560242814',
    '101691091213750',
    '100801038449520',
    '111394533709201',
    '107500120800840',
    '108203188195224',
    '482100658584410',
    '292970844058835',
    '101242238726088',
    '341751646428117',
    '396341921119746',
    '237209147160346',
]

In [26]:
"""Entrypoint of the program"""

# Download every ad for every page ID in idList and load the combined records
# into pandas.
#
# Results of this cell:
#   data - list of raw ad records (dicts) from all pages of all page IDs
#   df   - the records flattened by pd.json_normalize, left untouched
#   res  - a working copy of df that the cleaning cells below modify

# Store the paginated data in here
data = []

for i in idList:
    print(i)
    response = requests.get("https://graph.facebook.com/v5.0/ads_archive", params={
        "access_token": ACCESS_TOKEN,
        "ad_type": "POLITICAL_AND_ISSUE_ADS",
        "ad_active_status": "ALL",
        "search_page_ids": i,
        "ad_reached_countries": ["US"],
        "ad_delivery_date_min": "2018-05-24",
        "ad_delivery_date_max": "2022-09-20",
        "fields": "id, ad_delivery_start_time, ad_delivery_stop_time, ad_snapshot_url, bylines, delivery_by_region, demographic_distribution, impressions, publisher_platforms, spend, ad_creative_bodies, ad_creative_link_captions, ad_creative_link_descriptions, ad_creative_link_titles, page_name, page_id"
    })

    # Follow the pagination links until a response has no "next" link.
    while True:
        # Named `payload` (not `json`) so the imported json module stays usable.
        payload = response.json()

        if "data" not in payload:
            # A response without "data" cannot contribute records; report it
            # instead of skipping the page ID silently.
            print("\nNo data returned for page", i, "-", payload.get("error", payload))
            break

        # Keep this page's records BEFORE looking for the next link, so that
        # single-page results and the final page are not lost.
        data.extend(payload["data"])

        next_link = payload.get("paging", {}).get("next")
        if not next_link:
            break

        print('.', end='')
        response = requests.get(next_link)
    print('\n')

# Flatten the nested records once, after everything has been downloaded.
# (DataFrame.append returned a new frame rather than modifying `df`, and was
# removed in pandas 2.0, so it is not used here.)
df = pd.json_normalize(data)
res = df.copy()


106039214814684
....

102281724942742
............................................

738063612887865
.................................................................................................................

591566840920364
............................................

49560242814
.......................................................................................

101691091213750
.......................................................................................

100801038449520
...................................................

111394533709201
.

107500120800840
...

108203188195224
......

482100658584410
.......................................

292970844058835
.

101242238726088
........

341751646428117
..

396341921119746
......

237209147160346
..................



### Clean & Export Data

### For Summary Table

In [27]:
# Ads with no recorded stop time get the placeholder string '999'.
# To list the affected rows before filling, run:
#   res.loc[res['ad_delivery_stop_time'].isna(), 'ad_delivery_stop_time']
stop_time = res['ad_delivery_stop_time']
res['ad_delivery_stop_time'] = stop_time.fillna(value='999')

In [28]:
# Add 'ad_start_month' and 'ad_stop_month' columns.
# The month is the second '-'-separated piece of the timestamp string.
start_pieces = res['ad_delivery_start_time'].str.split('-')
res['ad_start_month'] = start_pieces.str[1].astype(int)

# The '999' placeholder has no month piece, so it is passed through unchanged
# and ends up as the integer 999.
stop_month_text = res['ad_delivery_stop_time'].apply(
    lambda stamp: stamp if stamp == '999' else stamp.split('-')[1]
)
res['ad_stop_month'] = stop_month_text.astype(int)

In [29]:
# Cast both spend bounds to integers.
lower_spend = res['spend.lower_bound'].astype(int)
upper_spend = res['spend.upper_bound'].astype(int)

res['spend.lower_bound'] = lower_spend
# round(-2) rounds the upper bound to the nearest hundred.
res['spend.upper_bound'] = upper_spend.round(-2)

In [30]:
# Put the 'bylines' text into Title Case.
bylines_title_case = res['bylines'].str.title()
res['bylines'] = bylines_title_case

In [31]:
# view data: summed lower and upper spend bounds per page, with the columns
# renamed to name / lowerAmount / upperAmount
(
    res[['page_name', 'id', 'spend.lower_bound', 'spend.upper_bound']]
    .groupby('page_name')
    .agg({'spend.lower_bound': 'sum', 'spend.upper_bound': 'sum'})
    .reset_index()
    .rename(columns={
        'page_name': 'name',
        'spend.lower_bound': 'lowerAmount',
        'spend.upper_bound': 'upperAmount',
    })
)

Unnamed: 0,name,lowerAmount,upperAmount
0,Affordable Energy Coalition,32400,48000
1,Affordable Energy for New Jersey,46300,157400
2,Alliance for Michigan Power,151900,450100
3,Californians for Balanced Energy Solutions,12500,19000
4,Citizens Energizing Michigan's Economy,231100,372100
5,Consumer Energy Alliance,571500,902100
6,Jobkeeper Alliance,1500,2000
7,Natural Allies for a Clean Energy Future,493700,703500
8,New Yorkers For Affordable Energy,7200,8500
9,Partnership for Energy Progress,28300,40600


In [32]:
# Per-page summary: number of ads plus summed lower/upper spend bounds,
# ordered from the largest upper amount to the smallest.
resjson = (
    res[['page_name', 'id', 'spend.lower_bound', 'spend.upper_bound']]
    .groupby('page_name')
    .agg(
        ads=('id', 'count'),
        lowerAmount=('spend.lower_bound', 'sum'),
        upperAmount=('spend.upper_bound', 'sum'),
    )
    .reset_index()
    .rename(columns={'page_name': 'name'})
    .sort_values(by="upperAmount", ascending=False)
    .reset_index(drop=True)
)

In [34]:
# Write the per-page summary to disk as a JSON array with one object per row.
resjson.to_json(path_or_buf='../data/summary.json', orient='records')

### For Individual Tables

In [35]:
# Unpack the dataset to access region data.
# explode() gives every element of 'delivery_by_region' its own row, and
# apply(pd.Series) spreads each element's fields into separate columns
# (the cells below use the resulting 'region' and 'percentage' columns).
exploded = res.explode('delivery_by_region')
region_fields = exploded['delivery_by_region'].apply(pd.Series)

result = pd.concat(
    [exploded.drop(columns=['delivery_by_region']), region_fields],
    axis=1,
)

In [36]:
# Make the regional share numeric so it can be multiplied with the spend bounds.
region_share = result['percentage']
result['percentage'] = region_share.astype('float64')

In [37]:
# Lower and upper amount spent per region:
# each spend bound scaled by the region's share of the ad's delivery.
share = result['percentage']
result['lowerAmount'] = result['spend.lower_bound'].mul(share)
result['upperAmount'] = result['spend.upper_bound'].mul(share)

In [38]:
# view data: lower and upper amounts summed per page and region.
# sum() takes no column list (its first positional parameter is
# `numeric_only`); the columns to total are already chosen by the selection
# on the first line of the chain.
(
    result[['page_name', 'region', 'lowerAmount', 'upperAmount']]
    .groupby(['page_name', 'region'])
    .sum()
    .reset_index()
    .rename(columns={'page_name': 'name'})
)

Unnamed: 0,name,region,lowerAmount,upperAmount
0,Affordable Energy Coalition,British Columbia,0.0940,0.1410
1,Affordable Energy Coalition,California,0.1195,0.1337
2,Affordable Energy Coalition,Idaho,38.5210,56.2519
3,Affordable Energy Coalition,Indiana,0.0700,0.1050
4,Affordable Energy Coalition,Washington,32361.1973,47743.3708
...,...,...,...,...
526,We Stand For Energy,Washington,416.9969,976.9276
527,We Stand For Energy,"Washington, District of Columbia",761.9531,1616.7526
528,We Stand For Energy,West Virginia,409.7205,824.7951
529,We Stand For Energy,Wisconsin,350.5035,758.5885


In [39]:
# Export the lower and upper amounts summed per page and region as a JSON
# array with one object per row.
# sum() takes no column list (its first positional parameter is
# `numeric_only`); the columns to total are already chosen by the selection
# on the first line of the chain.
(
    result[['page_name', 'region', 'lowerAmount', 'upperAmount']]
    .groupby(['page_name', 'region'])
    .sum()
    .reset_index()
    .rename(columns={'page_name': 'name'})
    .to_json('../data/individual.json', orient='records')
)