**Generate dummy metadata for a file to illustrate the use of the metadata generator function.**

The general steps for generating the metadata are the following:

1. Import the metadata generator function from metadata_generator.
1. Define the data dictionary.
1. Define the metadata dictionary.
1. Generate metadata.

# Import required libraries

In [1]:
import metadata_generator as m_gen
from pyspark.sql import SparkSession
from numpy import nan

# Set spark session
import os

# Azure Blob Storage account and container that hold the data files.
BLOB_NAME = 'salrsuse2tcccmbrtest01'
BLOB_CONTAINER = 'media-house-data'

# The storage account access key is a secret and must never be committed to
# source control. It is read from the BLOB_KEY environment variable instead.
# NOTE: any key that has ever been committed to a repository or shared
# notebook should be considered compromised and rotated.
BLOB_KEY = os.environ.get('BLOB_KEY')
if not BLOB_KEY:
    raise RuntimeError(
        "Environment variable BLOB_KEY is not set; export the access key of "
        "storage account '{0}' before running this notebook.".format(BLOB_NAME)
    )

# Reuse an existing session if there is one; otherwise create it with the
# memory / shuffle settings below.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.memory.fraction", 0.8)
    .config("spark.executor.memory", "14g")
    .config("spark.driver.memory", "12g")
    .config("spark.sql.shuffle.partitions", "800")
    .getOrCreate()
)

# Register the account key on the session configuration under the
# per-account "fs.azure.account.key.<account>.blob.core.windows.net" option.
spark.conf.set(
    "fs.azure.account.key.{0}.blob.core.windows.net".format(BLOB_NAME),
    BLOB_KEY,
)

# Define data dictionary

In [8]:
description_dictionary = {'account_id': {'dtype': 'string',
  'description': 'Account ID',
  'primary_key': nan,
  'required': 'True'},
 'account_name': {'dtype': 'string',
  'description': nan,
  'primary_key': nan,
  'required': 'True'},
 'actions_post_engagement': {'dtype': 'float',
  'description': "All actions that people take involving the ads while they're running. (From Facebook Marketing API Documentation)",
  'primary_key': nan,
  'required': nan},
 'ad_id': {'dtype': 'string',
  'description': 'Ad ID',
  'primary_key': nan,
  'required': 'True'},
 'ad_name': {'dtype': 'string',
  'description': 'Name of the ad',
  'primary_key': nan,
  'required': 'True'},
 'age': {'dtype': 'string',
  'description': 'Age range',
  'primary_key': nan,
  'required': 'True'},
 'aggregation': {'dtype': 'string',
  'description': 'Type of time period for the data',
  'primary_key': nan,
  'required': 'True'},
 'campaign_name': {'dtype': 'string',
  'description': 'Name of the Campaign',
  'primary_key': nan,
  'required': 'True'},
 'campaign_id': {'dtype': 'string',
  'description': 'ID of the Campaign',
  'primary_key': nan,
  'required': 'True'},
 'date_start': {'dtype': 'date',
  'description': 'Start date of the data',
  'primary_key': nan,
  'required': 'True'},
 'date_stop': {'dtype': 'date',
  'description': 'End date of the data',
  'primary_key': nan,
  'required': 'True'},
 'gender': {'dtype': 'string',
  'description': 'Gender',
  'primary_key': nan,
  'required': 'True'},
 'rundate': {'dtype': 'date',
  'description': 'Date when extraction is excecuted',
  'primary_key': nan,
  'required': 'True'},
 'segment': {'dtype': 'string',
  'description': 'Type of breakdown for the data',
  'primary_key': nan,
  'required': 'True'},
 'spend': {'dtype': 'float',
  'description': 'Spend as reported by the source',
  'primary_key': nan,
  'required': 'True'},
 'cpc': {'dtype': 'float',
  'description': 'Cost per click',
  'primary_key': nan,
  'required': 'True'},
 'date': {'dtype': 'date',
  'description': 'Corresponding date of the data',
  'primary_key': nan,
  'required': 'True'},
 'period': {'dtype': 'string',
  'description': 'Minimum time period of aggregation for the data',
  'primary_key': nan,
  'required': 'True'},
 'platform': {'dtype': 'string',
  'description': 'Name of the social media platform where the Ad runs. This is inferred from the ad name.',
  'primary_key': nan,
  'required': 'True'},
 'brand': {'dtype': 'string',
  'description': 'Brand name inferred from the Account Name',
  'primary_key': nan,
  'required': 'True'},
 'level': {'dtype': 'string',
  'description': 'Object representing the lowest level in a hierarchy for the data',
  'primary_key': nan,
  'required': 'True'},
 'account_name': {'dtype': 'string',
  'description': 'Name of the Ad Account',
  'primary_key': nan,
  'required': 'True'},
 'cpm': {'dtype': 'float',
  'description': 'The average cost for 1,000 impressions. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'cpp': {'dtype': 'float',
  'description': 'The average cost to reach 1,000 people. This metric is estimated. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'ctr': {'dtype': 'float',
  'description': 'The percentage of times people saw your ad and performed a click (all). (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'estimated_ad_recall_rate': {'dtype': 'float',
  'description': 'The rate at which an estimated number of additional people, when asked, would remember seeing your ads within 2 days. (See Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'estimated_ad_recallers': {'dtype': 'float',
  'description': 'An estimate of the number of additional people who may remember seeing your ads, if asked, within 2 days. (See Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'frequency': {'dtype': 'float',
  'description': 'The average number of times each person saw your ad. This metric is estimated. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': 'True'},
 'impressions': {'dtype': 'float',
  'description': 'The number of times your ads were on screen. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': 'True'},
 'objective': {'dtype': 'string',
  'description': 'The objective you selected for your campaign. Your objective reflects the goal you want to achieve with your advertising. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'reach': {'dtype': 'float',
  'description': 'The number of people who saw your ads at least once. Reach is different from impressions, which may include multiple views of your ads by the same people. This metric is estimated. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': 'True'},
 'date_requested': {'dtype': 'string',
  'description': 'Deprecated. See rundate.',
  'primary_key': nan,
  'required': nan},
 'clicks': {'dtype': 'float',
  'description': 'The number of clicks on your ads. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': 'True'},
 'unique_clicks': {'dtype': 'float',
  'description': 'The number of people who performed a click (all). This metric is estimated. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': 'True'},
 'unique_ctr': {'dtype': 'float',
  'description': 'The percentage of people who saw your ad and performed a unique click (all). This metric is estimated. (From Facebook Marketing API Documentation)',
  'primary_key': nan,
  'required': nan},
 'campaign_start_time':{'dtype': 'date',
  'description': 'Start date for the corresponding campaign.',
  'primary_key': nan,
  'required': nan},
 'campaign_stop_time':{'dtype': 'date',
  'description': 'End date for the corresponding campaign.',
  'primary_key': nan,
  'required':nan}
                         }

# Define metadata dictionary

In [9]:
# The name of the data file
title = 'ad_fourweeks_age_gender'

# Location of the data file the metadata is generated for. The storage
# account and container come from BLOB_NAME / BLOB_CONTAINER so the path
# always points at the same account whose access key was registered on the
# Spark session.
file = (
    'wasbs://{BLOB_CONTAINER}@{BLOB_NAME}.blob.core.windows.net'
    '/facebook_ads/processed/historic/{DATE}/{title}/parquet'
)
file = file.format(
    BLOB_CONTAINER=BLOB_CONTAINER,
    BLOB_NAME=BLOB_NAME,
    DATE='20190401',
    title=title,
)

############### DEFINE METADATA FIELDS ############################################

# Metadata terms and their values for this file. Keys prefixed with 'dct.'
# and 'opi.' are passed as-is to the metadata generator.
# NOTE(review): 'dct.mediator' holds an e-mail address while
# 'opi.mediatorContact' holds a person's name - the reverse of the
# 'dct.publisher' / 'opi.publisherContact' pair. Confirm which is intended.
# NOTE(review): 'dct.created' is still the 'yyyy-mm-dd' placeholder; presumably
# it should be replaced with the real creation date - confirm.
metadata_fields = {
    'dct.accrualMethod': 'append',
    'dct.description': 'Facebook Paid ad data by age gender aggregated by four weeks',
    'dct.format': 'parquet',
    'opi.dataDictionaryPath': 'path_to_the_data_dictionary specified by user.',
    'dct.temporal': 'Four weeks',
    'dct.title': 'facebook_' + title,
    'dct.accessRights': 'Access only for The Coca Cola Company and OPI Analytics',
    'dct.audience': 'Coca Cola Media House',
    'dct.spatial': 'Mexico',
    'opi.mediatorContact': 'Alejandra López',
    'dct.mediator': 'alelopez@coca-cola.com',
    'dct.publisher': 'OPI Analytics',
    'opi.publisherContact': 'ko@opianalytics.com',
    'dct.language': 'English',
    'dct.subject': 'Digital Media Marketing',
    'dct.accrualPeriodicity': 'weekly on Monday',
    'dct.created': 'yyyy-mm-dd',
}

###################################################################################

# Generate metadata files

In [10]:
############## GENERATE THE FILES FOR METADATA AND DATA DICTIONARY #################

# The generator receives the file path, the metadata terms, the column
# descriptions and the Spark session, and returns two objects: the metadata
# and the data dictionary.
metadata, data_dictionary = m_gen.generate_metadata(
    file,
    metadata_fields,
    description_dictionary,
    spark,
)

# Optional: columns left without a description get a generic pointer to the
# API documentation instead of a missing value.
default_description = 'See Facebook Marketing API Documentation: https://developers.facebook.com/docs/marketing-api/insights/parameters/v3.1'
data_dictionary['description'] = data_dictionary.description.fillna(default_description)

# Render both results, separated by a few blank lines.
display(metadata)
print('\n\n')
display(data_dictionary)

Unnamed: 0,term,value
0,dct.accessRights,Access only for The Coca Cola Company and OPI ...
1,dct.accrualMethod,append
2,dct.accrualPeriodicity,weekly on Monday
3,dct.audience,Coca Cola Media House
4,dct.mediator,alelopez@coca-cola.com
5,dct.created,yyyy-mm-dd
6,dct.description,Facebook Paid ad data by age gender aggregated...
7,dct.format,parquet
8,dct.identifier,
9,dct.language,English







Unnamed: 0,column,dtype,description,primary_key,required
0,account_id,string,Account ID,,True
1,campaign_id,string,ID of the Campaign,,True
2,account_name,string,Name of the Ad Account,,True
3,actions_link_click,float,See Facebook Marketing API Documentation: http...,,
4,actions_page_engagement,float,See Facebook Marketing API Documentation: http...,,
5,actions_post_engagement,float,All actions that people take involving the ads...,,
6,actions_post_reaction,float,See Facebook Marketing API Documentation: http...,,
7,ad_id,string,Ad ID,,True
8,ad_name,string,Name of the ad,,True
9,age,string,Age range,,True
