**Description**     
This notebook contains the code that accesses the Google Analytics Data API and downloads the data.  
For each website, the downloaded data is checked, processed and saved as a JSON file.  

**Required libraries**

In [63]:
# Save files
import json
import os

# Google Analytics Modules and Libraries
from google.oauth2 import service_account 
from google.analytics.data_v1beta import BetaAnalyticsDataClient 
from google.analytics.data_v1beta.types import ( 
    DateRange,
    Dimension,
    Metric,
    MetricType,
    RunReportRequest,
    OrderBy,
)

**API access credentials and website IDs**

In [64]:
# Path to the login file (JSON service account key), given relative to the
# directory the notebook is run from
key_file = '../1_access_data/data-project-438907-2e1111c8bd46.json'

# Property ID (22 websites)
# The loaded object is iterated with .items() in the extraction loop: each key
# ends up in the output file name and each value is passed to get_report() as
# the property ID.
with open('../1_access_data/websites_id.json', 'r', encoding='utf-8') as json_file:
    websites_id = json.load(json_file)

# Authentication by service account, requesting only the read-only Analytics scope
scopes = ['https://www.googleapis.com/auth/analytics.readonly']
credentials = service_account.Credentials.from_service_account_file(key_file, scopes=scopes)

# Service API initialization: this client is the one handed to get_report()
client = BetaAnalyticsDataClient(credentials=credentials)

**Input request parameters**  

In [65]:
# Reporting period (YYYY-MM-DD); both dates are also reused in the output file names
start_date = '2023-11-01'
end_date = '2024-11-30'
date_range = [DateRange(start_date=start_date, end_date=end_date)]

# Dimensions the report is broken down by
dimensions = [Dimension(name=dimension_name) for dimension_name in ('year', 'month')]

# Metrics requested for every year/month combination
metrics = [
    Metric(name=metric_name)
    for metric_name in (
        'activeUsers',             # distinct users who visited the website during the period
        'newUsers',                # users who interacted with the website for the first time during the period
        'sessions',                # total number of individual sessions initiated by users during the period
        'sessionsPerUser',         # average number of sessions per user within the period
        'screenPageViews',         # page views during the period (repeated views of a single page are counted)
        'engagedSessions',         # sessions that lasted longer than 10 seconds, or had a key event, or had 2 or more screen views
        'averageSessionDuration',  # average time users spend in a session, measured in seconds
    )
]

# Order of the returned rows: ascending by year, then ascending by month
order_by = [
    OrderBy(dimension={'dimension_name': dimension_name}, desc=False)
    for dimension_name in ('year', 'month')
]

**Data extraction**

*1) Function definitions*

In [66]:
def get_report(client, id):
    """Build a report request for one property and send it to the API.

    The request combines the given property with the notebook-level
    ``date_range``, ``dimensions``, ``metrics`` and ``order_by`` settings,
    so those must be defined before this function is called.

    Args:
        client: API client exposing ``run_report(request)``.
        id: Property identifier; inserted into the ``properties/{id}``
            resource name.

    Returns:
        The object returned by ``client.run_report`` for the request.
    """
    request_fields = {
        'property': f'properties/{id}',
        'date_ranges': date_range,
        'dimensions': dimensions,
        'metrics': metrics,
        'order_bys': order_by,
    }
    report_request = RunReportRequest(**request_fields)
    return client.run_report(report_request)

In [67]:
def check_response(response):
    """Print a short overview of an API response for a visual sanity check.

    Written to standard output, in this order: the reported row count, the
    name of every dimension header, the name of every metric header, and the
    dimension and metric values of at most the first five rows.

    Args:
        response: Object exposing ``row_count``, ``dimension_headers`` and
            ``metric_headers`` (items with a ``name``), and ``rows`` (items
            with ``dimension_values`` / ``metric_values`` whose entries have
            a ``value``).

    Returns:
        None.
    """
    def _values(cells):
        # Unwrap the plain value of every cell in a row segment.
        return [cell.value for cell in cells]

    print('The response rowcount: ', response.row_count, '\n')

    print('The response dimension headers:')
    for dimension_header in response.dimension_headers:
        print(' ', dimension_header.name)

    print('\nThe API response metric headers:')
    for metric_header in response.metric_headers:
        print(' ', metric_header.name)

    print('\nSample data rows:')
    # Only a preview: the slice caps the listing at five rows.
    for sample_row in response.rows[:5]:
        print(
            ' Dimensions:', _values(sample_row.dimension_values),
            ' Metrics:', _values(sample_row.metric_values),
        )

In [68]:
def process_response(response):
    """Flatten an API response into one dictionary per row.

    Every dictionary maps header names to that row's values: the dimension
    headers come first, followed by the metric headers. Headers and values
    are paired by position, and each value is taken unchanged from the
    entry's ``value`` attribute.

    Args:
        response: Object exposing ``dimension_headers`` and
            ``metric_headers`` (items with a ``name``) and ``rows`` (items
            with ``dimension_values`` / ``metric_values`` whose entries have
            a ``value``).

    Returns:
        list[dict]: One dictionary per entry of ``response.rows``, in the
        same order.
    """
    dimension_count = len(response.dimension_headers)
    metric_count = len(response.metric_headers)

    def _flatten(row):
        # Dimensions are inserted first so they lead the key order.
        record = {
            response.dimension_headers[pos].name: row.dimension_values[pos].value
            for pos in range(dimension_count)
        }
        # Metrics follow; a metric sharing a name with a dimension overwrites it.
        record.update(
            (response.metric_headers[pos].name, row.metric_values[pos].value)
            for pos in range(metric_count)
        )
        return record

    return [_flatten(row) for row in response.rows]

In [69]:
def save_raw_data(response, file_path):
    """Write the raw data to a JSON file.

    The file at ``file_path`` is created or overwritten, encoded as UTF-8 and
    indented by four spaces. Objects the JSON encoder cannot serialise on its
    own are written as their ``str()`` representation.

    Args:
        response: Data to serialise; in this notebook it is the list of row
            dictionaries produced by ``process_response``.
        file_path: Destination path of the JSON file.

    Returns:
        None.
    """
    with open(file_path, 'w', encoding='utf-8') as json_out:
        json.dump(response, json_out, indent=4, default=str)

In [70]:
def check_file_saved(file_path):
    """Report whether a file exists at ``file_path`` and is not empty.

    Prints a success message when the path exists and its size is greater
    than zero bytes, and an error message otherwise. Nothing is raised for a
    missing path and nothing is returned.

    Args:
        file_path: Path of the file to verify.
    """
    # The size is only queried when the path exists, so a missing file is safe.
    missing_or_empty = not os.path.exists(file_path) or os.path.getsize(file_path) <= 0
    if missing_or_empty:
        print(f'Error saving file: {file_path}')
        return
    print(f'File {file_path} saved successfully.')

*2) Getting data from all our websites*

In [None]:
# Download, inspect and store one report per entry of websites_id.
# Each key is printed and used in the output file name; each value is passed
# to get_report() as the property ID.
# NOTE: there is no error handling here, so an exception raised by any step
# stops the loop and the remaining entries are not processed.
for key, value in websites_id.items():
    # Get API response
    response = get_report(client, value)

    # Check response: print the key, then an overview of what the API returned
    print(f'\n{key}')
    check_response(response)

    # Response processing: flatten the rows into a list of dictionaries
    raw_data = process_response(response)

    # Save to JSON: the file name combines the key with the requested date range.
    # The target directory is created if needed; exist_ok=True makes the call
    # harmless on later iterations.
    save_path = f'../3_data/raw/{key}-{start_date}-{end_date}.json'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    save_raw_data(raw_data, save_path)

    # File save check: prints whether the file exists and is non-empty
    check_file_saved(save_path)