**Description**     
In this notebook, I process the data obtained from the Google Analytics API.  
I create a DataFrame from each JSON file; every file contains traffic data for one website.  
I then combine the individual DataFrames into a single DataFrame, which I save to a new JSON file.  

**Required libraries**

In [79]:
import pandas as pd
import json
import os
import re

**Data processing**

*1) Function definitions*

In [80]:
def make_df(data):
    """Build a DataFrame from raw records and normalise names and dtypes.

    The camelCase metric columns are renamed to snake_case, after which
    every expected column is cast to an integer or float dtype. A missing
    expected column raises KeyError; a value that cannot be converted
    raises ValueError.
    """
    # camelCase names as present in the input -> snake_case names used here
    snake_case_names = {
        'activeUsers': 'active_users',
        'newUsers': 'new_users',
        'sessionsPerUser': 'sessions_per_user',
        'screenPageViews': 'screen_page_views',
        'engagedSessions': 'engaged_sessions',
        'averageSessionDuration': 'average_session_duration',
    }

    # Target dtype for each column (keys are the already-renamed names)
    column_dtypes = {
        'year': int,
        'month': int,
        'active_users': int,
        'new_users': int,
        'sessions': int,
        'sessions_per_user': float,
        'screen_page_views': int,
        'engaged_sessions': int,
        'average_session_duration': float,
    }

    renamed = pd.DataFrame(data).rename(columns=snake_case_names)
    return renamed.astype(column_dtypes)

In [81]:
def filename_extraction(filename):
    """Return the .cz/.sk domain name embedded in a data filename.

    The filename must contain a part shaped like
    ``<domain>-YYYY-MM-DD-YYYY-MM-DD.json`` where ``<domain>`` ends in
    ``.cz`` or ``.sk``. If no such part is found, a message is printed
    and None is returned.
    """
    # Group 1 captures the domain; the two dash-separated dates and the
    # .json suffix must follow it directly.
    found = re.search(
        r'([a-zA-Z0-9\.\-]+(?:\.cz|\.sk))-\d{4}-\d{2}-\d{2}-\d{4}-\d{2}-\d{2}\.json',
        filename,
    )

    if found is None:
        print("Website not found in filename:", filename)
        return None

    return found.group(1)

*2) Start of processing*

In [82]:
# Path to the data folder
folder_path = '../3_data/raw'

# List of JSON files in the folder.
# os.listdir returns every entry of the folder in arbitrary order, so stray
# non-JSON entries (e.g. .DS_Store, .ipynb_checkpoints) would break
# json.load further down and the row order of the combined output could
# differ between runs. Keep only *.json names and sort them.
files = sorted(
    entry for entry in os.listdir(folder_path)
    if entry.lower().endswith('.json')
)

In [83]:
# Initialize empty list to store DataFrames
df_list = []

# Iteration through individual files
for filename in files:
    file_path = os.path.join(folder_path, filename)

    # Only parsing needs the file handle, so it is closed before processing
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Create a DataFrame from the data
    df = make_df(data)

    # Extract website from filename
    website = filename_extraction(filename)

    # Add the extracted domain name to the DataFrame.
    # Files without a recognisable domain are still kept; their rows end up
    # with a missing 'website' value after concatenation.
    if website:
        df.insert(0, 'website', website)

    # Append the DataFrame to the list
    df_list.append(df)

# pd.concat fails with an uninformative message on an empty list,
# so report which folder yielded no data instead.
if not df_list:
    raise ValueError(f"No data files found in {folder_path!r}")

# Concatenate all DataFrames into a single one
final_df = pd.concat(df_list, ignore_index=True)

# Saving processed data (the output folder is created if it does not exist)
output_path = "../3_data/processed_data/df_analytics.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_json(output_path, orient="records", indent=4)