In [5]:
import requests, os
from dotenv import load_dotenv

load_dotenv()

# Define constants
PROJECT_ID = os.getenv("PROJECT_ID")
REGION = os.getenv("REGION")
API_TOKEN = os.getenv("API_TOKEN")

# Define endpoints
REGISTRATION_DATA_URL = f"https://{REGION}-{PROJECT_ID}.cloudfunctions.net/getRegistrationData"
DEMOGRAPHIC_DATA_URL = f"https://{REGION}-{PROJECT_ID}.cloudfunctions.net/getDemographicData"

# Headers with API token for authentication
HEADERS = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json"
}

def fetch_all_data(url):
    """Fetches all data from the given URL, handling pagination."""
    all_data = []
    page_token = None

    while True:
        params = {"limit": 200}  # Adjust batch size as needed
        if page_token:
            params["pageToken"] = page_token

        response = requests.get(url, headers=HEADERS, params=params)

        if response.status_code != 200:
            print(f"Error fetching data from {url}: {response.status_code} - {response.text}")
            break

        data = response.json()
        all_data.extend(data.get("bookings", []))  # Adjust key based on endpoint response format

        # Check for next page
        page_token = data.get("nextPageToken")
        if not page_token:
            print(f"No more pages to fetch from {url}")
            break

    return all_data

In [6]:
registration_data = fetch_all_data(REGISTRATION_DATA_URL)
print(f"Fetched {len(registration_data)} registration records.")

No more pages to fetch from https://us-central1-boston-family-days---prod.cloudfunctions.net/getRegistrationData
Fetched 8017 registration records.


In [7]:
demographic_data = fetch_all_data(DEMOGRAPHIC_DATA_URL)
print(f"Fetched {len(demographic_data)} demographic records.")

No more pages to fetch from https://us-central1-boston-family-days---prod.cloudfunctions.net/getDemographicData
Fetched 5962 demographic records.


In [8]:
import pandas as pd

# Convert the fetched data into DataFrames
registration_df = pd.DataFrame(registration_data)
demographic_df = pd.DataFrame(demographic_data)

# Perform an outer join on the 'passId' column
merged_df = pd.merge(registration_df, demographic_df, on='passId', how='outer')

# Normalize email addresses by trimming spaces and converting to lowercase
merged_df['email'] = merged_df['email'].str.strip().str.lower()

# 1. Distribution of preferred languages (default to 'english' if empty)
merged_df['preferredCommunicationLanguage'] = merged_df['preferredCommunicationLanguage'].fillna('english')
language_counts = merged_df['preferredCommunicationLanguage'].value_counts()
language_percentages = (language_counts / language_counts.sum() * 100).round(1)
language_distribution = pd.DataFrame({
    'Count': language_counts,
    'Percentage': language_percentages
})
print("Preferred Language Distribution:")
print(language_distribution)

# 2. Count and percentage of duplicate students (same first and last name) for different email addresses
duplicate_students = merged_df.groupby(['firstName', 'lastName']).email.nunique()
duplicate_students_count = duplicate_students[duplicate_students > 1].sum()  # Total number of duplicate entries
total_entries = len(merged_df)
duplicate_students_percentage = ((duplicate_students_count * 2 / total_entries) * 100).round(1)
print("\nNumber of Duplicate Student Entries (same first and last name, different emails):")
print(f"Count: {duplicate_students_count}, Percentage: {duplicate_students_percentage:.1f}%")

# 3. Number and percentage of students under the same email addresses
students_per_email = merged_df.groupby('email').size()
email_distribution_counts = students_per_email.value_counts().sort_index()
email_distribution_percentages = (email_distribution_counts / email_distribution_counts.sum() * 100).round(1)
email_distribution = pd.DataFrame({
    'Count': email_distribution_counts,
    'Percentage': email_distribution_percentages
})
print("\nNumber of Students per Email Address:")
print(email_distribution)

Preferred Language Distribution:
                                Count  Percentage
preferredCommunicationLanguage                   
english                          7830        97.7
spanish-latin-american            135         1.7
mandarin                           20         0.2
haitian-creole                     11         0.1
portuguese-brazilian                7         0.1
cantonese                           5         0.1
vietnamese                          4         0.0
french-european                     2         0.0
arabic-standard                     1         0.0
somali                              1         0.0
cabo-verdean-creole                 1         0.0

Number of Duplicate Student Entries (same first and last name, different emails):
Count: 139, Percentage: 3.5%

Number of Students per Email Address:
   Count  Percentage
1   4244        72.2
2   1258        21.4
3    280         4.8
4     69         1.2
5     12         0.2
6      7         0.1
7      2         0.