In [None]:
import json
import os
import pandas as pd
import requests
import threading
import time
from dotenv import load_dotenv
from mixpanel import Mixpanel

# ------------
# INSTRUCTIONS
#
# Set the from_date and to_date variables to define the date range of data you want to load.
# Set client_agency to keep only events for one agency (use '' to keep events for all agencies).
# Then, run this cell. The first time you run it, it fetches all the Mixpanel events
# in that date range and saves the raw response to a JSON file in /app/analytics/.
# The next time you run this cell, it loads from that JSON file instead (unless you change the dates).
#
# ------------
# Helpful Dates
#
# Completed pilots:
#   LA LWC May run:           2025-05-18 - 2025-06-30
#   AZ Constrained MAC Pilot: 2025-06-13 - 2025-08-13 7:29pm MST
# Incomplete pilots:
#   LA LWC August Run:        2025-08-17            - ??? Final day to query before publish date of report
#   AZ Expanded MAC Pilot:    2025-08-13 7:30pm MST - ??? Final day to query before publish date of report
#
# -------------

# User-tunable settings: the export window and the agency whose events to keep.
from_date = '2025-05-18'
to_date = '2025-06-30'
client_agency = 'la_ldh'


def _require_env(name):
    """Return the stripped value of the environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or blank. Failing here gives a
            readable message instead of the AttributeError that calling
            `.strip()` on a missing (None) value would produce.
    """
    value = os.getenv(name)
    if value is None or not value.strip():
        raise RuntimeError(
            f"Missing required environment variable {name}. "
            f"Set it in {dotenv_path} or in the environment before running this cell."
        )
    return value.strip()


# Credentials are read from `.env.local` in the parent of the current working directory.
project_root = os.path.dirname(os.getcwd())
dotenv_path = os.path.join(project_root, '.env.local')
load_dotenv(dotenv_path=dotenv_path)
SA_USERNAME = _require_env("MIXPANEL_SERVICE_ACCOUNT_USERNAME")
SA_SECRET = _require_env("MIXPANEL_SERVICE_ACCOUNT_SECRET")
PROJECT_ID = _require_env("MIXPANEL_PROJECT_ID")

# Endpoint
API_ENDPOINT = 'https://data.mixpanel.com/api/2.0/export'
headers = {"accept": "text/plain"}

# Local cache file (keyed by the date range) and the request parameters.
file_name = f"mixpanel_data_{from_date}_to_{to_date}.json"
params = {
    'from_date': from_date,
    'to_date': to_date,
    'project_id': PROJECT_ID
}

# Spinner state: the main thread flips this flag to start/stop the animation.
is_loading = False

def spinning_cursor():
    """Animate a one-character spinner on the current line while `is_loading` is true.

    The flag is re-read only after a full pass over the four frames, so the
    spinner may keep going for up to one cycle after the flag is cleared.
    When the loop ends, the spinner line is blanked out.
    """
    frames = ('|', '/', '-', '\\')
    frame_delay = 0.1
    while is_loading:
        for frame in frames:
            print("\r" + frame, end="", flush=True)
            time.sleep(frame_delay)
    # Overwrite whatever is left on the spinner line and return the cursor to column 0.
    blank_line = "\r" + " " * 20 + "\r"
    print(blank_line, end="", flush=True)

# Load the raw export text: from the local cache file if it exists, otherwise
# from the Mixpanel export API (and then cache it for the next run).
# In both branches the spinner thread is stopped in a `finally`, so a failure
# never leaves it animating in the background.
if os.path.exists(file_name):
    print(f"Loading data from local file: {file_name}")
    is_loading = True
    spinner_thread = threading.Thread(target=spinning_cursor)
    spinner_thread.start()
    try:
        with open(file_name, 'r', encoding='utf-8') as f:
            raw_response_text = f.read()
    finally:
        is_loading = False
        spinner_thread.join()
else:
    print("Fetching data from Mixpanel...")
    is_loading = True
    spinner_thread = threading.Thread(target=spinning_cursor)
    spinner_thread.start()

    try:
        try:
            # Make the request
            response = requests.get(
                API_ENDPOINT,
                headers=headers,
                params=params,
                auth=(SA_USERNAME, SA_SECRET)
            )
        finally:
            # Stop the spinner before anything else is printed or raised.
            is_loading = False
            spinner_thread.join()
    except requests.exceptions.RequestException as e:
        # Catch only request failures (not SystemExit/KeyboardInterrupt) and
        # bind the exception so the message can actually include it.
        raise RuntimeError(f"An error occurred during the API request: {e}") from e

    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        # Raise rather than call exit(): the cell stops here with a clear error
        # and no later code runs without `raw_response_text`.
        raise RuntimeError(
            f"Mixpanel export request failed with status code {response.status_code}"
        )

    print("Successfully fetched data from API.")
    raw_response_text = response.text
    # Save the raw response for future runs. Write to a temporary file and
    # rename it into place so an interrupted write never leaves a truncated
    # cache file that the next run would silently load.
    tmp_file_name = f"{file_name}.tmp"
    with open(tmp_file_name, 'w', encoding='utf-8') as f:
        f.write(raw_response_text)
    os.replace(tmp_file_name, file_name)
    print(f"Data saved to {file_name}")

# Process the data (either from the file or the fresh API call).
# The raw text is parsed as one JSON document per line; undecodable lines are
# reported and skipped.
raw_data = []
if raw_response_text:
    # Split on '\n' only: str.splitlines() would also break on other line
    # separators that can legally appear inside a JSON string.
    for line in raw_response_text.strip().split('\n'):
        if not line:
            continue
        try:
            raw_data.append(json.loads(line))
        except json.JSONDecodeError:
            print(f"Warning: Could not decode line: {line}")

    if raw_data:
        print(f"Successfully processed {len(raw_data)} events.")
    else:
        print("No data was processed.")
else:
    print("No data available to process.")

# Always define `df`, even when nothing was parsed, so the filter below and the
# later cells fail softly (empty results) instead of raising NameError. The
# empty frame carries the two columns those cells read.
if raw_data:
    df = pd.DataFrame(raw_data)
else:
    df = pd.DataFrame(columns=['event', 'properties'])

if len(client_agency) > 0:
    # Filter down to events from a specific state pilot
    if not df.empty:
        # Rows whose `properties` is not a dict (e.g. missing) never match.
        mask = df['properties'].apply(
            lambda p: isinstance(p, dict) and p.get('client_agency_id') == client_agency
        )
        df = df[mask]

    print(f"Found {len(df)} events with client_agency_id set to", client_agency)

In [None]:
# --- Cohort: users with at least one 'ApplicantSharedIncomeSummary' event ---
# The distinct_id of every event is extracted once and reused for both the
# cohort list and the cohort mask.
all_distinct_ids = df['properties'].apply(lambda props: props.get('distinct_id'))
is_share_event = df['event'] == 'ApplicantSharedIncomeSummary'
income_events_df = df[is_share_event]
users_who_shared_pdf = all_distinct_ids[is_share_event].unique()
print(f"Found {len(users_who_shared_pdf)} users with at least one 'ApplicantSharedIncomeSummary' event.")

# Every event (of any type) belonging to a user in the cohort
mask = all_distinct_ids.isin(users_who_shared_pdf)
users_who_shared_pdf_df = df[mask]

# --- Finished*Sync events (the "mega events") for the cohort ---
# distinct_id and timestamp are lifted out of the properties dict into columns,
# then rows are ordered by user and time so each user's newest event is last.
# NOTE(review): ordering relies on the 'timestamp' property parsing to a
# sortable datetime; events without it sort as NaT - confirm this is intended.
sync_events = ['ApplicantFinishedArgyleSync', 'ApplicantFinishedPinwheelSync']
sync_events_df = (
    users_who_shared_pdf_df
    .loc[users_who_shared_pdf_df['event'].isin(sync_events)]
    .assign(
        distinct_id=lambda frame: frame['properties'].apply(lambda props: props.get('distinct_id')),
        timestamp=lambda frame: pd.to_datetime(
            frame['properties'].apply(lambda props: props.get('timestamp'))
        ),
    )
    .sort_values(by=['distinct_id', 'timestamp'])
)

# One row per user: the last (most recent) sync event, plus the W2 / gig
# counts it reports (0 when the property is absent).
latest_sync_per_user_df = (
    sync_events_df
    .drop_duplicates(subset='distinct_id', keep='last')
    .assign(
        total_w2_count=lambda frame: frame['properties'].apply(
            lambda props: props.get('employment_type_w2_count', 0)
        ),
        total_gig_count=lambda frame: frame['properties'].apply(
            lambda props: props.get('employment_type_gig_count', 0)
        ),
    )
)

# Counts table indexed by user
count_columns = ['total_w2_count', 'total_gig_count']
user_employment_counts = latest_sync_per_user_df.set_index('distinct_id')[count_columns]

# Total number of income sources per user, shared by the breakdowns below
total_jobs = user_employment_counts['total_w2_count'] + user_employment_counts['total_gig_count']

users_with_jobs_df = user_employment_counts[total_jobs > 0]

print("\nW2 and Gig counts per user: ")
print(user_employment_counts)
print(f"\nFound {len(users_with_jobs_df)} users with at least one W2 or gig source.")
print(users_with_jobs_df.head())

users_with_one_job_df = user_employment_counts[total_jobs == 1]
print(f"\nFound {len(users_with_one_job_df)} users with exactly one job.")

users_with_two_jobs_df = user_employment_counts[total_jobs == 2]
print(f"\nFound {len(users_with_two_jobs_df)} users with exactly two jobs.")

users_with_three_jobs_df = user_employment_counts[total_jobs == 3]
print(f"\nFound {len(users_with_three_jobs_df)} users with exactly three jobs.")

users_with_gigs_df = user_employment_counts[user_employment_counts['total_gig_count'] > 0]
print(f"\nFound {len(users_with_gigs_df)} users with gigs.")

In [None]:
# Debugging aid: dump every sync event recorded for one hard-coded user.
specific_user_id = "applicant-62230"
print(f"--- Investigating all sync events for user: {specific_user_id} ---")

is_specific_user = sync_events_df['distinct_id'] == specific_user_id
single_user_sync_events = sync_events_df[is_specific_user]

# Display overrides apply only inside the `with` block.
display_overrides = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', 100,
    'display.max_colwidth', 100,
)
with pd.option_context(*display_overrides):
    # Transposed so each event becomes a column and its fields read top to bottom.
    transposed_events = single_user_sync_events[['event', 'properties']].T
    print(transposed_events)

In [None]:
# How many 'ApplicantSharedIncomeSummary' events does each user have?
# Pull the distinct_id out of each share event's properties...
event_user_ids = income_events_df['properties'].apply(lambda props: props.get('distinct_id'))

# ...and tally the number of share events per user ID.
user_event_counts = event_user_ids.value_counts()

# Buckets: exactly one share, more than one share, more than three shares.
users_with_one_share = user_event_counts.loc[user_event_counts.eq(1)]
users_with_multiple_shares = user_event_counts.loc[user_event_counts.gt(1)]
users_with_many_shares = user_event_counts.loc[user_event_counts.gt(3)]

for share_bucket in (users_with_one_share, users_with_multiple_shares, users_with_many_shares):
    print(share_bucket)

In [None]:
import pprint  # only used by the optional single-row dump at the bottom of this cell

# See one user's events: every event for `target_user_id`, ordered by time.
target_user_id = 'applicant-100114'
temp_df = users_who_shared_pdf_df.copy()
temp_df['distinct_id'] = temp_df['properties'].apply(lambda p: p.get('distinct_id'))
# Prefer the 'time' property and fall back to 'timestamp' when it is missing/falsy.
# NOTE(review): if these values are integer epoch seconds, pd.to_datetime without
# `unit='s'` reads them as nanoseconds (dates near 1970); the sort order is still
# correct, but confirm the value format before trusting the displayed dates.
temp_df['timestamp'] = pd.to_datetime(
    temp_df['properties'].apply(lambda p: p.get('time') or p.get('timestamp'))
)

user_events_df = temp_df[temp_df['distinct_id'] == target_user_id]
sorted_user_events = user_events_df.sort_values(by='timestamp')

print(f"Showing all events for user '{target_user_id}', sorted by time:")
# option_context restores the previous display settings on exit, even if the
# print raises, and does not reset options that were customised earlier.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    print(sorted_user_events[['timestamp', 'event', 'properties']])

# Optional: pretty-print the full properties dict of a single row.
# row_index = 382646
# properties_data = temp_df.loc[row_index, 'properties']
# print(f"Properties for row {row_index}:")
# pprint.pprint(properties_data)