In [8]:
import json
import os
import pandas as pd
import requests
import threading
import time
from dotenv import load_dotenv
from mixpanel import Mixpanel

# ------------
# INSTRUCTIONS
#
# Set the from_date and to_date variables to define the date range of the data you want to load.
# Then, run this cell. The first time you run it, it will get all the Mixpanel events
# in the time range, then it will save that data to a json file in /app/analytics/.
# The next time you run this cell, it will load from the json file (unless you change the dates).
# ------------

# Credentials
# The service-account credentials and project id are read from the process
# environment, after loading `.env.local` from the parent of the current
# working directory.
project_root = os.path.dirname(os.getcwd())
dotenv_path = os.path.join(project_root, '.env.local')
load_dotenv(dotenv_path=dotenv_path)

# Validate everything up front so a missing or blank setting produces one clear
# error naming the variable(s), instead of an AttributeError from None.strip().
_REQUIRED_ENV_VARS = (
    "MIXPANEL_SERVICE_ACCOUNT_USERNAME",
    "MIXPANEL_SERVICE_ACCOUNT_SECRET",
    "MIXPANEL_PROJECT_ID",
)
_missing_env_vars = [
    name for name in _REQUIRED_ENV_VARS if not (os.getenv(name) or "").strip()
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + f". Set them in the environment or in {dotenv_path}."
    )

SA_USERNAME = os.environ["MIXPANEL_SERVICE_ACCOUNT_USERNAME"].strip()
SA_SECRET = os.environ["MIXPANEL_SERVICE_ACCOUNT_SECRET"].strip()
PROJECT_ID = os.environ["MIXPANEL_PROJECT_ID"].strip()

# Export endpoint and the request headers sent with it.
API_ENDPOINT = 'https://data.mixpanel.com/api/2.0/export'
headers = dict(accept="text/plain")

# Date range to export. Change these two values to load a different window;
# the cache file name below is derived from them, so a new range means a new file.
from_date = '2025-08-01'
to_date = '2025-08-07'
file_name = f"mixpanel_data_{from_date}_to_{to_date}.json"

# Query-string parameters for the export request.
params = dict(
    from_date=from_date,
    to_date=to_date,
    project_id=PROJECT_ID,
)

# --- Loading Animation ---
# Module-level flag polled by spinning_cursor(); the code that starts the
# spinner sets it to True and flips it back to False to stop the animation.
is_loading = False

def spinning_cursor():
    """Draw a one-line spinner on stdout for as long as `is_loading` is true.

    The flag is re-checked only after a full pass over the four frames, so the
    animation always completes its current cycle before stopping. When the
    loop ends (or if the flag was never set), the spinner line is blanked out.
    """
    frames = ('|', '/', '-', '\\')
    while is_loading:
        for frame in frames:
            print(f"\rFetching data... {frame}", end="", flush=True)
            time.sleep(0.1)
    # Overwrite the spinner text with spaces and return the cursor to column 0.
    blank = " " * 20
    print(f"\r{blank}\r", end="", flush=True)

# Load the raw export text into `raw_response_text`: reuse the cached file when
# it exists, otherwise download from the export API and cache the response.
if os.path.exists(file_name):
    print(f"Loading data from local file: {file_name}")
    # Explicit UTF-8 so the cache is read the same way on every platform.
    with open(file_name, 'r', encoding='utf-8') as f:
        raw_response_text = f.read()
else:
    is_loading = True
    spinner_thread = threading.Thread(target=spinning_cursor)
    spinner_thread.start()

    try:
        try:
            # Make the request
            response = requests.get(
                API_ENDPOINT,
                headers=headers,
                params=params,
                auth=(SA_USERNAME, SA_SECRET)
            )
        finally:
            # Always stop the spinner (success, request error, or interrupt)
            # before anything else is printed, so it cannot overwrite messages
            # or keep running in the background.
            is_loading = False
            spinner_thread.join()
    except requests.exceptions.RequestException as e:
        # Raise instead of calling exit(): an exception stops this cell with a
        # clear traceback and leaves the session usable.
        raise RuntimeError(f"An error occurred during the API request: {e}") from e

    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        raise RuntimeError(
            f"API request failed with status code {response.status_code}"
        )

    print("Successfully fetched data from API.")
    raw_response_text = response.text

    # Save the raw response for future runs. Write to a temporary file first and
    # rename it into place so an interrupted write never leaves a truncated
    # cache file that a later run would silently load.
    tmp_file_name = f"{file_name}.tmp"
    with open(tmp_file_name, 'w', encoding='utf-8') as f:
        f.write(raw_response_text)
    os.replace(tmp_file_name, file_name)
    print(f"Data saved to {file_name}")

# Turn the raw export text (from the cache file or a fresh download) into a
# DataFrame. Each non-empty line is parsed as its own JSON document; lines that
# fail to parse are reported and skipped.
if not raw_response_text:
    print("No data available to process.")
else:
    raw_data = []
    lines = raw_response_text.strip().split('\n')
    for line in lines:
        if line:
            try:
                raw_data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Warning: Could not decode line: {line}")

    if not raw_data:
        print("No data was processed.")
    else:
        # One row per successfully decoded line.
        df = pd.DataFrame(raw_data)
        print(f"Successfully processed {len(df)} events.")
        print("\n--- DataFrame Head ---")
        print(df.head())
        print("\n--- DataFrame Info ---")
        df.info()

Loading data from local file: mixpanel_data_2025-08-01_to_2025-08-07.json
Successfully processed 30501 events.

--- DataFrame Head ---
                               event  \
0                  ApplicantTimedOut   
1                  ApplicantTimedOut   
2                  ApplicantTimedOut   
3                        CbvPageView   
4  ApplicantClickedCBVInvitationLink   

                                          properties  
0  {'time': 1754185188, 'distinct_id': '174667835...  
1  {'time': 1754185223, 'distinct_id': '174667835...  
2  {'time': 1754208190, 'distinct_id': '174667835...  
3  {'time': 1754583172, 'distinct_id': 'applicant...  
4  {'time': 1754583172, 'distinct_id': 'applicant...  

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30501 entries, 0 to 30500
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   event       30501 non-null  object
 1   properties  30501 non-null  object
dt