# Custom Brain Analysis

This notebook lets you run your own analysis on the indexed data, which is loaded with the `load_data` function from the app's `src.brain_analytics` module.

In [24]:
import sys
import os
import pandas as pd
from pathlib import Path
import importlib

# Make the project root importable so that `src` resolves as a package.
# Walk upward from the kernel's working directory and pick the first folder
# that contains a `src/` directory. This works whether the kernel was started
# in 'notebooks/' or in the project root itself. If no such folder is found,
# fall back to the parent of the working directory (the original assumption
# that the notebook lives in 'notebooks/').
_cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    _cwd.parent,
)
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import the module, then reload it so edits made to src/brain_analytics since
# the kernel started are picked up without restarting the kernel.
import src.brain_analytics
importlib.reload(src.brain_analytics)
# Bind load_data only after the reload so the name refers to the fresh definition.
from src.brain_analytics import load_data



In [25]:
# Load the indexed data into `df`.
# NOTE(review): the analysis cells below all guard on `df is not None`, so
# load_data() presumably can return None -- confirm against src.brain_analytics.
# Guard the preview the same way instead of failing with AttributeError.
df = load_data()
if df is None:
    print("load_data() returned no data; the analysis cells below will be skipped.")
# Trailing expression: rich preview of the first rows (renders nothing when df is None).
df.head() if df is not None else None

Unnamed: 0,filename,path,created_at,full_text,text_length,is_indexed
0,Industry Ethan Qiu Resume 2025.docx.pdf,/Users/seanqiu/Downloads/Industry Ethan Qiu Re...,2025-12-25 15:07:21.800684,"ETHAN QIU \nSan Francisco, CA | (415) 769-...",3676,True
1,Ethan Qiu Resume 2025 Normal.docx.pdf,/Users/seanqiu/Downloads/Ethan Qiu Resume 2025...,2025-12-25 14:57:54.651217,"ETHAN QIU\n San Francisco, CA | (415)...",3990,True
2,Visualization - Google Docs.pdf,/Users/seanqiu/Downloads/Visualization - Googl...,2025-12-25 15:43:19.762886,MVP Visualization Description: Architecture f...,1057,True


In [26]:
# Inspect the dtype pandas assigned to the `created_at` column.
df.dtypes['created_at']

dtype('<M8[us]')

In [16]:
# Basic Analysis: File Extensions
def _file_extension(name):
    """Return the lower-cased extension of ``name`` (e.g. '.pdf'), or 'No Ext'.

    'No Ext' is returned when ``os.path.splitext`` finds no extension, and also
    when ``name`` is not a string (for example a missing filename), so one bad
    row cannot abort the whole analysis.
    """
    if not isinstance(name, str):
        return 'No Ext'
    # splitext is evaluated once; an empty extension is falsy and falls
    # through to the 'No Ext' label.
    return os.path.splitext(name)[1].lower() or 'No Ext'


# Same guard style as the download-time cell: skip quietly when the frame or
# the column this cell needs is absent.
if df is not None and 'filename' in df.columns:
    # Adds an 'extension' column to df (one label per file).
    df['extension'] = df['filename'].map(_file_extension)

    # Count how many files carry each extension.
    ext_counts = df['extension'].value_counts()
    print("\nFile Extension Counts:")
    print(ext_counts)

    # Plot. plotly is treated as optional: a missing install only skips the chart.
    try:
        import plotly.express as px
        fig = px.bar(ext_counts, title="File Types Distribution")
        fig.show()
    except ImportError:
        print("Plotly not installed, skipping plot.")


File Extension Counts:
extension
.pdf    3
Name: count, dtype: int64


In [27]:
# Analysis: Download Time
# Run only when a frame was loaded and it carries the timestamp column.
has_timestamps = df is not None and 'created_at' in df.columns
if has_timestamps:
    print("\nDownload Times (First 5):")
    timestamp_preview = df[['filename', 'created_at']].head()
    display(timestamp_preview)

    # Bar chart of how many files share each calendar day. Any failure in this
    # best-effort section is reported instead of stopping the notebook.
    try:
        # Reduce each timestamp to its calendar date; stored on df as 'date'.
        created_timestamps = pd.to_datetime(df['created_at'])
        df['date'] = created_timestamps.dt.date

        # Tally per day, ordered chronologically by the date index.
        per_day = df['date'].value_counts()
        date_counts = per_day.sort_index()

        import plotly.express as px
        fig = px.bar(date_counts, title="Downloads per Day")
        fig.show()
    except Exception as err:
        print(f"Could not plot timeline: {err}")


Download Times (First 5):


Unnamed: 0,filename,created_at
0,Industry Ethan Qiu Resume 2025.docx.pdf,2025-12-25 15:07:21.800684
1,Ethan Qiu Resume 2025 Normal.docx.pdf,2025-12-25 14:57:54.651217
2,Visualization - Google Docs.pdf,2025-12-25 15:43:19.762886
