# Custom Brain Analysis

This notebook allows you to perform your own analysis on the indexed data using the `load_data` function from the app.

In [6]:
import sys
import os
import pandas as pd
from pathlib import Path
import importlib

# Add the src directory to the python path
# This assumes the notebook is in 'notebooks/' and src is in the parent directory
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import the load_data function and ensure it's reloaded
import src.brain_analytics
importlib.reload(src.brain_analytics)
from src.brain_analytics import load_data



In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data
df = load_data()
df.head()

Unnamed: 0,filename,path,created_at,full_text,text_length,is_indexed
0,Industry Ethan Qiu Resume 2025.docx.pdf,/Users/seanqiu/Downloads/Industry Ethan Qiu Re...,2025-12-25 15:07:21.800684,"ETHAN QIU \nSan Francisco, CA | (415) 769-...",3676,True
1,Ethan Qiu Resume 2025 Normal.docx.pdf,/Users/seanqiu/Downloads/Ethan Qiu Resume 2025...,2025-12-25 14:57:54.651217,"ETHAN QIU\n San Francisco, CA | (415)...",3990,True
2,Visualization - Google Docs.pdf,/Users/seanqiu/Downloads/Visualization - Googl...,2025-12-25 15:43:19.762886,MVP Visualization Description: Architecture f...,1057,True
3,Screen Shot 2021-06-14 at 9.34.33 PM.png,/Users/seanqiu/Downloads/Screen Shot 2021-06-1...,2025-12-25 17:16:56.835665,What is a CIT?\nThe Mayor's Youth Employment &...,1289,True
4,Screen Shot 2023-01-16 at 2.25.36 PM.png,/Users/seanqiu/Downloads/Screen Shot 2023-01-1...,2025-12-25 17:29:11.315894,"Lowell HS\nMr. Michael Jones, Principal\n1101 ...",666,True


In [None]:
# Prepare date column for grouping
if df is not None and 'created_at' in df.columns:
    df['date'] = pd.to_datetime(df['created_at']).dt.date

In [None]:
# Aggregate counts by date and extension (for barplot)
if df is not None and 'date' in df.columns:
    counts = df.groupby(['date', 'extension']).count().reset_index()
    counts.head()

In [None]:
# Create complete date-extension grid for lineplot
if df is not None and 'date' in df.columns:
    all_dates = sorted(pd.Series(df['date']).unique())
    all_exts = sorted(pd.Series(df['extension']).unique())
    idx = pd.MultiIndex.from_product([all_dates, all_exts], names=['date', 'extension'])
    counts_full = df.groupby(['date', 'extension']).size().reindex(idx, fill_value=0).reset_index(name='file_count')
    counts_full.head()

In [3]:
df['created_at'].dtype

dtype('<M8[us]')

In [4]:
# Basic Analysis: File Extensions
if df is not None:
    # Extract extension
    df['extension'] = df['filename'].apply(lambda x: os.path.splitext(x)[1].lower() if os.path.splitext(x)[1] else 'No Ext')
    
    # Count extensions
    ext_counts = df['extension'].value_counts()
    print("\nFile Extension Counts:")
    print(ext_counts)
    
    # Plot
    try:
        import plotly.express as px
        fig = px.bar(ext_counts, title="File Types Distribution")
        fig.show()
    except ImportError:
        print("Plotly not installed, skipping plot.")


File Extension Counts:
extension
.pdf     55
.png      6
.jpg      3
.docx     2
.py       1
.txt      1
.jpeg     1
Name: count, dtype: int64


In [27]:
# Analysis: Download Time
if df is not None and 'created_at' in df.columns:
    print("\nDownload Times (First 5):")
    display(df[['filename', 'created_at']].head())
    
    # Plot downloads over time
    try:
        df['date'] = pd.to_datetime(df['created_at']).dt.date
        date_counts = df['date'].value_counts().sort_index()
        
        import plotly.express as px
        fig = px.bar(date_counts, title="Downloads per Day")
        fig.show()
    except Exception as e:
        print(f"Could not plot timeline: {e}")


Download Times (First 5):


Unnamed: 0,filename,created_at
0,Industry Ethan Qiu Resume 2025.docx.pdf,2025-12-25 15:07:21.800684
1,Ethan Qiu Resume 2025 Normal.docx.pdf,2025-12-25 14:57:54.651217
2,Visualization - Google Docs.pdf,2025-12-25 15:43:19.762886


In [None]:
# Seaborn Bar Plot: Files per Day by Extension
if 'counts' in globals():
    plt.figure(figsize=(10, 6))
    sns.barplot(data=counts, x='date', y='filename', hue='extension')
    plt.title('Files per Day by Extension (Counts)')
    plt.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Seaborn Line Plot: Files per Day by Extension (Zero-filled)
if 'counts_full' in globals():
    plot_data = counts_full.copy()
    plot_data['date'] = plot_data['date'].astype(str)  # Discrete labels
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=plot_data, x='date', y='file_count', hue='extension', marker='o')
    plt.title('Files per Day by Extension (Zero-filled)')
    plt.grid(True, alpha=0.3)
    plt.show()

In [50]:
from matplotlib import cm, colors as mcolors
threshold = 0.6
def sim_to_width(s, t=threshold):
    return float(1 + 7 * max(0.0, (s - t)) / max(1e-9, (1.0 - t)))
cmap = cm.get_cmap('viridis')
def sim_to_hex(s, t=threshold):
    x = max(0.0, (s - t)) / max(1e-9, (1.0 - t))
    return mcolors.rgb2hex(cmap(x))


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.



In [None]:
import networkx as nx
from pyvis.network import Network
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import duckdb
from app import DB_PATH
from itertools import combinations

# 1. Fetch embeddings from DuckDB
# TODO: Connect to 'brain.duckdb' in READ_ONLY mode
# TODO: Select 'filename' and 'embedding' from 'files_index' where embedding IS NOT NULL
# Hint: conn = duckdb.connect(path, config={'access_mode': 'READ_ONLY'})
#       df_embeddings = conn.execute(sql).fetchdf(); conn.close()

conn = duckdb.connect(database= DB_PATH, read_only=True)
df_embeddings = conn.execute("SELECT filename, embedding from files_index where embedding " \
"is not null;").fetchdf()
conn.close()


# 2. Compute Similarity Matrix
# TODO: Convert the 'embedding' column (list of floats) to a numpy array via np.stack
# TODO: Calculate cosine_similarity(embeddings) to get an N x N matrix
df_embeddings.dropna(subset = ['embedding'])
embedding_list = df_embeddings['embedding'].to_list()
embeddings = np.stack(embedding_list)
similarity = cosine_similarity(embeddings)

# 3. Build the Graph
G = nx.Graph()
threshold = 0.6  # Connect files if they are >60% similar

# TODO: Add each file as a node with human-friendly labels
# Hint: 
G.add_nodes_from(df_embeddings['filename'].to_list())

# TODO: Add edges between pairs whose similarity exceeds threshold
# Hint: )
for i, j in combinations(range(len(similarity[0])), 2): 
    score = similarity[i, j]
    if score > threshold:
        G.add_edge(
            str(df_embeddings['filename'][i]), 
            str(df_embeddings['filename'][j]),
            weight = float(score), 
            value = sim_to_width(score),
            color = sim_to_hex(score), 
            title = f"score={score:.3f}"
            )
# 4. Visualize with PyVis
# TODO: Create a Network object and import from NetworkX
# Hint: net = Network(notebook=True, height="600px", width="100%")
#       net.from_nx(G)
#       net.toggle_physics(True)
#       net.show("file_graph.html")
net = Network(notebook=True, bgcolor= "#222222", height="600px", width="100%", font_color="white")
net.from_nx(G)
net.toggle_physics(True)
net.toggle_drag_nodes(True)
net.show("file_graph.html")

file_graph.html



divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul

