In [None]:
# GRAPHRAG_LLM_MODEL=gpt-4o-mini
# GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES=organization,person,event,geo
# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=1
# GRAPHRAG_CLAIM_EXTRACTION_ENABLED=False
# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=1

## Data Preparation


In [2]:
import os
from pathlib import Path
import docx
import PyPDF2

def _extract_pdf_text(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``."""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ''` guards against a page yielding a falsy result (e.g. None),
        # which would otherwise break the string concatenation.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def _extract_docx_text(docx_path):
    """Return the paragraphs of the Word document at ``docx_path``, newline-joined."""
    doc = docx.Document(docx_path)
    return '\n'.join(paragraph.text for paragraph in doc.paragraphs)


# Lower-cased file extension -> function that extracts its text.
# NOTE(review): 'doc' is kept from the original mapping, but python-docx
# presumably only reads .docx; legacy .doc files will likely land in the
# error branch below — confirm.
_TEXT_EXTRACTORS = {
    'pdf': _extract_pdf_text,
    'docx': _extract_docx_text,
    'doc': _extract_docx_text,
}

# Create output folder if it doesn't exist
Path('podcasttxtdata').mkdir(parents=True, exist_ok=True)

# Process all files in input folder (sorted so the log order is reproducible)
for filename in sorted(os.listdir('podcastdata')):
    input_path = os.path.join('podcastdata', filename)

    # Skip if it's not a file
    if not os.path.isfile(input_path):
        continue

    # Pick the extractor from the extension. Unsupported types are skipped
    # explicitly: previously `text` kept the value from the prior iteration,
    # so an unsupported file was written out with another file's contents.
    file_extension = Path(filename).suffix.lower().lstrip('.')
    extractor = _TEXT_EXTRACTORS.get(file_extension)
    if extractor is None:
        print(f"Skipping {filename}: unsupported file type")
        continue

    # Create output filename (replace extension with .txt)
    output_filename = Path(filename).stem + '.txt'
    output_path = os.path.join('podcasttxtdata', output_filename)

    try:
        text = extractor(input_path)

        # Save the extracted text
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

        if text.strip():
            print(f"Successfully converted {filename} to {output_filename}")
        else:
            # An empty result still produces a file, but it will count as
            # 0 tokens downstream, so make it visible here.
            print(f"Warning: no text extracted from {filename}; wrote empty {output_filename}")

    except Exception as e:
        # Best-effort batch conversion: report the failure and keep going.
        print(f"Error processing {filename}: {str(e)}")

Successfully converted Behind the Tech_Episode 30_Collier_Bloomberg_Transcript.docx to Behind the Tech_Episode 30_Collier_Bloomberg_Transcript.txt
Successfully converted Behind the Tech_Episode 31_AshleyLlorens_Transcript.docx to Behind the Tech_Episode 31_AshleyLlorens_Transcript.txt
Successfully converted Behind the Tech_Episode 32_KimberlyBryant_Transcript.docx to Behind the Tech_Episode 32_KimberlyBryant_Transcript.txt
Successfully converted Behind the Tech_Episode 34_ChaseJarvis_Transcript.docx to Behind the Tech_Episode 34_ChaseJarvis_Transcript.txt
Successfully converted Behind the Tech_Episode 35_StevenBathiche_Transcript.docx to Behind the Tech_Episode 35_StevenBathiche_Transcript.txt
Successfully converted Behind the Tech_Episode 36_PeterLee_Transcript.docx to Behind the Tech_Episode 36_PeterLee_Transcript.txt
Successfully converted Behind the Tech_Episode 37_JustineEzarik_Transcript.docx to Behind the Tech_Episode 37_JustineEzarik_Transcript.txt
Successfully converted Behind

In [3]:
import os
from pathlib import Path
import tiktoken
import shutil
from tqdm import tqdm

# o200k_base is the tiktoken encoding used by gpt-4o and gpt-4o-mini.
# Alternative lookup by model name:
#   tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
tokenizer = tiktoken.get_encoding("o200k_base")


def count_tokens(text):
    """Return the number of tokens ``text`` encodes to with the module-level ``tokenizer``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# filepath -> token count for every readable .txt transcript
file_token_counts = {}

print("Counting tokens in files...")
for filename in tqdm(os.listdir('podcasttxtdata')):
    # Only the converted plain-text transcripts are of interest
    if not filename.endswith('.txt'):
        continue

    filepath = os.path.join('podcasttxtdata', filename)
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            text = handle.read()
        file_token_counts[filepath] = count_tokens(text)
    except Exception as e:
        # Report unreadable files and carry on with the rest
        print(f"Error processing {filename}: {str(e)}")

# (filepath, token_count) pairs, smallest transcript first
sorted_files = sorted(file_token_counts.items(), key=lambda pair: pair[1])

# Per-file listing in ascending token order
print("\nToken count for each file (sorted from smallest to largest):")
for filepath, count in sorted_files:
    print(f"{os.path.basename(filepath)}: {count} tokens")

# Create ragtest/input directory if it doesn't exist
ragtest_dir = Path('./ragtest/input')
ragtest_dir.mkdir(parents=True, exist_ok=True)

# Ask user how many files they want to copy
total_files = len(sorted_files)
print(f"\nTotal number of files: {total_files}")
requested_files = int(input("How many of the smallest files do you want to copy to ragtest/input? "))

# Clamp the request to what is actually available. Without this, a negative
# number slices from the end of the list, a number above the total makes the
# summary report files that were never copied, and 0 crashes the average
# below with ZeroDivisionError.
num_files = max(0, min(requested_files, total_files))
if num_files != requested_files:
    print(f"Requested {requested_files} files; adjusting to {num_files}.")

selected_files = sorted_files[:num_files]

# Copy the smallest files
print(f"\nCopying {num_files} smallest files to ./ragtest/input/...")
for filepath, count in selected_files:
    filename = os.path.basename(filepath)
    destination = os.path.join(ragtest_dir, filename)
    shutil.copy2(filepath, destination)
    print(f"Copied {filename} ({count} tokens)")

print("\nDone! Summary of copied files:")
total_tokens = sum(count for _, count in selected_files)
print(f"Total files copied: {num_files}")
print(f"Total tokens in copied files: {total_tokens}")
if num_files:
    print(f"Average tokens per file: {total_tokens // num_files}")
else:
    # Nothing was copied, so an average is undefined
    print("Average tokens per file: n/a (no files copied)")

Counting tokens in files...


100%|██████████| 62/62 [00:01<00:00, 60.10it/s]



Token count for each file (sorted from smallest to largest):
Behind-the-Tech_EP-38_StevenPinker_Transcript.txt: 0 tokens
Behind-the-Tech_EP39_2021_Year-in-Review_Transcript.txt: 0 tokens
EP-00_Behind_the_Tech_Kevin Scott_Podcast_Trailer.txt: 420 tokens
EP-20_ Behind_the_Tech_Kevin_Scott_Podcast_ReprogrammingTheDream.txt: 8200 tokens
Behind the Tech_Episode 30_Collier_Bloomberg_Transcript.txt: 8770 tokens
Ep-52_Behind_the_Tech_Kevin_Scott_Podcast_with_Bill_Gates-qc.txt: 9288 tokens
EP-28_Behind_the_Tech_Mae-Jemison.txt: 9618 tokens
Behind the Tech_Episode 35_StevenBathiche_Transcript.txt: 9825 tokens
EP-04_ Behind_the_Tech_Kevin Scott_Podcast_with_Judy Estrin.txt: 10106 tokens
EP-40_Behind_the_Tech_DanielaRus_Transcript (1).txt: 10370 tokens
EP-02_ Behind_the_Tech_Kevin Scott_Podcast_with_Alice Steinglass.txt: 10636 tokens
EP-49_Behind_the_Tech_Kevin_Scott_Podcast_2022YearinReview.txt: 10805 tokens
EP-03_ Behind_the_Tech_Kevin Scott_Podcast_with_Andrew Ng.txt: 10940 tokens
EP-24_ Behin

In [6]:
 (806465 /66597) * 0.13  # 806465 is the corpus total printed by the summary cell ("Total tokens in files"); 66597 and 0.13 are not defined anywhere in this notebook — presumably a sample's token count and its measured cost, TODO confirm

1.5742518431761192

In [None]:
# NOTE(review): bare literal with no computation. It is numerically equal to
# 806465 / 66597, the ratio used in the `(806465 /66597) * 0.13` cell —
# presumably a pasted intermediate value; confirm whether this cell is needed.
12.109629562893224 

In [4]:
# Summary over every counted transcript (not just the copied subset).
# NOTE: this rebinds `num_files`, which the copy cell also sets.
num_files = len(sorted_files)
print("\nDone! Summary of  files:")
total_tokens = sum(count for _, count in sorted_files)
print(f"Total files : {num_files}")
print(f"Total tokens in files: {total_tokens}")
# Guard the division: with no counted files the original raised ZeroDivisionError
average_tokens = total_tokens // num_files if num_files else 0
print(f"Average tokens per file: {average_tokens}")


Done! Summary of  files:
Total files : 62
Total tokens in files: 806465
Average tokens per file: 13007


In [None]:
import os
import time
from typing import Any, Dict, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from neo4j import GraphDatabase, Result

# Specify the exact path to your .env file
ENV_PATH = r"./ragtest/.env"
load_dotenv(ENV_PATH)

# Connection settings. Only the password is read from the environment
# (populated by the .env file above); the rest are fixed for this notebook.
NEO4J_URI = "neo4j+s://c55a8a58.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = "neo4j"

# Fail here with a clear message instead of passing None as the password,
# which would only surface later as an authentication error on the first query.
if not NEO4J_PASSWORD:
    raise RuntimeError(
        f"NEO4J_PASSWORD is not set; define it in {ENV_PATH} or in the environment."
    )

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

def db_query(cypher: str, params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Execute a Cypher statement and return the result as a DataFrame.

    Args:
        cypher: The Cypher statement to run.
        params: Optional query parameters referenced from ``cypher``.
            Defaults to no parameters. (``None`` replaces the former ``{}``
            default so no mutable object is shared between calls.)

    Returns:
        The query result converted with ``Result.to_df``.
    """
    return driver.execute_query(
        cypher,
        parameters_=params or {},
        # Target the configured database explicitly rather than relying on
        # the server-side default.
        database_=NEO4J_DATABASE,
        result_transformer_=Result.to_df,
    )

In [None]:
# Chunk-size check: number of __Chunk__ nodes per n_tokens value.
db_query(
    "MATCH (n:__Chunk__) "
    "RETURN n.n_tokens as token_count, count(*) AS count"
)
# Sample result noted in the original cell:
#   token_count  count
#   300          230
#   155          1

In [None]:
# Chunk-size check: number of __Chunk__ nodes per n_tokens value.
# NOTE(review): same query as the preceding cell — likely a leftover duplicate.
db_query(
    "MATCH (n:__Chunk__) "
    "RETURN n.n_tokens as token_count, count(*) AS count"
)
# Sample result noted in the original cell:
#   token_count  count
#   300          230
#   155          1

In [None]:
# Chunk-size check: number of __Chunk__ nodes per n_tokens value.
# NOTE(review): same query as the two preceding cells — likely a leftover duplicate.
db_query(
    "MATCH (n:__Chunk__) "
    "RETURN n.n_tokens as token_count, count(*) AS count"
)
# Sample result noted in the original cell:
#   token_count  count
#   300          230
#   155          1

In [None]:
# Peek at a single __Community__ node: its title, summary and full content.
db_query(
    "\n"
    "  MATCH (n:__Community__) \n"
    "  RETURN n.title AS title, n.summary AS summary, n.full_content AS full_content LIMIT 1\n"
)

In [None]:
# For every chunk, count its outgoing HAS_ENTITY relationships
entity_df = db_query(
    """
MATCH (d:__Chunk__)
RETURN count {(d)-[:HAS_ENTITY]->()} AS entity_count
"""
)
entity_counts = entity_df['entity_count']
mean_entities = entity_counts.mean()
median_entities = entity_counts.median()

# Plot distribution
plt.figure(figsize=(10, 6))
sns.histplot(entity_counts, kde=True, bins=15, color='skyblue')
# Label each reference line directly. The original passed a dict to
# plt.legend(), which assigns the keys to artists by position — so the labels
# ended up on the wrong lines (the KDE curve is drawn before the mean line).
plt.axvline(mean_entities, color='red', linestyle='dashed', linewidth=1,
            label=f'Mean: {mean_entities:.2f}')
plt.axvline(median_entities, color='green', linestyle='dashed', linewidth=1,
            label=f'Median: {median_entities:.2f}')
plt.xlabel('Entity Count', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Entity Count', fontsize=15)
plt.legend()
plt.show()

In [None]:
# For every entity, count its RELATED relationships (either direction)
degree_dist_df = db_query(
    """
MATCH (e:__Entity__)
RETURN count {(e)-[:RELATED]-()} AS node_degree
"""
)
node_degrees = degree_dist_df['node_degree']

# Summary statistics: mean plus the 25th/50th/75th/90th percentiles
mean_degree = np.mean(node_degrees)
percentiles = np.percentile(node_degrees, [25, 50, 75, 90])

# Histogram of node degrees; counts are shown on a logarithmic y-axis
plt.figure(figsize=(12, 6))
sns.histplot(node_degrees, bins=50, kde=False, color='blue')
plt.yscale('log')
plt.xlabel('Node Degree')
plt.ylabel('Count (log scale)')
plt.title('Node Degree Distribution')

# Dashed reference lines: the mean first, then one line per percentile
plt.axvline(mean_degree, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean_degree:.2f}')
percentile_markers = zip((25, 50, 75, 90), ('purple', 'orange', 'yellow', 'brown'), percentiles)
for rank, line_color, value in percentile_markers:
    plt.axvline(value, color=line_color, linestyle='dashed', linewidth=1,
                label=f'{rank}th Percentile: {value:.2f}')

plt.legend()
plt.show()

In [None]:
# The five entities with the most RELATED relationships.
db_query(
    "\n"
    "  MATCH (n:__Entity__) \n"
    "  RETURN n.name AS name, count{(n)-[:RELATED]-()} AS degree\n"
    "  ORDER BY degree DESC LIMIT 5"
)

In [None]:
# One row per community: its level and its number of IN_COMMUNITY relationships
community_data = db_query("""
  MATCH (n:__Community__)
  RETURN n.level AS level, count{(n)-[:IN_COMMUNITY]-()} AS members
""")

# Per-level summary of community sizes
stats = community_data.groupby('level').agg(
    min_members=('members', 'min'),
    max_members=('members', 'max'),
    median_members=('members', 'median'),
    avg_members=('members', 'mean'),
    num_communities=('members', 'count'),
    total_members=('members', 'sum')
).reset_index()

# Create box plot
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='level', y='members', data=community_data, palette='viridis')
plt.xlabel('Level')
plt.ylabel('Members')

# Add statistical annotations above the boxes.
# The original anchored every label at a hard-coded y=85 and used the level
# value as the x coordinate; both only work for one particular dataset.
# Boxes are drawn at categorical positions 0..n-1 in ascending level order,
# which matches the row order of `stats` (groupby sorts by level).
if not stats.empty:
    largest_community = stats['max_members'].max()
    annotation_y = largest_community * 1.05
    for position, row in enumerate(stats.itertuples(index=False)):
        text = (f"num: {row.num_communities}\n"
                f"all_members: {row.total_members}\n"
                f"min: {row.min_members}\n"
                f"max: {row.max_members}\n"
                f"med: {row.median_members}\n"
                f"avg: {row.avg_members:.2f}")
        ax.text(position, annotation_y, text,
                horizontalalignment='center', verticalalignment='bottom', fontsize=9)
    # Extend the y-axis so the six-line annotations sit inside the axes
    ax.set_ylim(top=largest_community * 1.5)

plt.show()

In [None]:
# Name of the vector index over entity description embeddings
index_name = "entity"

# Create the index only if it is missing: 1536-dimensional vectors compared
# with cosine similarity. Literal braces are doubled because this is an f-string.
db_query(
    f"""
CREATE VECTOR INDEX {index_name} IF NOT EXISTS FOR (e:__Entity__) ON e.description_embedding
OPTIONS {{indexConfig: {{
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}}}
"""
)