In [6]:
import asyncio
import tiktoken
import os

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load it with python-dotenv; because load_dotenv(...)
# is the last expression of the cell, its return value is what the cell displays.
# NOTE(review): presumably this supplies the API key used by ChatOpenAI — confirm.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

## 1. Checking the number of tokens

In [14]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` occupies for a given model.

    Args:
        string: Text to tokenize.
        encoding_name: Despite the name, this value is handed to
            ``tiktoken.encoding_for_model``, so it must be a model name
            (e.g. ``"gpt-3.5-turbo-16k"``).

    Returns:
        The length of the token sequence produced for `string`.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))

def truncate_text_to_max_tokens(text: str, max_tokens: int = 4000, encoding_name: str = "gpt-3.5-turbo-16k") -> str:
    """Truncate `text` so that it holds at most `max_tokens` tokens.

    The text is tokenized exactly once; the resulting token list is used both
    to count tokens and, when needed, to cut the text.

    Args:
        text: Text to (possibly) shorten.
        max_tokens: Maximum number of tokens to keep. Must be non-negative.
        encoding_name: Passed to ``tiktoken.encoding_for_model``, so it must be
            a model name rather than a raw encoding name.

    Returns:
        `text` unchanged when it already fits, otherwise the decoded form of
        its first `max_tokens` tokens.

    Raises:
        ValueError: If `max_tokens` is negative (a negative value would
            otherwise slice tokens off the end instead of capping the length).
    """
    if max_tokens < 0:
        raise ValueError(f"max_tokens must be non-negative, got {max_tokens}")

    # Tokenize once and reuse the list for both the count and the slice.
    encoding = tiktoken.encoding_for_model(encoding_name)
    token_list = encoding.encode(text)
    current_num_tokens = len(token_list)

    if current_num_tokens <= max_tokens:
        print(f'Text not truncated, num tokens: {current_num_tokens}')
        # Return the original object rather than a decode(encode(...)) round trip.
        return text

    print(f'Text truncated, num tokens: {current_num_tokens}')
    return encoding.decode(token_list[:max_tokens])

In [18]:
# Scan every transcript and record the ones whose token count is above the limit.
input_directory = 'transcripts'
TOKEN_LIMIT = 16000  # threshold above which a transcript is reported
exceeding_limit = {}

# sorted() makes the printed order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(input_directory)):
    full_path = os.path.join(input_directory, filename)

    # Skip hidden entries (same dot-file rule as summarize_files_from_directory)
    # and anything that is not a regular file, so open() cannot hit a directory.
    if filename.startswith('.') or not os.path.isfile(full_path):
        continue

    # Explicit encoding so the read does not depend on the platform locale.
    with open(full_path, encoding="utf-8") as f:
        text = f.read()

    current_tokens = num_tokens_from_string(text, "gpt-3.5-turbo-16k")

    if current_tokens > TOKEN_LIMIT:
        exceeding_limit[filename] = current_tokens
        print(filename)

Episode - Balaji Srinivasan_ How to Fix Government, Twitter, Science, and the FDA _ Lex Fridman Podcast #331 Segment - Fixing the FDA (3_25_06-4_56_14).txt


In [19]:
# Display the {filename: token count} mapping collected by the scan above.
exceeding_limit

{'Episode - Balaji Srinivasan_ How to Fix Government, Twitter, Science, and the FDA _ Lex Fridman Podcast #331 Segment - Fixing the FDA (3_25_06-4_56_14).txt': 19260}

## 2. Summarizing

In [8]:
# Prompt text for the bullet-point summary; it contains a single `{text}`
# placeholder where the transcript goes.
# NOTE(review): "Compress as little information as possible" reads ambiguously
# next to "concise" — confirm the intent before rewording, because any change to
# this string changes what the model is asked to do.
prompt_template = """You are a podcast summarization expert. Your outputs are easy to read-through, concise and actionable.

As an input you will receive a transcript of the podcast section.

Your task is to create concise and in-depth summary based on the transcript that you will receive. 
Compress as little information as possible.
If there is any information that should be added, please do so. 

Your answer will be presented to an audience interested in science, technology, artificial intelligence and self-improvement.

Here is the transcript:


{text}


SUMMARY IN BULLET POINTS (UP TO 15 POINTS):"""

In [20]:
async def async_summarize_file(chain, full_path, save_path):
    """Summarize one transcript file and write the summary to `save_path`.

    The transcript is read, truncated with ``truncate_text_to_max_tokens``
    (using that function's defaults), wrapped in a single ``Document`` and
    passed to ``chain.arun``. The returned summary is written to `save_path`,
    overwriting any existing file.

    Both files are opened as UTF-8 so that the result does not depend on the
    platform's default locale encoding; in particular, a summary containing
    non-ASCII characters can always be written.

    Args:
        chain: Object exposing an awaitable ``arun(docs)`` that returns the
            summary text.
        full_path: Path of the transcript to read.
        save_path: Path of the summary file to write.
    """
    with open(full_path, encoding="utf-8") as f:
        text = f.read()
    text = truncate_text_to_max_tokens(text)
    doc = [Document(page_content=text)]
    output_summary = await chain.arun(doc)
    # The output file is only opened after the chain succeeded, so a failed
    # request does not leave an empty summary behind.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(output_summary)

async def summarize_files_from_directory(input_directory, output_directory, prompt_template, model_name="gpt-3.5-turbo-16k", batch_size=30):
    """Summarize every not-yet-summarized transcript in a directory.

    Files are processed concurrently in batches of `batch_size`. Each summary
    is written to ``output_directory`` as ``(Summary) <filename>``; transcripts
    that already have such a file are skipped, so the function can be re-run
    to retry only the files that are still missing.

    Args:
        input_directory: Directory containing the transcript files.
        output_directory: Directory for the summaries; created if missing.
        prompt_template: Prompt string with a ``{text}`` placeholder.
        model_name: Model name passed to ``ChatOpenAI``.
        batch_size: Number of files summarized concurrently. Must be >= 1.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        Exception: If any file failed, the first failure is re-raised, but
            only after every batch has been attempted, so one bad transcript
            does not prevent the remaining ones from being summarized.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    llm = ChatOpenAI(model_name=model_name)
    bullet_point_prompt = PromptTemplate(template=prompt_template,
                                         input_variables=["text"])
    chain = load_summarize_chain(llm,
                                 chain_type="stuff",
                                 prompt=bullet_point_prompt)

    # Without this, every summary write fails when the directory is missing.
    os.makedirs(output_directory, exist_ok=True)

    # Get a list of valid files
    # - Those not starting with a dot
    # - Regular files only (sub-directories cannot be opened as transcripts)
    # - Those not already summarized
    valid_files = [
        filename for filename in sorted(os.listdir(input_directory))
        if not filename.startswith('.')
        and os.path.isfile(os.path.join(input_directory, filename))
        and not os.path.exists(os.path.join(output_directory, f'(Summary) {filename}'))
    ]

    print(f"Files to summarize: {len(valid_files)}")

    first_error = None
    failed_files = []

    # Process files in batches
    for i in range(0, len(valid_files), batch_size):
        batch = valid_files[i: i + batch_size]
        tasks = []

        for filename in batch:
            full_path = os.path.join(input_directory, filename)
            save_path = os.path.join(output_directory, f'(Summary) {filename}')
            print(f'Summarizing: {filename}')
            task = asyncio.create_task(async_summarize_file(chain, full_path, save_path))
            tasks.append(task)

        # Wait for the whole batch to settle before proceeding to the next.
        # return_exceptions=True keeps one failure from abandoning the sibling
        # tasks mid-flight; failures are collected and reported instead.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        for filename, result in zip(batch, results):
            if isinstance(result, BaseException):
                failed_files.append(filename)
                print(f'Failed to summarize: {filename} ({result!r})')
                if first_error is None:
                    first_error = result

    if first_error is not None:
        print(f"Files that failed: {len(failed_files)}")
        # Surface the failure to the caller with its original exception type.
        raise first_error


In [22]:
input_dir = 'transcripts'
output_dir = 'summaries'

await summarize_files_from_directory(input_dir, output_dir, prompt_template)