# Local LLM X LangChain 

Models to try out: 
1. Koala7b-8bit

In [4]:
from langchain import PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

In [5]:
import textwrap
import os
import json
import gc
import torch

In [6]:
# Util functions 

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def read_transcript(filename): 
    with open(filename) as f: 
        data = json.load(f) 
        data = data["text"]
        return data 
    
def extract_title(file_path): 
    last_slash_idx = file_path.rfind('/')
    filename_with_extension = file_path[last_slash_idx+1:]
    filename_without_extension = filename_with_extension.split('.')[0]
    return filename_without_extension

In [15]:
# Load Model 
from langchain.llms import HuggingFacePipeline
from langchain.llms.fake import FakeListLLM
from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline
from tqdm.notebook import tqdm

def load_model(MODEL_NAME): 
    if MODEL_NAME == "koala-7b": 
        tokenizer = LlamaTokenizer.from_pretrained("samwit/koala-7b")

        base_model = LlamaForCausalLM.from_pretrained(
            "samwit/koala-7b",
            load_in_8bit=True,
            device_map='auto',
        )

        pipe = pipeline(
            "text-generation",
            model=base_model, 
            tokenizer=tokenizer, 
            max_length=1024,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.15
        )

        llm = HuggingFacePipeline(pipeline=pipe)

        return llm 
    
    else: # load fake LLM 
        responses=[
            "Action: Python REPL\nAction Input: print(2 + 2)",
            "Final Answer: 4"
        ]
        llm = FakeListLLM(responses=responses)

        return llm 

In [16]:
MODEL_NAME = "koala-7b"
llm = load_model(MODEL_NAME)

Loading checkpoint shards: 100%|████████████████| 14/14 [00:51<00:00,  3.68s/it]


# Base Prompt 

In [19]:
base_prompt = """ This is a podcast series on financial literacy in Singapore. It is called The Financial Coconuts.\n 

The title for this episode is: 

{title}

# Write a concise summary of the podcast: \n

{text}

Only include facts from the context. \n 
CONCISE SUMMARY:"""

BASE_PROMPT = PromptTemplate(template=base_prompt, 
                        input_variables=["text", "title"])

### Sample File Loading

In [9]:
transcript_filepath = "/home/munfai98/Documents/fc-transcripts/financial-coconut-transcripts/cleaned_transcripts/Is Budgeting The Only Way To Manage Your Money.json"
data = read_transcript(transcript_filepath)
title = extract_title(transcript_filepath)

In [10]:
print(title)

Is Budgeting The Only Way To Manage Your Money


In [32]:
text_splitter = CharacterTextSplitter(separator=". ", chunk_size=5000, chunk_overlap=20)

texts = text_splitter.split_text(data)
# print(texts)
docs = [Document(page_content=t) for t in texts]
print(len(docs))

7


In [33]:
chain = load_summarize_chain(llm, 
                            chain_type="map_reduce",
                            map_prompt=BASE_PROMPT, 
                            combine_prompt=BASE_PROMPT)

In [34]:
output_summary = chain.run(input_documents=docs, title=title)
wrapped_text = textwrap.fill(output_summary, 
                            width=100,
                            break_long_words=False,
                            replace_whitespace=False)
print(wrapped_text)



 

This episode of The Financial Coconuts features two guests who share their personal experiences
and strategies for managing money effectively. One guest talks about his approach to budgeting and
managing his family's finances, including using a combination of manual tracking and app-based
tools. Another guest shares his personal financial mistakes and successes, including his best
investment of reading up on investment books and taking courses. Both guests encourage listeners to
explore other resources in the field of personal finance and investment.


In [37]:
torch.cuda.empty_cache()
gc.collect()

!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Apr 19 12:13:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:07:00.0  On |                  Off |
| 30%   45C    P8    31W / 450W |   9554MiB / 24564MiB |     24%      Default |
|                               |   

# Pipeline 

I hope to build a pipeline that goes through all the different transcript files. 

There are 2 main challenges: 
1. OOM issues 
2. Writing results and not losing them. 

In [48]:
path = '/home/munfai98/Documents/fc-transcripts/financial-coconut-transcripts/cleaned_transcripts/'
dir_list = [path + i for i in os.listdir(path)]

print(len(dir_list))

19


In [60]:
summary_filepath = "/home/munfai98/Documents/fc-transcripts/financial-coconut-transcripts/summary/generated_summaries.json"
skip_list = ['Is Budgeting The Only Way To Manage Your Money'] #completed transcripts
try:
    with open(summary_filepath) as f:
        summary_data = json.load(f)
except: 
    summary_data = []
    
for f in dir_list: 
    data = read_transcript(f)
    title = extract_title(f)
    
    if title in skip_list: 
        pass
    
    else: 
        texts = text_splitter.split_text(data)
        docs = [Document(page_content=t) for t in texts]

        print(f"Summarising '{title}'...")

        output_summary = chain.run(input_documents=docs, title=title)

        # Write into JSON file 
        summary_dict = {
            "title": title, 
            "summary": output_summary
        }

        print(summary_dict)
        print("\n")

        summary_data.append(summary_dict)

        with open(summary_filepath, 'w', encoding ='utf8') as f:
            json.dump(summary_data, f, allow_nan=False)

        torch.cuda.empty_cache()
        gc.collect() 

Summarising 'Saving Hacks In Singapore'...
{'title': 'Saving Hacks In Singapore', 'summary': ' \n\nThis episode of The Financial Coconuts focuses on ways to save money in Singapore by opening a new bank account and setting it as a savings account, creating an automatic transfer from your paycheck account, and using Maybank for your savings account. The speaker also talks about sharing your savings plan with others and creating an accountability system through the use of a bank card.'}


Summarising 'What’s The Best Way To Budget Your Salary'...


Token indices sequence length is longer than the specified maximum sequence length for this model (1080 > 1024). Running this sequence through the model will result in indexing errors


{'title': 'What’s The Best Way To Budget Your Salary', 'summary': '\n\n*   The speaker started their YouTube channel "Lisa\'s Adulting in Singapore" to document their journey with adulting and share tips on managing personal finance, housing, and more.\n*   Their content focuses on the low buy project, where they document every purchase made during the year.\n*   The speaker mentions that they made a video on how they bought a resale HDB at 24 years old as a single.\n*   They talk about the importance of context in understanding financial situations.\n*   The speaker shares their experiences of putting themselves out there and receiving feedback on their financial practices through their podcast "The Financial Coconuts".\n*   They mention that the financial community in Singapore is diverse and includes both traditional and alternative approaches to managing finances.'}


Summarising 'The Value Of University'...


Token indices sequence length is longer than the specified maximum sequence length for this model (1149 > 1024). Running this sequence through the model will result in indexing errors


{'title': 'The Value Of University', 'summary': ' In this episode of The Financial Coconuts, the speaker discusses the value of university education and how it can benefit individuals and society. They mention the importance of alumni involvement in supporting universities and schools, including volunteering time and resources. The speaker also mentions several examples of successful individuals who did not attend traditional universities, including themselves (founder of KinoBee) and others who started their careers outside of academia.'}


Summarising 'How To Have A Healthy Relationship With Money'...
{'title': 'How To Have A Healthy Relationship With Money', 'summary': 'In this episode of The Financial Coconuts, the speaker talks about how to cultivate a healthy relationship with money through financial journaling, fasting, and sharing knowledge with others.'}


Summarising 'How To Create A Better Budget'...
{'title': 'How To Create A Better Budget', 'summary': ' In this episode of 

Token indices sequence length is longer than the specified maximum sequence length for this model (1107 > 1024). Running this sequence through the model will result in indexing errors


{'title': 'Women & Money', 'summary': ' \n\nThis episode of The Financial Coconuts focuses on the topic of women and money. Hosts Jeanette, Sarah, and Evelyn share their personal experiences and insights related to financial planning and wealth management. They discuss the challenges faced by women in managing their finances and relationships, including the impact of societal expectations and gender roles. They also touch upon the importance of finding support and guidance in managing finances and making decisions about money.'}


Summarising 'Have Fun In Singapore On A Budget'...
{'title': 'Have Fun In Singapore On A Budget', 'summary': ' \n\nThis episode of The Financial Coconuts focuses on how to have fun on a budget in Singapore. Host Michael Yip discusses ways to save money while still enjoying life, including joining interest groups, hosting potluck parties, and making use of public spaces.'}


Summarising 'Maximize Your Budget Travels With These Tips'...
{'title': 'Maximize Your