In [1]:
import re
import os
import numpy as np
from sklearn.cluster import KMeans
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS

In [2]:
os.environ["OPENAI_API_KEY"] =  "sk- *************************"
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.4
)
embeddings = OpenAIEmbeddings()

In [3]:
def clean_sentence(sentence):
    match = re.search(r"[a-zA-Z0-9]", sentence)
    if match:
        start_index = match.start()
        cleaned_sentence = sentence[start_index:]
    else:
        cleaned_sentence = ""
    cleaned_sentence = cleaned_sentence.replace("\n", " ")
    cleaned_sentence = cleaned_sentence.strip()
    cleaned_sentence = re.sub(r"[@]+", "@", cleaned_sentence)
    cleaned_sentence = re.sub(r"[.]+", ".", cleaned_sentence)
    cleaned_sentence = re.sub(r"[']+", "'", cleaned_sentence)
    cleaned_sentence = re.sub(r"[^a-z.A-Z@0-9\s'?:,àéêûî]", "", cleaned_sentence)
    cleaned_sentence = re.sub(r"\s{2,}", " ", cleaned_sentence)
    return cleaned_sentence

In [4]:
file = "crime-and-punishment.pdf"
loader = PyPDFLoader(file)
pages = loader.load()

In [5]:
text = ""

for page in pages:
    text += page.page_content
    
text = clean_sentence(text)

In [65]:
num_tokens = llm.get_num_tokens(text)

print ("Total Tokens:",num_tokens)

Total Tokens: 302371


In [8]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "\t"], chunk_size=15000, chunk_overlap=1000)

docs = text_splitter.create_documents([text])

In [66]:
num_documents = len(docs)

print (f"Splitted Documents {num_documents}")

Splitted Documents 84


In [10]:
embeddings = OpenAIEmbeddings()

vectors = embeddings.embed_documents([x.page_content for x in docs])

In [24]:
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)

In [25]:
kmeans.labels_

array([26,  2, 42,  9,  3, 30, 30,  8, 36, 20, 37,  7,  7, 14, 14, 23, 31,
       27, 22, 22, 21, 34, 27,  4,  4,  4, 35, 12, 43, 29,  0,  0,  0, 28,
        0, 25, 25, 25, 11,  5,  5, 47, 47,  6,  6,  6, 10, 10, 45, 13, 13,
       38, 11, 11, 11, 17, 17, 33, 19, 19,  1,  1,  1, 15, 15, 32, 32, 27,
       11, 24, 24, 44, 44, 46, 16, 40, 18, 39, 41, 49, 36,  3, 48, 48],
      dtype=int32)

In [27]:
closest_chunks = []
for i in range(num_clusters):
    distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
    closest_index = np.argmin(distances)
    closest_chunks.append(closest_index)

In [29]:
selected_chunks = sorted(closest_chunks)
selected_chunks

[0,
 1,
 2,
 3,
 4,
 5,
 7,
 8,
 9,
 10,
 12,
 13,
 15,
 16,
 17,
 18,
 20,
 21,
 24,
 26,
 27,
 28,
 29,
 31,
 33,
 36,
 39,
 41,
 44,
 46,
 48,
 49,
 51,
 53,
 55,
 57,
 58,
 61,
 63,
 65,
 69,
 71,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 82]

In [30]:
map_prompt = """
You will be given a single passage of book.The passage will be enclosed in triple backticks (```)
Your goal is to give a summary of the passage in a way that reader will have a full understanding of what happened.
Your response should be at least four paragraphs and fully encompass what was said in the passage.
```{text}```
FULL SUMMARY:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

In [31]:
Summarize_Chain = load_summarize_chain(llm=llm,
                             chain_type="stuff",
                             prompt=map_prompt_template)

In [32]:
selected_docs = [docs[doc] for doc in selected_chunks]

In [33]:
summary = []
for i, doc in enumerate(selected_docs):
    # Go get a summary of the chunk
    chunk_summary = Summarize_Chain.run([doc])
    summary.append(chunk_summary)
    print (f"Summary {i} (chunk No.{selected_chunks[i]}) - {chunk_summary} \n")

Summary 0 (chunk No.0) - The passage introduces the background of Fyodor Dostoevsky, detailing his upbringing, education, and the hardships he faced, including being sentenced to death and later to hard labor. It also mentions his struggles with illness, poverty, and loss throughout his life. Despite these challenges, Dostoevsky's resilience and religious beliefs shaped his perspective on suffering, which is reflected in his writings. The passage highlights his profound insights and wisdom, as well as his impact on Russian literature.

The passage then shifts to a scene from "Crime and Punishment," focusing on the main character, a young man named Raskolnikov, who is portrayed as impoverished, isolated, and mentally unstable. The narrative delves into Raskolnikov's inner turmoil, fear of meeting his landlady, and his contemplation of committing a crime. His erratic thoughts and behavior showcase his deteriorating mental state and growing desperation. The vivid descriptions of the setti

In [68]:
summaries = "\n".join(summary)
summaries = Document(page_content=summaries)
print (f"Total Summary tokens {llm.get_num_tokens(summaries.page_content)} ")

Total Summary tokens 17804 


In [37]:
summaries



In [71]:
llm3 = ChatOpenAI(temperature=0.5,
                #  max_tokens=3000,
                 model='gpt-4-32k',
                 request_timeout=120
                )

In [73]:
combine_prompt = """
You will be given a series of summaries from a book. The summaries will be enclosed in triple backticks (```)
Your goal is to give a verbose summary of what happened in the story.
The reader should be able to grasp what happened in the book.

```{text}```
SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [74]:
reduce_chain = load_summarize_chain(llm=llm3,
                             chain_type="stuff",
                             prompt=combine_prompt_template,
                                   )

In [75]:
output = reduce_chain.run([summaries])

In [76]:
print (output)

The book begins by introducing the life of Fyodor Dostoevsky, a renowned Russian author who faced numerous hardships in his life. Despite these challenges, Dostoevsky's resilience and religious beliefs shaped his perspective on suffering, which is reflected in his writings. The narrative then shifts to one of Dostoevsky's most famous works, "Crime and Punishment," focusing on the protagonist, Raskolnikov. Raskolnikov is portrayed as a young man grappling with poverty, isolation, and mental instability. The narrative delves into Raskolnikov's internal struggles, his fear of meeting his landlady, and his contemplation of committing a crime. His erratic thoughts and behavior showcase his deteriorating mental state and growing desperation.

Throughout the story, Raskolnikov interacts with various characters, each with their own struggles and hardships. One such character is Marmeladov, a retired clerk struggling with poverty and alcoholism. Despite Marmeladov's drunken state, he speaks elo

In [78]:
from fpdf import FPDF
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Crime and Punishment Summary', 0, 1, 'C')

    def chapter_body(self, body):
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

pdf = PDF()
pdf.add_page()
pdf.set_left_margin(10)
pdf.set_right_margin(10)
pdf.set_auto_page_break(auto=True, margin=15)
pdf.chapter_body(output)
pdf.output("Book Summary.pdf")


''