In [23]:
import torch
print(torch.cuda.is_available()) 
print(torch.cuda.get_device_name(0))  # GPU name

True
NVIDIA GeForce GTX 1650


In [24]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
import ollama
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

In [None]:
# loader = PyPDFLoader("./doc/budget-2024.pdf")
# doc = loader.load()
# type(doc)

list

In [14]:
# text_spliter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 30, separator = "\n")
# docs = text_spliter.split_documents(documents = doc)


In [25]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"   # It maps sentences & paragraphs to a 384 dimensional dense vector space
model_kwargs = {'device': 'cuda'} # it will use googl cloud gpu..
embeddings = HuggingFaceEmbeddings(model_name=embedding_model, model_kwargs=model_kwargs) #model_kwargs=model_kwargs

In [None]:
#vectorstore = FAISS.from_documents(docs, embeddings)

In [None]:
#vectorstore.save_local("faiss_index")

In [26]:
#loading embedding...
embedding_vectorestore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [27]:
retriever = embedding_vectorestore.as_retriever(search_type="similarity")

curl -fsSL https://ollama.ai/install.sh | sh

ollama serve & ollama pull llama3., llama3.2:3b

In [31]:
from langchain_ollama import OllamaLLM
llm = OllamaLLM(model="llama3.2:3b")

In [32]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [34]:
result = qa.invoke('How much new financing is being unlocked to build more rental apartments per year?')
print(result)

{'query': 'How much new financing is being unlocked to build more rental apartments per year?', 'result': 'The amount of new financing being unlocked to build more rental apartments per year is $20 billion. This will increase the annual limit for Canada Mortgage Bonds from $40 billion to up to $60 billion, allowing for 30,000 more rental apartments to be built per year over the next three years and more than 750,000 homes across Canada over the next decade.'}


# Evaluation

In [46]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [36]:
queries = ['How much new financing is being unlocked to build more rental apartments per year?',
  'What is the goal of the Public Lands for Homes Plan?',
  'How many properties does National Defence own across Canada, and what is their total area?',
  'What is the total investment announced in Budget 2024 for the Apartment Construction Loan Program',
  'How much funding is proposed for the Canada Housing Infrastructure Fund in Budget 2024?',
  'What is the total investment commitment made by the Canada Infrastructure Bank since 2017?',
  'How much funding is proposed for the Homebuilding Technology and Innovation Fund in Budget 2024?',
  'How many workers does the construction sector in Canada face a shortage by 2032?',
  'What is the proposed funding amount in Budget 2024 for creating apprenticeship opportunities?',
  'What do young adults in Canada feel regarding the possibility of owning a home?']

references = ['$20 billion to build 30,000 more rental apartments per year.',
  'The goal is to unlock 250,000 new homes by 2031',
  'National Defence owns 622 properties totaling 2.2 million hectares across every province and territory.',
  'Budget 2024 announces an additional $15 billion for the Apartment Construction Loan Program',
  'The proposed funding is $6 billion over 10 years',
  'The total investment commitment is over $11 billion.',
  'The proposed funding is $50 million over two years',
  'The sector faces a shortage of over 60,000 workers by 2032',
  '$100 million over two years',
  'Young adults feel that owning a home is becoming less likely due to rising home prices outpacing their salaries']

In [37]:
predictions = [qa.invoke(query)["result"] for query in queries]


In [38]:
predictions

['According to the text, over the next three years, more than 750,000 homes across Canada will be built. Over the next decade, it is not specified exactly how many new rentals per year are being unlocked but the total amount of $20 billion in new financing mentioned earlier indicates that for the next three years this amount was available and for the next decade a similar funding increase ($40-60 billion) is proposed to bring up to 30,000 more rental apartments per year.',
 'The goal of the Public Lands for Homes Plan is to unlock 250,000 new homes by 2031.',
 'According to the context provided, National Defence owns 622 properties across every province and territory in Canada. The total area of these properties is 2.2 million hectares.',
 'The total investment announced in Budget 2024 for the Apartment Construction Loan Program is $55 billion.',
 'The federal government proposes to provide $6 billion over 10 years, starting in 2024-25, to launch a new Canada Housing Infrastructure Fun

In [44]:
#Note: Blue score won't give me correct answer....
# smooth = SmoothingFunction().method4
# scores = [sentence_bleu([ref], pred.split(), smoothing_function=smooth) for ref, pred in zip(references, predictions)]
# print("BLEU Scores:", scores)


```
ROUGE-1: Measures unigram (single word) overlap between the reference and prediction.
ROUGE-2: Measures bigram (two consecutive words) overlap. This is more strict than ROUGE-1, as it evaluates phrase-level similarity.
ROUGE-L: Measures the longest common subsequence (LCS).
```

In [47]:
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]

# Display the scores
for i, score in enumerate(rouge_scores):
    print(f"Reference {i+1} vs Prediction {i+1}:")
    print(f"ROUGE-1: {score['rouge1'].fmeasure:.4f}, "
          f"ROUGE-2: {score['rouge2'].fmeasure:.4f}, "
          f"ROUGE-L: {score['rougeL'].fmeasure:.4f}")
    print()

Reference 1 vs Prediction 1:
ROUGE-1: 0.2083, ROUGE-2: 0.1489, ROUGE-L: 0.2083

Reference 2 vs Prediction 2:
ROUGE-1: 0.7586, ROUGE-2: 0.6667, ROUGE-L: 0.7586

Reference 3 vs Prediction 3:
ROUGE-1: 0.6977, ROUGE-2: 0.5366, ROUGE-L: 0.4651

Reference 4 vs Prediction 4:
ROUGE-1: 0.6897, ROUGE-2: 0.4444, ROUGE-L: 0.5517

Reference 5 vs Prediction 5:
ROUGE-1: 0.5000, ROUGE-2: 0.2667, ROUGE-L: 0.4375

Reference 6 vs Prediction 6:
ROUGE-1: 0.3750, ROUGE-2: 0.2000, ROUGE-L: 0.3125

Reference 7 vs Prediction 7:
ROUGE-1: 0.5000, ROUGE-2: 0.2353, ROUGE-L: 0.2778

Reference 8 vs Prediction 8:
ROUGE-1: 0.8889, ROUGE-2: 0.7200, ROUGE-L: 0.8889

Reference 9 vs Prediction 9:
ROUGE-1: 0.3077, ROUGE-2: 0.2500, ROUGE-L: 0.3077

Reference 10 vs Prediction 10:
ROUGE-1: 0.4789, ROUGE-2: 0.2899, ROUGE-L: 0.4507



In [45]:
import torch
torch.cuda.empty_cache()
