In [1]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader
loader=TextLoader("speech.txt")
text_documents=loader.load()
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We see no indemnities for ourselves, no material compensation for the sacrifices we shall freely make.')]

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Web based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

## Load, chunk, and index the content of the html page

loader=WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                         class_=("post-title","post_content","post_header")
                     )),)

text_documents=loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
text_documents

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n      LLM Powered Autonomous Agents\n    ')]

In [5]:
from langchain_community.document_loaders import PyPDFLoader
loader=PyPDFLoader('attention.pdf')
docs=loader.load()

In [6]:
docs

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-06-27T11:07:02+00:00', 'title': 'ST 4035 - Group project proposal - Group G', 'moddate': '2025-06-27T11:07:01+00:00', 'keywords': 'DAGKon8PbcA,BAEqMF8krfE,0', 'author': 'Umayangana Ganeshi', 'source': 'attention.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="PROJECT PROPOSAL Prepared by : Group G\nUnderstanding and predicting happiness levels among residents of Somerville, New Jersey, is essential\nfor enhancing community well-being. Happiness is a fundamental indicator of life satisfaction and\nreflects the quality of life in a city. This study aims to develop and implement predictive models that\ncan accurately predict happiness levels. By leveraging machine learning techniques, this study seeks to\nidentify key predictors of happiness and create predictive models. Additionally, developing a web-\nbased dashboard or mobile solution will facilitate the presentation and accessibilit

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(docs)
documents[:5]

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-06-27T11:07:02+00:00', 'title': 'ST 4035 - Group project proposal - Group G', 'moddate': '2025-06-27T11:07:01+00:00', 'keywords': 'DAGKon8PbcA,BAEqMF8krfE,0', 'author': 'Umayangana Ganeshi', 'source': 'attention.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='PROJECT PROPOSAL Prepared by : Group G\nUnderstanding and predicting happiness levels among residents of Somerville, New Jersey, is essential\nfor enhancing community well-being. Happiness is a fundamental indicator of life satisfaction and\nreflects the quality of life in a city. This study aims to develop and implement predictive models that\ncan accurately predict happiness levels. By leveraging machine learning techniques, this study seeks to\nidentify key predictors of happiness and create predictive models. Additionally, developing a web-\nbased dashboard or mobile solution will facilitate the presentation and accessibilit

In [8]:
documents

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-06-27T11:07:02+00:00', 'title': 'ST 4035 - Group project proposal - Group G', 'moddate': '2025-06-27T11:07:01+00:00', 'keywords': 'DAGKon8PbcA,BAEqMF8krfE,0', 'author': 'Umayangana Ganeshi', 'source': 'attention.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='PROJECT PROPOSAL Prepared by : Group G\nUnderstanding and predicting happiness levels among residents of Somerville, New Jersey, is essential\nfor enhancing community well-being. Happiness is a fundamental indicator of life satisfaction and\nreflects the quality of life in a city. This study aims to develop and implement predictive models that\ncan accurately predict happiness levels. By leveraging machine learning techniques, this study seeks to\nidentify key predictors of happiness and create predictive models. Additionally, developing a web-\nbased dashboard or mobile solution will facilitate the presentation and accessibilit

In [9]:
## Vector embedding and vector store

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
db1=Chroma.from_documents(documents,OllamaEmbeddings())

# Please close other applications if a memory error occurs.

  db1=Chroma.from_documents(documents,OllamaEmbeddings())


In [10]:
## Vector database
query="Implement machine learning models suitable for multiclass classification"
result=db1.similarity_search(query)
result[0].page_content

'DATA\nThe dataset contains Somerville Happiness Survey results from 2011 - 2023. It consists of 11191\nobservations. The survey asks residents of the city to assess their personal happiness, wellbeing,\nand satisfaction with city services. The dependent variable includes the levels of happiness on a 5 -\npoint scale.\nSUGGESTED METHODOLOGY\nConduct a comprehensive descriptive analysis, identifying patterns and potentially significant variables\nby using suitable figures and tables.\nImplement machine learning models suitable for multiclass classification, such as Multinomial Logistic\nRegression, KNN, Naive Bayes, Decision Trees, Random Forest, SVM and XGboost classifiers. Use\ncross validation to fine tune parameters.\n15747 - Amila Poorna\n15665 - Tashini Ramindi\n15642 - Hasini Nimesha\n15669 - Ganeshi Umayangana\nMACHINE LEARNING INSIGHTS FOR\nSOMERVILLE HAPPINESS LEVEL\nPREDICTION\nDataset:\nBACKGROUND OF THE PROJECT\nSomerville Happiness Survey Responses\nData dictionary:'

In [11]:
## FAISS vector database
from langchain_community.vectorstores import FAISS
db2=FAISS.from_documents(documents,OllamaEmbeddings())

In [12]:
query="Implement machine learning models suitable for multiclass classification"
result=db2.similarity_search(query)
result[0].page_content

'DATA\nThe dataset contains Somerville Happiness Survey results from 2011 - 2023. It consists of 11191\nobservations. The survey asks residents of the city to assess their personal happiness, wellbeing,\nand satisfaction with city services. The dependent variable includes the levels of happiness on a 5 -\npoint scale.\nSUGGESTED METHODOLOGY\nConduct a comprehensive descriptive analysis, identifying patterns and potentially significant variables\nby using suitable figures and tables.\nImplement machine learning models suitable for multiclass classification, such as Multinomial Logistic\nRegression, KNN, Naive Bayes, Decision Trees, Random Forest, SVM and XGboost classifiers. Use\ncross validation to fine tune parameters.\n15747 - Amila Poorna\n15665 - Tashini Ramindi\n15642 - Hasini Nimesha\n15669 - Ganeshi Umayangana\nMACHINE LEARNING INSIGHTS FOR\nSOMERVILLE HAPPINESS LEVEL\nPREDICTION\nDataset:\nBACKGROUND OF THE PROJECT\nSomerville Happiness Survey Responses\nData dictionary:'

In [13]:
## Lance vector database
from langchain_community.vectorstores import LanceDB
db3=LanceDB.from_documents(documents,OllamaEmbeddings())

In [14]:
query="Implement machine learning models suitable for multiclass classification"
result=db3.similarity_search(query)
result[0].page_content

'DATA\nThe dataset contains Somerville Happiness Survey results from 2011 - 2023. It consists of 11191\nobservations. The survey asks residents of the city to assess their personal happiness, wellbeing,\nand satisfaction with city services. The dependent variable includes the levels of happiness on a 5 -\npoint scale.\nSUGGESTED METHODOLOGY\nConduct a comprehensive descriptive analysis, identifying patterns and potentially significant variables\nby using suitable figures and tables.\nImplement machine learning models suitable for multiclass classification, such as Multinomial Logistic\nRegression, KNN, Naive Bayes, Decision Trees, Random Forest, SVM and XGboost classifiers. Use\ncross validation to fine tune parameters.\n15747 - Amila Poorna\n15665 - Tashini Ramindi\n15642 - Hasini Nimesha\n15669 - Ganeshi Umayangana\nMACHINE LEARNING INSIGHTS FOR\nSOMERVILLE HAPPINESS LEVEL\nPREDICTION\nDataset:\nBACKGROUND OF THE PROJECT\nSomerville Happiness Survey Responses\nData dictionary:'