In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Load environment variables
load_dotenv()

# Set environment variables
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

# Load web content
loader = WebBaseLoader(web_paths=["https://eecs481.org/hw6.html"])
documents = loader.load()

# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
splits = text_splitter.split_documents(documents)

# Create embeddings and vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(splits, embeddings)

# Create a retrieval chain
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Function to ask questions
def ask_question(question):
    response = qa_chain.invoke({"query": question})
    return response["result"]

USER_AGENT environment variable not set, consider setting it to identify your requests.
  warn_deprecated(


In [3]:
# Example usage
question = "What do I have to do in HW6A? List down all the tasks and their headings"
answer = ask_question(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What do I have to do in HW6A? List down all the tasks and their headings
Answer: In HW6A, you are required to select a project and task(s) to contribute to an open-source GitHub project. The tasks and headings for HW6A are as follows:

1. Initial Report:
   - Project Description: A brief description of the open-source project you have selected to contribute to.
   - Task Selection: Description of the task(s) you have chosen to work on within the project.
   - Proposed Schedule: An estimated schedule with effort estimates for completing the selected task(s).

2. Task Planning:
   - Task Scope Justification: Evidence that the tasks are of sufficient size and complexity for this assignment.

3. Performing the Task:
   - Task Link(s): Evidence that the task(s) are requested by the community, with a screenshot or issue tracking link.
   - Requirements: Description of each task's requirements, both functional and quality.

Remember to read the entire homework description for detail

In [6]:
# Example usage
question = "What do I have to do in HW6b — Project Report?"
answer = ask_question(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What do I have to do in HW6b — Project Report?
Answer: In HW6b — Project Report, you are required to submit a PDF report that includes the following components:

1. Names and email ids: Include your name and UM email id. If you have a partner, include their name and email id as well.
2. Selected project: Provide a brief description of the open-source system you contributed to.
3. Project context: Analyze the open-source project's context and "business model." This may include a short history of the project, competing open- and closed-source projects, or a discussion of the developers' motivations to build the system.
4. Task description (per task): Describe the tasks you implemented and how you implemented them.
5. Advice for future students: Provide a single sentence of advice for future students taking the class.
6. Reflection: Reflect on your experience contributing to the open-source project, discussing what worked, what did not work, and what you would do differently.
7.

In [45]:
# Example usage
question = "Suggest me on how I can do this project?"
answer = ask_question(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Suggest me on how I can do this project?
Answer: Based on the context provided, here are some steps you can follow to work on this project effectively:

1. **Research and Select a Project**: Start by researching open-source projects and select one that interests you. Consider factors like the type of software, project age, active contributors, communication among contributors, etc.

2. **Understand the Project**: Read the documentation, build and execute the source code to understand how your modification fits into the project. Ensure that your task is non-trivial but doable within the available resources.

3. **Select a Task**: Choose a task that aligns with the project's needs and your skills. Consider both functional and non-functional requirements and how the task fits into the project structure.

4. **Contribute to the Project**: Make your changes to the project and create any necessary documentation for acceptance. Submit your changes through mechanisms like pull reques

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

# Set environment variables
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

In [2]:
# # Data Ingestion
# from langchain_community.document_loaders import TextLoader
# loader = TextLoader("Speech.txt")
# text_documents = loader.load()

In [13]:
from langchain_community.document_loaders import WebBaseLoader
import bs4

# load, chunk and index the content for the html page
loader = WebBaseLoader(web_paths=["https://eecs481.org/hw6.html"])
web_documents = loader.load()

In [14]:
web_documents

[Document(metadata={'source': 'https://eecs481.org/hw6.html', 'title': 'EECS 481 — Software Engineering', 'language': 'en'}, page_content='\n\n\n\n\n\nEECS 481 — Software Engineering\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\nSoftware Engineering\n\n\n\n\nLectures\n\nAssignments\n              \n\nHW0 — Dev Setup\nHW1 — Test Coverage\nHW2 — Test Automation\nHW3 — Mutation Testing\nHW4 — Defect Detection\nHW5 — Debugging Automation\nHW6 — Contribution\n\nHomework Submission\n\n\nExams\nAdvice\n\n\nSwitch Color Theme\n\n\n\n\n\n\n\nHomework Assignment #6 — Contribution\n\n        In this assignment you will contribute to an open source GitHub project.\n      \n\n        Your high-level goal is to produce and submit a non-trivial modification or\n        extension to an open-source project in a way that maximizes the chances\n        that the project maintainers accept it. If you demonstrate to us that your\n        change has been accepted and integrated into the project’s code base, 

In [20]:
# pdf
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("attention.pdf")
pdf_documents = loader.load()

In [15]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# documents = text_splitter.split_documents(pdf_documents)

In [23]:
documents[:5]

[Document(metadata={'source': 'attention.pdf', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\n

In [16]:
# Vector embbedings and Vector store
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

db_chroma = Chroma.from_documents(web_documents, OpenAIEmbeddings())

  warn_deprecated(


In [17]:
query = "WWhat is this project about?"
result = db_chroma.similarity_search(query)
result[0].page_content

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


'\n\n\n\n\n\nEECS 481 — Software Engineering\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\nSoftware Engineering\n\n\n\n\nLectures\n\nAssignments\n              \n\nHW0 — Dev Setup\nHW1 — Test Coverage\nHW2 — Test Automation\nHW3 — Mutation Testing\nHW4 — Defect Detection\nHW5 — Debugging Automation\nHW6 — Contribution\n\nHomework Submission\n\n\nExams\nAdvice\n\n\nSwitch Color Theme\n\n\n\n\n\n\n\nHomework Assignment #6 — Contribution\n\n        In this assignment you will contribute to an open source GitHub project.\n      \n\n        Your high-level goal is to produce and submit a non-trivial modification or\n        extension to an open-source project in a way that maximizes the chances\n        that the project maintainers accept it. If you demonstrate to us that your\n        change has been accepted and integrated into the project’s code base, you\n        will receive +6% bonus points on HW6b. You will select an open source\n        project, select a change to implement, actuall

In [38]:
## FAISS Vector Database
from langchain_community.vectorstores import FAISS
db_faiss = FAISS.from_documents(documents[:15], OpenAIEmbeddings())

In [39]:
query = "Who are the authors of the paper Attention is All you Need research paper?"
query = "What is Attention is all you Need?"
result = db_faiss.similarity_search(query)
result[0].page_content

'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and con