diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..946913d39
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+__pycache__/*
+.idea
+.idea/*
+node_modules
+*.docx
+*.DS_Store
+*.iml
+*.log
+*.csv
+*.pyc
+*/subtitles.json
+docs/
diff --git a/ChatGPTQuerier/chat_coursera.py b/ChatGPTQuerier/chat_coursera.py
new file mode 100644
index 000000000..db4a2c5bd
--- /dev/null
+++ b/ChatGPTQuerier/chat_coursera.py
@@ -0,0 +1,44 @@
+import openai
+import os
+from langchain.chains import RetrievalQA
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import JSONLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.chroma import Chroma
+
+from dotenv import load_dotenv, find_dotenv
+
+
+_ = load_dotenv(find_dotenv())  # read local .env file
+loader = JSONLoader(
+    file_path='./chat_subtitles.json',
+    jq_schema='.filler[].text',
+    text_content=False)
+
+docs = loader.load()
+r_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=150,
+    chunk_overlap=0,
+    separators=["\n\n", "\n", ". ", " ", ""]  # literal separators; ". " splits on sentence boundaries
+)
+trans_docs = r_splitter.split_documents(docs)
+
+# print(trans_docs)
+
+persist_directory = 'docs/chroma/'
+embedding = OpenAIEmbeddings()
+vectordb = Chroma(
+    persist_directory=persist_directory,
+    embedding_function=embedding
+)
+vectordb.add_documents(trans_docs)  # index the split transcript chunks, not the unsplit docs
+
+
+llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
+
+qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())
+while True:
+    question = input()
+    result = qa_chain({"query": question})
+    print(result["result"])
\ No newline at end of file
diff --git a/ChatGPTQuerier/chat_subtitles.json b/ChatGPTQuerier/chat_subtitles.json
new file mode 100644
index 000000000..6d9270eeb
--- /dev/null
+++ b/ChatGPTQuerier/chat_subtitles.json
@@ -0,0 +1,124 @@
+{
+  "filler": [
+    {
+      "time": "0:00",
+      "text": "[SOUND] Hello. Welcome to the course Text Mining and Analytics. My name is ChengXiang Zhai. I have a nickname, Cheng. I am a professor of the Department of Computer Science at the University of Illinois at Urbana-Champaign. This course is a part of a data mining specialization offered by the University of Illinois at Urbana-Champaign. In addition to this course, there are four other courses offered by",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "0:39",
+      "text": "Professor Jiawei Han, Professor John Hart and me, followed by a capstone project course that all of us will teach together.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "0:51",
+      "text": "This course is particularly related to another course in the specialization, mainly text retrieval and search engines in that both courses are about text data.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "1:07",
+      "text": "In contrast, pattern discovery and cluster analysis are about algorithms more applicable to all kinds of data in general. 
The visualization course is also relatively general in that the techniques can be applied to all kinds of data.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "1:28",
+      "text": "This course addresses a pressing need for harnessing big text data.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "1:35",
+      "text": "Text data has been growing dramatically recently, mostly because of the advance of technologies deployed on the web that would enable people to quickly generate text data.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "1:50",
+      "text": "So, I listed some of the examples on this slide",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "1:57",
+      "text": "that can show a variety of text data that are available today. For example, if you think about the data on the internet, on the web,",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "2:07",
+      "text": "everyday we are seeing many web pages being created.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "2:13",
+      "text": "Blogs are another kind of new text data that are being generated quickly by people. Anyone can write a blog article on the web. New articles of course have always been a main kind of text data that being generated everyday.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "2:31",
+      "text": "Emails are yet another kind of text data. And literature is also representing a large portion of text data. It's also especially very important because of the high quality in the data. That is, we encode our knowledge about the word using text data represented by all the literature articles. It's a vast amount of knowledge of",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "3:08",
+      "text": "all the text and data in these literature articles.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "3:14",
+      "text": "Twitter is another representative text data representing social media. Of course there are forums as well.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "3:24",
+      "text": "People are generating tweets very quickly indeed as we are speaking perhaps many people have already written many tweets. So, as you can see there are all kinds of text data that are being generated very quickly.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "3:38",
+      "text": "Now these text data present some challenges for people.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "3:43",
+      "text": "It's very hard for anyone to digest all the text data quickly. 
In particular, it's impossible for scientists to read all of the for example or for anyone to read all the tweets.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "4:01",
+      "text": "So there's a need for tools to help people digest text data more efficiently.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "4:09",
+      "text": "There is also another interesting opportunity provided by such big text data, and that is it's possible to leverage the amount of text data to discover interesting patterns to turn text data into actionable knowledge that can be useful for decision making.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "4:27",
+      "text": "So for example, product managers may be interested in knowing the feedback of customers about their products, knowing how well their products are being received as compared with the products of competitors. This can be a good opportunity for leveraging text data as we have seen a lot of reviews of product on the web. So if we can develop a master text mining techniques to tap into such a [INAUDIBLE] to extract the knowledge and opinions of people about these products, then we can help these product managers to gain business intelligence or to essentially feedback from their customers.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "5:18",
+      "text": "In scientific research, for example, scientists are interested in knowing the trends of research topics, knowing",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "5:29",
+      "text": "about what related fields have discovered. This problem is especially important in biology research as well. Different communities tend to use different terminologies, yet they're starting very similar problems. So how can we integrate the knowledge that is covered in different communities to help study a particular problem? It's very important, and it can speed up scientific discovery.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "5:57",
+      "text": "So there are many such examples where we can leverage the text data to discover useable knowledge to optimize our decision.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "6:06",
+      "text": "The main techniques for harnessing big text data are text retrieval and text mining. So these are two very much related technologies.Yet, they have somewhat different purposes. These two kinds of techniques are covered in the tool in this specialization. So, text retrieval on search engines covers text retrieval, and this is necessary to turn big text data into a much smaller but more relevant text data, which are often the data that we need to handle a particular problem or to optimize a particular decision. This course covers text mining which is a second step in this pipeline that can be used to further process the small amount of relevant data to extract the knowledge or to help people digest the text data easily. 
So the two courses are clearly related, in fact, some of the techniques are shared by both text retrieval and text mining. If you have already taken the text retrieval course, then you might see some of the content being repeated in this text mining course, although we'll be talking about the techniques from a very different perspective. If you have not taken the text retrieval course, it's also fine because this course is self-contained and you can certainly understand all of the materials without a problem. Of course, you might find it beneficial to take both courses and that will give you a very complete set of skills to handle big text data.",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    },
+    {
+      "time": "8:02",
+      "text": "[MUSIC]",
+      "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/ChromeExtension/img/CS410_Fall2023_CourseProject_TeamCAHJ.png b/ChromeExtension/img/CS410_Fall2023_CourseProject_TeamCAHJ.png
new file mode 100644
index 000000000..335727bbb
Binary files /dev/null and b/ChromeExtension/img/CS410_Fall2023_CourseProject_TeamCAHJ.png differ
diff --git a/ChromeExtension/index.html b/ChromeExtension/index.html
new file mode 100644
index 000000000..3b6fddad0
--- /dev/null
+++ b/ChromeExtension/index.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <link rel="stylesheet" href="style.css">
+    <title>Search Coursera Lectures</title>
+</head>
+<body>
+<div class="extension__container">
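+    <!-- Popup layout: course header on top, scrollable result list in the middle,
+         search box and submit button in the footer (styled in style.css) -->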
+    <div class="header__course">
+        <h3>Coursera Transcript Search</h3>
+    </div>
+    <div class="result__container--transcript" id="result-container-transcript"></div>
+    <div class="footer__input">
+        <input type="text" id="searchbox">
+        <button id="submit-button">Submit</button>
+    </div>
+</div>
+<script src="js/search.js"></script>
+</body>
+</html>
diff --git a/ChromeExtension/js/search.js b/ChromeExtension/js/search.js
new file mode 100644
index 000000000..abbe2cd43
--- /dev/null
+++ b/ChromeExtension/js/search.js
@@ -0,0 +1,147 @@
+const search_btn = document.getElementById("submit-button");
+const result_container = document.querySelector('#result-container-transcript')
+
+search_btn.addEventListener('click', function () {
+    if (result_container.childElementCount > 0) {
+        remove_all_children(result_container)
+    }
+
+    search_api()
+});
+
+async function search_api() {
+
+    var headers = new Headers();
+    headers.append("Content-Type", "application/json");
+    headers.append("Authorization", "Basic ZWxhc3RpYzpwY2lXY2xwTE5kWHVpY1VoWFY4YmhnazI=");
+
+    const query_txt = document.getElementById("searchbox").value
+    // Query string to send to ElasticSearch
+    const query_payload = {
+        size: 5,
+        from: 0,
+        query: {
+            "query_string": {
+                "query": query_txt
+            }
+        }
+    }
+    var requestOptions = {
+        method: 'POST',
+        headers: headers,
+        body: JSON.stringify(query_payload)
+    };
+
+    // Calling the ES _search API to retrieve results from the "subtitles" index
+    const response = await fetch("https://ac55987c83844faa90726d4e5efe92b9.us-central1.gcp.cloud.es.io/subtitles/_search", requestOptions)
+    const record = await response.json()
+    if (record.hits.total.value > 0) {
+        const result_num = Math.min(record.hits.total.value, 5)
+        for (let i = 0; i < result_num; i++) {
+            const result = record.hits.hits[i]._source
+            const result_dict = {}
+            const response_str = '' + result.week + '<br>' +
+                ' Title :: ' + result.lecture_title + '<br>' +
+                ' timestamp :: ' + result.time + '<br>' +
+                ' Subtitles : ' + result.text +
+                '<br>'
+            console.log("Response :: ", response_str)
+            result_dict["week"] = "Week " + result.week.slice(-1)
+            result_dict["lecture_title"] = result.lecture_title
+            result_dict["url"] = result.url
+            result_dict["time"] = result.time
+            result_dict["subtitles"] = result.text
+            result_dict["course_name"] = result.course_name
+            set_result_format(result_dict)
+        }
+    } else {
+        const result_div = document.createElement('div')
+        result_div.innerHTML = "We could not find a related topic"
+        result_container.appendChild(result_div)
+    }
+
+}
+
+function set_result_format(result_dict) {
+
+    // Initialize html components
+    const result_item = document.createElement('div')
+    const result_first_row = document.createElement('div')
+    const result_second_row = document.createElement('div')
+    const result_url = document.createElement('a')
+    const result_week = document.createElement('h4')
+    const result_course_name = document.createElement('h4')
+    const result_time = document.createElement('h4')
+    const result_lecture_title = document.createElement('h4')
+    const result_subtitles = document.createElement('p')
+
+    // Set up class/id for some components
+    result_item.classList.add("result__item")
+    result_first_row.classList.add("result__first--row")
+    result_second_row.classList.add("result__second--row")
+    result_course_name.classList.add("result__course--name")
+    result_time.classList.add("timestamp")
+    result_url.classList.add("lecture__url")
+
+    // Set the content of components
+    result_url.href = result_dict["url"]
+    result_week.innerHTML = result_dict["week"]
+    result_course_name.innerHTML = result_dict["course_name"]
+    const time_reformat = format_time(result_dict["time"])
+    result_time.innerHTML = time_reformat
+    result_lecture_title.innerHTML = result_dict["lecture_title"]
+    result_subtitles.innerHTML = result_dict["subtitles"]
+
+    // Organize html component structure
+    result_item.appendChild(result_url)
+    result_item.appendChild(result_first_row)
+    result_first_row.append(result_week)
+    result_first_row.append(result_course_name)
+    result_item.appendChild(result_second_row)
+    result_second_row.appendChild(result_time)
+    result_second_row.appendChild(result_lecture_title)
+    result_item.appendChild(result_subtitles)
+
+    result_container.appendChild(result_item)
+}
+
+function format_time(time) {
+    // Timestamps come in as "m:ss" or "h:mm:ss"; reversing the parts puts seconds at index 0 in both cases
+    let parts = time.split(':').map(part => parseInt(part, 10)).reverse();
+    let seconds = parts[0];
+    let minutes = parts[1];
+    let hours = parts.length > 2 ? parts[2] : 0;
+
+    // Make sure each part has two digits
+    hours = hours.toString().padStart(2, '0');
+    minutes = minutes.toString().padStart(2, '0');
+    seconds = seconds.toString().padStart(2, '0');
+
+    return `${hours}:${minutes}:${seconds}`;
+}
+
+function remove_all_children(element) {
+    while (element.firstChild) {
+        element.removeChild(element.firstChild);
+    }
+}
+
+document.addEventListener('DOMContentLoaded', function () {
+    const parent = document.querySelector('.result__container--transcript');
+
+    parent.addEventListener('click', function (event) {
+        // Check if the clicked element or one of its ancestors has the class 'result__item'
+        let container = event.target.classList.contains('result__item')
+            ?
+            event.target
+            : event.target.closest('.result__item');
+
+        if (container) {
+            // Extract the URL from the child anchor tag
+            let url = container.querySelector('.lecture__url').getAttribute('href');
+
+            // Open the URL
+            if (url) {
+                chrome.tabs.create({ url: url });
+            }
+        }
+    });
+});
diff --git a/ChromeExtension/manifest.json b/ChromeExtension/manifest.json
new file mode 100644
index 000000000..34e65f23c
--- /dev/null
+++ b/ChromeExtension/manifest.json
@@ -0,0 +1,16 @@
+{
+    "name": "CS410_Fall2023_CourseProject_TeamCAHJ",
+    "description": "Base Level Extension",
+    "version": "1.0",
+    "permissions": [
+        "storage",
+        "tabs"
+    ],
+    "host_permissions": ["http://*/*", "https://*/*"],
+    "manifest_version": 3,
+    "action": {
+        "default_popup": "index.html",
+        "default_icon": "img/CS410_Fall2023_CourseProject_TeamCAHJ.png",
+        "default_title": "CS410_Fall2023_CourseProject_TeamCAHJ"
+    }
+}
\ No newline at end of file
diff --git a/ChromeExtension/style.css b/ChromeExtension/style.css
new file mode 100644
index 000000000..4ebb7279d
--- /dev/null
+++ b/ChromeExtension/style.css
@@ -0,0 +1,123 @@
+@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');
+
+* {
+    box-sizing: border-box;
+    background-color: transparent;
+}
+
+body {
+    font-family: 'Roboto', sans-serif;
+    align-items: center;
+    justify-content: center;
+    height: 100%;
+    overflow: hidden;
+    margin: 0px;
+}
+
+.extension__container {
+    display: flex;
+    flex-direction: column;
+    outline: 1px solid black;
+    height: 600px;
+    width: 450px;
+    margin: 0px;
+}
+
+.header__course {
+    display: flex;
+    align-items: center;
+    color: rgba(50, 50, 50, 100);
+    background: white;
+    height: 50px;
+    margin: 0;
+    padding: 10px;
+    box-shadow: 12px 12px 2px 1px rgba(0, 0, 255, .2);
+}
+
+.result__container--transcript {
+    flex-grow: 1;
+    background: rgb(245,245,245);
+    overflow-y: auto;
+    margin: 0;
+    padding: 15px;
+}
+
+
+.result__container--transcript .result__item:hover {
+    cursor: pointer;
+    background-color: rgb(236, 239, 243);
+}
+
+.result__item {
+    display: flex;
+    flex-direction: column;
+    background: white;
+    box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.1), 0 3px 10px 0 rgba(0, 0, 0, 0.1);
+    border-radius: 8px;
+    margin-bottom: 15px;
+    padding: 10px;
+}
+
+.result__item h4 {
+    line-height: 1rem;
+    margin: 4px;
+    word-wrap: break-word;
+    overflow: hidden;
+    max-height: 1.5em;
+}
+
+.result__first--row {
+    display: flex;
+    flex-direction: row;
+}
+
+.result__second--row {
+    display: flex;
+    flex-direction: row;
+}
+
+.result__course--name {
+    padding-left: 8px;
+}
+
+.timestamp {
+    color: rgb(47, 151, 242);
+}
+
+.result__item p {
+    margin: 4px;
+    word-wrap: break-word;
+    line-height: 1em;
+    max-height: 3em;
+    overflow: hidden;
+    position: relative;
+}
+
+.footer__input {
+    display: flex;
+    align-items: center;
+    height: 60px;
+    background: white;
+    box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.1), 0 6px 20px 0 rgba(0, 0, 0, 0.1);
+    border-top: 1px solid rgb(225, 225, 225);
+    margin: 0;
+    padding: 10px;
+}
+
+#searchbox {
+    flex-grow: 1;
+    margin-right: 10px;
+    background-color: white;
+    border: 2px solid grey;
+    border-radius: 5px;
+    height: 30px;
+}
+
+#submit-button {
+    color: white;
+    background-color: rgb(96, 176, 246);
+    border: none;
+    height: 30px;
+    border-radius: 3px;
+}
+
diff --git a/CourseraTranscriptScraper/CourseraScraper.py b/CourseraTranscriptScraper/CourseraScraper.py
new file mode 100644
index 000000000..78c592c88
--- /dev/null
+++ b/CourseraTranscriptScraper/CourseraScraper.py
@@ -0,0 +1,168 @@
+import re
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
+
+
+class CourseraScraper:
+    def __init__(self, course_url: str, username: str, password: str) -> None:
+        self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
+        self.url = course_url
+        self.username = username
+        self.password = password
+        self.course_transcript_for_json = {}
+        # Log in to Coursera to allow the scraper to parse pages
+        CourseraScraperLogin(self.driver, self.username, self.password).login()
+        self.driver.get(self.url)
+
+    def run_scraper(self):
+        # Parse course to get list of urls for each week to scrape
+        course_transcripts = []
+
+        course_parser = CourseraCourseParser(self.driver)
+        self.course_name = course_parser.course_name
+
+        # Parse each week url to get list of lecture URLs to scrape
+        for week_url in course_parser.week_urls:
+            week_str = "Week" + week_url.rsplit("/", 2)[-1]
+            week_parser = CourseraWeekParser(self.driver, week_url)
+            lecture_urls = week_parser.lecture_urls
+
+            week_transcripts = []
+
+            for lecture_url in lecture_urls:
+                lecture_title = lecture_url.rsplit("/", 2)[-1]
+                lecture_subtitles = week_parser.get_lecture_subtitles(lecture_url)
+                week_transcripts.append({lecture_title: lecture_subtitles})
+
+            course_transcripts.append({week_str: week_transcripts})
+
+        self.course_transcript_for_json[self.course_name] = course_transcripts
+
+
+class CourseraScraperLogin:
+    def __init__(self, driver: webdriver.Chrome, email: str, password: str) -> None:
+        self.driver = driver
+        self.url = "https://www.coursera.org"
+        self.login_email = email
+        self.login_password = password
+
+    def login(self) -> None:
+        login_url = self.url + "/?authMode=login"
+        self.driver.get(login_url)
+        self.driver.find_element("id", "email").send_keys(self.login_email)
+        self.driver.find_element("id", "password").send_keys(self.login_password)
+        self.driver.find_element("xpath", "//button[@type='submit']").click()
+        input("Finalize CAPTCHA and then press Enter in the shell")
+
+
+class CourseraCourseParser:
+    def __init__(self, driver: webdriver.Chrome) -> None:
+        self.driver = driver
+        self.course_name = self.parse_course_name()
+        self.get_week_urls()
+
+    def parse_course_name(self) -> str:
+        title_xpath = "//*[@class='cds-108 cds-Typography-base css-e7lgfl cds-110']"
+        title_elements = self.driver.find_elements(By.XPATH, title_xpath)
+        title = title_elements[0].text
+        return title
+
+    def get_week_urls(self) -> None:
+        """Initialize the URLs for each week of the course"""
+        self.landing_page = self.driver.current_url
+        # Coursera defaults to saving the user's last accessed week, so need to get the true landing
+        # page once it's been navigated to
+        self.landing_page = self.landing_page.split("week")[0]
+
+        week_url_list = []
+        if "https://www.coursera.org/learn/" in self.landing_page:
+            self.driver.get(self.landing_page)
+            week_list_xpath_pattern = "//*[@class='cds-108 css-1mxkpit cds-110']"
+            # Need to make sure the element loads on the page before it can be scraped
+            try:
+                _ = WebDriverWait(self.driver, 2).until(
+                    EC.presence_of_element_located((By.XPATH, week_list_xpath_pattern))
+                )
+            except TimeoutException:
+                print("Loading took too much time!")
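+            # NOTE: if the wait above timed out, find_elements below may still run
+            # on a partially loaded page and return an empty list, in which case
+            # no week URLs are collected for this course.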
+            # Get all elements from the sidebar containing links to the course's week lectures
+            week_elements = self.driver.find_elements(By.XPATH, week_list_xpath_pattern)
+
+            for week_number in range(1, len(week_elements) + 1):
+                week_url_list.append(self.landing_page + f"week/{week_number}")
+        else:
+            self.get_week_urls()
+
+        self.week_urls = week_url_list
+
+
+class CourseraWeekParser:
+    def __init__(self, driver: webdriver.Chrome, week_url: str) -> None:
+        self.driver = driver
+        self.week_url = week_url
+        self.get_lecture_urls()
+
+    def get_lecture_urls(self):
+        lecture_urls = []
+        soup = self.get_page_soup(self.week_url)
+        elements = soup.find_all("div", attrs={"data-test": "WeekSingleItemDisplay-lecture"})
+
+        for element in elements:
+            a_tag = element.find("a")
+            if a_tag and "href" in a_tag.attrs:
+                href_value = a_tag["href"]
+                lecture_urls.append("https://www.coursera.org" + href_value)
+            else:
+                print("href attribute not found")
+        self.lecture_urls = lecture_urls
+
+    def get_lecture_subtitles(self, lecture_url):
+        soup = self.get_page_soup(lecture_url)
+        subtitles = []
+
+        # Find all div elements that contain subtitles
+        pattern = re.compile(r"\bcss-1shylkf\b")
+        elements = soup.find_all("div", class_=pattern)
+        if len(elements) == 0:
+            print("No value retrieved")
+        else:
+            print("Retrieved")
+
+        for element in elements:
+            # Extract the timestamp
+            button = element.find("button", class_="timestamp")
+            timestamp = button.contents[-1].strip()
+
+            # Extract all phrase elements and concatenate the text of all subtitles
+            phrases = element.find_all("div", class_="phrases")
+            text_content = " ".join(phrase.get_text().strip() for phrase in phrases)
+
+            # Append the subtitles to the list as a dictionary
+            subtitles.append({"time": timestamp, "text": text_content, "url": lecture_url})
+
+        # Process the subtitles
+        return subtitles
+
+    def get_page_soup(self, url: str) -> BeautifulSoup:
+        # Take driver to specified URL
+        self.driver.get(url)
+        # Need to make sure the element loads on the page before it can be scraped
+        try:
+            transcript_xpath = "//*[@class='phrases']"
+            _ = WebDriverWait(self.driver, 2).until(
+                EC.presence_of_element_located((By.XPATH, transcript_xpath))
+            )
+        except TimeoutException:
+            print("Loading took too much time!")
+
+        # Get the page source and parse the HTML content into a BeautifulSoup object
+        page_source = self.driver.page_source
+        soup = BeautifulSoup(page_source, "html.parser")
+
+        return soup
diff --git a/CourseraTranscriptScraper/ElasticSearchJSONWriter.py b/CourseraTranscriptScraper/ElasticSearchJSONWriter.py
new file mode 100644
index 000000000..0c16be742
--- /dev/null
+++ b/CourseraTranscriptScraper/ElasticSearchJSONWriter.py
@@ -0,0 +1,51 @@
+import json
+import os
+from elasticsearch import Elasticsearch
+
+
+class ElasticSearchJSONWriter:
+    """
+    Class to take a JSON file and write it to ElasticSearch, so it can be used in the Coursera
+    search extension.
+    The current implementation uses the project team's ElasticSearch instance, but this can be
+    changed by modifying the 'ES_URL' default value in the class __init__() method below.
+ """ + + def __init__(self, json_path: str = "./subtitles.json"): + self.url = os.environ.get( + "ES_URL", "https://ac55987c83844faa90726d4e5efe92b9.us-central1.gcp.cloud.es.io" + ) + self.user = os.environ.get("ES_USER", "elastic") + self.password = os.environ.get("ES_PASSWORD", "pciWclpLNdXuicUhXV8bhgk2") + self.json_path = json_path + self.subtitles_json = self.load_json() + + def load_json(self) -> json: + """Load JSON file from saved scraped results in preparation to be pusehd to ElasticSearch""" + try: + with open(self.json_path) as f: + subtitles_doc = f.read() + subtitles_json = json.loads(subtitles_doc) + # Should always work unless the file doesn't exist, in which case the user should be warned + except FileNotFoundError: + print(f"{self.json_path} was not found") + + return subtitles_json + + def index_subtitles(self, course_name: str) -> None: + for weeks in self.subtitles_json[course_name]: + week_val = list(weeks.keys())[0] + for week in weeks.values(): + for lecture_titles in week: + for lecture_title in lecture_titles: + for subtitles in lecture_titles[lecture_title]: + subtitles["lecture_title"] = lecture_title + subtitles["week"] = week_val + subtitles['course_name'] = course_name + self.write_to_elasticsearch(subtitles) + print(f"Successfully indexed subtitles for {course_name}") + + def write_to_elasticsearch(self, doc) -> None: + es = Elasticsearch(self.url, http_auth=(self.user, self.password)) + resp = es.index(index="subtitles", document=doc) + print(resp["result"]) diff --git a/CourseraTranscriptScraper/scrape_coursera_course.py b/CourseraTranscriptScraper/scrape_coursera_course.py new file mode 100644 index 000000000..243ff866c --- /dev/null +++ b/CourseraTranscriptScraper/scrape_coursera_course.py @@ -0,0 +1,41 @@ +import argparse +import json +from CourseraScraper import CourseraScraper +from ElasticSearchJSONWriter import ElasticSearchJSONWriter + + +def scrape_course_pipeline( + course_url: str, username: str, password: str, elastic_search_push: bool +) -> None: + # Scrape a Coursera course's transcripts into a JSON file + scraper = CourseraScraper(course_url, username, password) + scraper.run_scraper() + # Generate the JSON filename to write subtitles to from the course name + course_name = scraper.course_name + course_code = course_name.split(":")[0].replace(' ', '') + output_path = f"subtitles_{course_code}.json" + + # Writing a JSON file + with open(output_path, "w") as json_file: + json.dump(scraper.course_transcript_for_json, json_file, indent=4) + if elastic_search_push: + writer = ElasticSearchJSONWriter(output_path) + writer.index_subtitles(course_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--course_url", + required=True, + type=str, + help="URL to the landing page of the course you want to scrape. 
\ Ex: https://www.coursera.org/learn/cs-410/home/",
+    )
+    parser.add_argument("-u", "--username", required=True, type=str, help="Coursera Username")
+    parser.add_argument("-p", "--password", required=True, type=str, help="Coursera Password")
+    parser.add_argument("-e", "--elastic_search_push", action="store_true")
+    args = parser.parse_args()
+
+    scrape_course_pipeline(args.course_url, args.username, args.password, args.elastic_search_push)
diff --git a/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProgressReport.pdf b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProgressReport.pdf
new file mode 100644
index 000000000..06f2c215f
Binary files /dev/null and b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProgressReport.pdf differ
diff --git a/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProposal.pdf b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProposal.pdf
new file mode 100644
index 000000000..2dada5861
Binary files /dev/null and b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProposal.pdf differ
diff --git a/Documentation/README_images/ChatGPT_Initialization.png b/Documentation/README_images/ChatGPT_Initialization.png
new file mode 100644
index 000000000..ea13b6158
Binary files /dev/null and b/Documentation/README_images/ChatGPT_Initialization.png differ
diff --git a/Documentation/README_images/ChatGPT_Query.png b/Documentation/README_images/ChatGPT_Query.png
new file mode 100644
index 000000000..8478a567f
Binary files /dev/null and b/Documentation/README_images/ChatGPT_Query.png differ
diff --git a/Documentation/README_images/ChatGPT_Response.png b/Documentation/README_images/ChatGPT_Response.png
new file mode 100644
index 000000000..802f2dcb9
Binary files /dev/null and b/Documentation/README_images/ChatGPT_Response.png differ
diff --git a/Documentation/README_images/Chrome Developer Mode.png b/Documentation/README_images/Chrome Developer Mode.png
new file mode 100644
index 000000000..e027bac3c
Binary files /dev/null and b/Documentation/README_images/Chrome Developer Mode.png differ
diff --git a/Documentation/README_images/Chrome Extension Directory.png b/Documentation/README_images/Chrome Extension Directory.png
new file mode 100644
index 000000000..131143d61
Binary files /dev/null and b/Documentation/README_images/Chrome Extension Directory.png differ
diff --git a/Documentation/README_images/Chrome Load Unpacked.png b/Documentation/README_images/Chrome Load Unpacked.png
new file mode 100644
index 000000000..0c5beaa14
Binary files /dev/null and b/Documentation/README_images/Chrome Load Unpacked.png differ
diff --git a/Documentation/README_images/ChromeExtension_Activation.png b/Documentation/README_images/ChromeExtension_Activation.png
new file mode 100644
index 000000000..ccde9a850
Binary files /dev/null and b/Documentation/README_images/ChromeExtension_Activation.png differ
diff --git a/Documentation/README_images/ChromeExtension_Query.png b/Documentation/README_images/ChromeExtension_Query.png
new file mode 100644
index 000000000..c4f586291
Binary files /dev/null and b/Documentation/README_images/ChromeExtension_Query.png differ
diff --git a/Documentation/README_images/ChromeExtension_Results.png b/Documentation/README_images/ChromeExtension_Results.png
new file mode 100644
index 000000000..0bf0dfd43
Binary files /dev/null and b/Documentation/README_images/ChromeExtension_Results.png differ
diff --git a/Documentation/README_images/CourseraScraper_LoginCaptcha.png b/Documentation/README_images/CourseraScraper_LoginCaptcha.png
new file mode 100644
index 000000000..656af0542
Binary files /dev/null and b/Documentation/README_images/CourseraScraper_LoginCaptcha.png differ
diff --git a/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png b/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png
new file mode 100644
index 000000000..9611e853c
Binary files /dev/null and b/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png differ
diff --git a/Documentation/README_images/CourseraScraper_SuccessfulESPush.png b/Documentation/README_images/CourseraScraper_SuccessfulESPush.png
new file mode 100644
index 000000000..cd328e1ee
Binary files /dev/null and b/Documentation/README_images/CourseraScraper_SuccessfulESPush.png differ
diff --git a/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png b/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png
new file mode 100644
index 000000000..31e27c79c
Binary files /dev/null and b/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png differ
diff --git a/Documentation/README_images/WorkflowDiagram.png b/Documentation/README_images/WorkflowDiagram.png
new file mode 100644
index 000000000..6b8472ed7
Binary files /dev/null and b/Documentation/README_images/WorkflowDiagram.png differ
diff --git a/README.md b/README.md
index a7b40d2cc..5829d563d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,132 @@
-# CourseProject
+# CS410 CourseProject (Team CAHJ) - Coursera Search with ChatGPT Querier
-Please fork this repository and paste the github link of your fork on Microsoft CMT. Detailed instructions are on Coursera under Week 1: Course Project Overview/Week 9 Activities.
+## Project Overview
+### Problem Statement
+For our project, we wanted to solve two problems: 1) the difficulty of searching for information in Coursera videos, and 2) the difficulty of synthesizing class information into a digestible unit of content. We solve these problems with two products: a Chrome Extension to search Coursera videos, and a queryable ChatGPT integration that leverages LLMs and the emerging technology of AI to provide a study tool and information synthesizer for UIUC students.
+
+Essentially, our project provides a way for UIUC students using the Coursera platform for their degree to find concepts in their video lectures without having to tediously scroll through each video in a course and use their browser's search function to find a term. Often, a class can have many weeks of content, and each week can have many videos. If you know there's a video that you want to re-watch in order to study a concept, but can't remember in which video (or even which week!) that concept can be found, this project will hopefully make your life a lot easier! In addition, the ChatGPT module is a queryable script built on the Coursera video transcripts that power the Chrome Extension, allowing students to query a specialized version of ChatGPT about their course content.
+
+### Project Demo Video
+Please find a demo video of the Coursera search functionality and the ChatGPT integration at [this YouTube link](https://youtu.be/wSEEVjIqoYE).
+Note that the Coursera transcript scraper is not included in this demo video because of privacy considerations (it requires login information to be input into the shell at runtime).
+
+### Project Workflow
+Overall, the project consists of three parts:
+1. Coursera Course Transcript Scraper
+2. ChatGPT Integration
+3. Coursera Search Chrome Extension
+
+The Coursera Course Transcript Scraper is necessary because dynamically scraping the course video transcripts simply takes too long; it would make the search function untenably tedious. Similarly, without scraped data, the ChatGPT integration would have no course content to draw on. The Transcript Scraper uses Python, particularly the `beautifulsoup` and `selenium` modules, to scrape video transcripts from a course provided by the user, and then indexes those transcripts to `ElasticSearch`. This indexed data is what powers the Chrome Extension and ChatGPT Integration.
+
+The ChatGPT Integration, also written in Python, uses the `langchain` module to split the course transcript data into chunks, which are then fed into the GPT API via the `openai` module as context with the user's query. This allows the LLM to provide an answer that is informed by the Coursera course content.
+
+The Chrome Extension UI is written in HTML and CSS, while the functionality uses JavaScript.
+
+![Workflow Diagram for Project](./Documentation/README_images/WorkflowDiagram.png)
+
+
+### Project Requirements
+This project is fairly straightforward with regard to requirements on the user's machine, but there are a few baselines that must be met:
+- The project requires Google Chrome to work.
+- The project requires ChromeDriver, maintained by Chromium, to be installed in the root directory of the project in order to enable scraping (see Step 2 under Installation Instructions, below).
+- The project requires a working installation of Python to scrape new course content. The file `requirements.txt` includes the packages necessary for the script to run. If you plan to scrape new course content into the project ElasticSearch index, please ensure your Python environment satisfies these requirements.
+- As the extension is not deployed to the Google Chrome Web Store, it requires a local copy of the codebase on the user's computer (see Step 1 under Installation Instructions, below).
+- In order for the ChatGPT functionality to work, you will need an OpenAI API Key ([see here](https://platform.openai.com/api-keys)) and must add that key to your environment variables as a new variable called `OPENAI_API_KEY`. Instructions for how to add environment variables can be found here: [Mac](https://phoenixnap.com/kb/set-environment-variable-mac) | [Windows](https://www.howtogeek.com/787217/how-to-edit-environment-variables-on-windows-10-or-11/) | [Linux](https://linuxize.com/post/how-to-set-and-list-environment-variables-in-linux/)
+
+
+## Installation Instructions
+Installing the extension is quite simple; all you need to do is download the code from GitHub and then activate the extension in Chrome.
+A step-by-step guide is below:
+
+1. Pull the code from GitHub to `desiredDirectory` using your shell:
+    ```
+    cd desiredDirectory
+    git clone https://github.com/christianopperman/CS410_Fall2023_CourseProject_TeamCAHJ.git
+    ```
+2. Install the appropriate ChromeDriver for your computer's environment from [this link](https://googlechromelabs.github.io/chrome-for-testing/#stable), unzip it, and move the `Google Chrome for Testing` application to the `CS410_Fall2023_CourseProject_TeamCAHJ` directory created in Step 1, above.
+3. Open Google Chrome.
+4. Go to the Extensions page on Google Chrome by following [this link](chrome://extensions).
+5. Activate Developer Mode by toggling the switch in the upper right corner labeled `Developer mode`.
+
+![Chrome Developer Mode](./Documentation/README_images/Chrome%20Developer%20Mode.png)
+
+6. Load the extension from the codebase pulled to your computer in Step 1 by clicking the `Load unpacked` button in the top left corner:
+
+![Chrome Load Unpacked](./Documentation/README_images/Chrome%20Load%20Unpacked.png)
+
+7. Select the `desiredDirectory/CS410_Fall2023_CourseProject_TeamCAHJ/ChromeExtension` directory in the popup and click `Select`
+
+![Chrome Extension Directory](./Documentation/README_images/Chrome%20Extension%20Directory.png)
+
+8. The extension should now be available to you in your Google Chrome Extensions list.
+
+## Usage Instructions
+
+### Chrome Extension
+
+Once installed, the Chrome Extension can be used from any page on Chrome with the following steps:
+1. Open the extension from Google Chrome's Extension menu, located to the right of the URL bar.
+
+![Chrome Extension Activation](./Documentation/README_images/ChromeExtension_Activation.png)
+
+2. Enter your desired search term in the search field and hit `Submit`.
+
+![Chrome Extension Query](./Documentation/README_images/ChromeExtension_Query.png)
+
+3. See the results. Each result is a link that will take you to the Coursera video page that is linked.
+
+![Chrome Extension Results](./Documentation/README_images/ChromeExtension_Results.png)
+
+### Coursera Transcript Scraper
+As mentioned in [Project Requirements](#project-requirements) above, in order to scrape your own Coursera course transcripts into the extension, you will need a working version of Python that satisfies the required packages outlined in the `requirements.txt` file in the repository root.
+Once you have that, scraping a new course into ElasticSearch is very easy:
+1. Navigate to `desiredDirectory/CS410_Fall2023_CourseProject_TeamCAHJ/CourseraTranscriptScraper` in your shell
+2. Call the course scraper script with the following command line arguments:
+```
+python scrape_coursera_course.py -c "course_url" -u "coursera_username" -p "coursera_password" [-e]
+```
+* Required Arguments
+    * -c : The link to the landing page of the Coursera course you'd like to scrape
+    * -u : The username to your Coursera account which has access to the course you'd like to scrape
+    * -p : The password to your Coursera account which has access to the course you'd like to scrape
+
+* Optional Arguments:
+    * -e : A boolean flag. If included, the script will automatically push the scraped course transcriptions to ElasticSearch after saving them to disk. If not included, the transcriptions will be saved to disk but not pushed to ElasticSearch.
+
+3. Once you run the above command, a window will pop up and automatically log you into Coursera. It is likely that you will be required to complete a CAPTCHA.
+4. Once you complete the CAPTCHA, return to your shell and press Enter, as prompted.
+
+![Coursera Scraper Login Post-CAPTCHA](./Documentation/README_images/CourseraScraper_LoginPostCaptcha.png)
+
+5. The script will begin scraping, as evidenced by the pop-up window navigating between video pages in the course and the `Retrieved` messages in the shell window.
+
+![Coursera Scraper Successful Scrapes](./Documentation/README_images/CourseraScraper_SuccessfulScrapes.png)
+
+6. The script will write any scraped transcriptions to the filepath `subtitles_CS###.json`, where `###` is the three digit course code of the class you are scraping.
+7. If the `-e` flag was passed to the script, the script will automatically push the scraped course's transcriptions to ElasticSearch.
+8. Once the script is finished, you will see a success message, and the web driver window will automatically exit.
+
+![Coursera Scraper Successful ElasticSearch Push](./Documentation/README_images/CourseraScraper_SuccessfulESPush.png)
+
+#### Note
+Please be careful not to scrape too many courses at once. Coursera may block you if you issue too many requests to it in too short a time frame. As such, we recommend that you only scrape one course at a time.
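+
+For reference, the scraper stores each course's transcripts in a nested JSON structure, which `ElasticSearchJSONWriter.index_subtitles()` then flattens into one ElasticSearch document per subtitle snippet. The Python sketch below illustrates that shape based on the `run_scraper()` and `index_subtitles()` code above; the course name, lecture title, timestamp, and subtitle text shown are hypothetical placeholders, not real scraped output:
+
+```python
+# Illustrative sketch: shape of the subtitles_CS###.json written by run_scraper()
+scraped = {
+    "CS 410: Text Information Systems": [            # course name (placeholder)
+        {
+            "Week1": [                               # one dict per week
+                {
+                    "lecture-title-from-url": [      # lecture title parsed from the lecture URL
+                        {
+                            "time": "0:00",          # placeholder timestamp
+                            "text": "Welcome to the course...",          # placeholder text
+                            "url": "https://www.coursera.org/learn/...",
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
+
+# index_subtitles() enriches each snippet with its lecture, week, and course name
+# before pushing it to the "subtitles" index, yielding documents roughly like:
+doc = {
+    "time": "0:00",
+    "text": "Welcome to the course...",
+    "url": "https://www.coursera.org/learn/...",
+    "lecture_title": "lecture-title-from-url",
+    "week": "Week1",
+    "course_name": "CS 410: Text Information Systems",
+}
+```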
+
+### ChatGPT Integration
+To use the ChatGPT Integration function, ensure all Python package requirements are installed and that your OpenAI API Key is set up as an environment variable called `OPENAI_API_KEY`, and then follow these steps:
+
+1. Navigate to `desiredDirectory/CS410_Fall2023_CourseProject_TeamCAHJ/ChatGPTQuerier` in your terminal shell
+2. Run the `chat_coursera.py` script with `python3 chat_coursera.py`
+
+![ChatGPT Initialization](./Documentation/README_images/ChatGPT_Initialization.png)
+
+3. Enter your query into the shell and hit `Enter`
+
+![ChatGPT Query](./Documentation/README_images/ChatGPT_Query.png)
+
+4. The results of the ChatGPT query, informed by the course transcripts, will print to the shell
+
+![ChatGPT Response](./Documentation/README_images/ChatGPT_Response.png)
+
+## Future Improvements
+
+While we didn't have enough time to figure this out, we would have really liked to integrate the two Python components (Coursera Course Transcript Scraper and ChatGPT Integration) into the Chrome Extension as well. As far as we could tell, triggering a local Python script from a Chrome extension is non-trivial (if possible at all), and we had neither the time nor the funds to host the scripts on the cloud for this project. However, we would have loved to have multiple tabs on our Chrome extension, one with an entry point for scraping course transcripts (that could run in the background) and one with a text-entry field that would allow you to query the ChatGPT integration directly from Chrome.
\ No newline at end of file
diff --git a/package.json b/package.json
new file mode 100644
index 000000000..43e9854c8
--- /dev/null
+++ b/package.json
@@ -0,0 +1,8 @@
+{
+  "name": "CS410_Fall2023_CourseProject_TeamCAHJ",
+  "version": "1.0.0",
+  "dependencies": {
+    "@elastic/elasticsearch": "^8.10.0",
+    "elasticsearch": "^16.7.3"
+  }
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..3c642459d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.12.2
+elasticsearch==7.17.0
+langchain==0.0.350
+openai==0.28.1
+python-dotenv==1.0.0
+selenium==4.9.0
+webdriver_manager==4.0.1