In [1]:
!pip install selenium
!pip install webdriver-manager




In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import regex as re
import pandas as pd
import time

In [3]:
# Specify the path to your ChromeDriver
service = Service(executable_path="C:/Users/almas/Downloads/chromedriver.exe")

In [4]:
import re
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Global variables
visited_links = set()
results = []

def extract_english_text(text):
    """
    Extract only English words, numbers, and recognized English punctuation from the input text.
    """
    # Match English words, numbers, and specific punctuation
    pattern = r"[A-Za-z0-9\s.,!?\"'():;%-]"
    cleaned_text = ''.join(re.findall(pattern, text))
    return cleaned_text


# Function to check if a page requires login or restricted access
def requires_login_or_access_denied(driver):
    """Check if the page indicates login or restricted access is required."""
    page_source = driver.page_source.lower()
    login_indicators = [
        "login", "access denied", "restricted access", "please sign in",
        "requires login", "permission required", "sign in to continue"
    ]
    return any(indicator in page_source for indicator in login_indicators)

# Recursive function to scrape links within the "Courses" section
def scrape_courses_section(driver, url, section_name, current_depth=1, max_depth=3):
    """Scrape links within a section up to a specified depth."""
    if url in visited_links or current_depth > max_depth:
        return
    visited_links.add(url)

    driver.get(url)
    time.sleep(2)
    wait = WebDriverWait(driver, 10)

    try:
        # Check for restricted access
        if requires_login_or_access_denied(driver):
            results.append({
                'Section': section_name,
                'Link': url,
                'Title': 'Login Required',
                'Text': 'Requires login to access this content.'
            })
            return

        # Extract main content
        main_content_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'main-content')))
        header_element = main_content_element.find_element(By.CLASS_NAME, 'entry-header')
        title = extract_english_text(header_element.text)

        content_element = main_content_element.find_element(By.CLASS_NAME, 'entry-content')
        content = extract_english_text(content_element.text)

        results.append({
            'Section': section_name,
            'Link': url,
            'Title': title,
            'Text': content
        })

        # Recursively visit internal links
        links_in_content = content_element.find_elements(By.TAG_NAME, 'a')
        internal_links = [link.get_attribute('href') for link in links_in_content if link.get_attribute('href')]

        for link in internal_links:
            if link and link not in visited_links:
                scrape_courses_section(driver, link, section_name, current_depth + 1, max_depth)
                driver.get(url)
                time.sleep(2)

    except Exception as e:
        print(f"Error scraping {url}: {e}")

# Function to extract main content
def extract_main_content(url, section_name):
    """Extract and clean title and text from the main content."""
    if url in visited_links:
        return None
    visited_links.add(url)

    driver.get(url)
    time.sleep(2)

    try:
        main_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'main-content')))
        entry_header = main_content.find_element(By.CLASS_NAME, 'entry-header')
        title = extract_english_text(entry_header.find_element(By.CLASS_NAME, 'entry-title').text)

        entry_content = main_content.find_element(By.CLASS_NAME, 'entry-content')
        general_text = extract_english_text(entry_content.get_attribute('innerText'))

        results.append({
            'Section': section_name,
            'Link': url,
            'Title': title,
            'Text': general_text
        })
        return main_content

    except Exception as e:
        print(f"Error extracting main content from {url}: {e}")
        return None

def scrape_internal_links(main_content, section_name):
    """Identify and visit internal links from the main content."""
    try:
        # Find all anchor tags within the entry-content div
        anchor_tags = main_content.find_elements(By.TAG_NAME, 'a')
        links_to_visit = []

        for anchor in anchor_tags:
            href = anchor.get_attribute('href')
            if href and "dil.umbc.edu" in href and href not in visited_links:
                links_to_visit.append(href)

        # Visit each link and extract data
        for link in links_to_visit:
            main_content = extract_main_content(link, section_name)
            if main_content:
                extract_collapsible_sections(main_content, link, section_name)

    except Exception as e:
        print(f"Error finding internal links: {e}")

def extract_main_content_notitle(url, section_name):
    """Extract title and text from the entry-header, and detailed subheading content within entry-content."""
    if url in visited_links:
        return None  # Skip if the URL has already been visited
    visited_links.add(url)

    driver.get(url)
    time.sleep(2)

    try:
        # Wait for the main content to load
        main_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'main-content')))
        

        # Extract general text from entry-content
        entry_content = main_content.find_element(By.CLASS_NAME, 'entry-content')
        general_text = entry_content.get_attribute('innerText').strip()
        
        # Store the main title and general text for the section
        results.append({
            'Section': section_name,
            'Link': url,
            'Title': f"{section_name} - Main Content",
            'Text': general_text
        })

        return main_content

    except Exception as e:
        print(f"Error extracting main content from {url}: {e}")
        return None  
# Function to extract collapsible sections
def extract_collapsible_sections(main_content, url, section_name):
    """Extract collapsible sections and their content within the main content."""
    sections = main_content.find_elements(By.CLASS_NAME, 'sights-expander-wrapper')
    for section in sections:
        try:
            title_element = section.find_element(By.TAG_NAME, 'h5')
            title = extract_english_text(title_element.text)

            expander_trigger = section.find_element(By.CLASS_NAME, 'sights-expander-trigger')
            expander_trigger.click()
            WebDriverWait(driver, 5).until(
                EC.visibility_of(section.find_element(By.CLASS_NAME, 'sights-expander-content'))
            )

            content_div = section.find_element(By.CLASS_NAME, 'sights-expander-content')
            text = extract_english_text(content_div.text)

            results.append({
                'Section': section_name,
                'Link': url,
                'Title': title,
                'Text': text
            })

        except Exception as e:
            print(f"Error extracting collapsible section from {url}: {e}")

# Function to scrape links from the sidebar
def scrape_sidebar_links(url, section_name):
    """Scrape all links found in the sidebar and extract content."""
    driver.get(url)
    time.sleep(2)

    try:
        sidebar_links = driver.find_elements(By.CSS_SELECTOR, '.sidebar a')
        for link in sidebar_links:
            href = link.get_attribute('href')
            if href and href not in visited_links:
                extract_main_content(href, section_name)
                time.sleep(2)

    except Exception as e:
        print(f"Error extracting sidebar links from {url}: {e}")

# Function to extract dropdown content
def extract_dropdown_content(url, section_name):
    """Extract content from dropdown sections on the page."""
    driver.get(url)
    time.sleep(2)

    try:
        dropdown_sections = driver.find_elements(By.CLASS_NAME, 'sights-expander-wrapper')
        for section in dropdown_sections:
            try:
                question_element = section.find_element(By.CLASS_NAME, 'mceEditable')
                question_text = extract_english_text(question_element.text)

                expander_trigger = section.find_element(By.CLASS_NAME, 'sights-expander-trigger')
                driver.execute_script("arguments[0].click();", expander_trigger)
                time.sleep(1)

                answer_id = expander_trigger.get_attribute('aria-controls')
                answer_content = WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located((By.ID, answer_id))
                )
                answer_text = extract_english_text(answer_content.text)

                results.append({
                    'Section': section_name,
                    'Link': url,
                    'Title': question_text,
                    'Text': answer_text
                })

            except Exception as e:
                print(f"Error extracting dropdown content in {url}: {e}")

    except Exception as e:
        print(f"Error loading dropdown sections from {url}: {e}")


In [5]:
# # Set up Chrome options
# chrome_options = Options()
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")
# # chrome_options.add_argument("--headless")  # Uncomment to run in headless mode

# # Initialize the WebDriver
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # List of main sections with URLs
# sections = {
#     "Courses": 'https://dil.umbc.edu/courses/',
#     "Pathways & Certificates": 'https://dil.umbc.edu/pathways-and-certificates/',
#     "Advising & Resources": 'https://dil.umbc.edu/resources/',
#     "Policies": 'https://dil.umbc.edu/policies/',
#     "Prospective Students": 'https://dil.umbc.edu/prospective-students/',
#     "Faculty": 'https://dil.umbc.edu/faculty/'
# }

# # Initialize a list to store the results
# results = []
# visited_links = set()

# # Main scraping process for each section
# try:
#     for section_name, url in sections.items():
#         print(f"Scraping section: {section_name}")
        
#         if section_name == "Courses":
#             scrape_courses_section(driver, url,section_name)

#         # Step 1: Extract content from the main page
#         main_content = extract_main_content(url, section_name)

#         # Step 2: Extract collapsible sections (if any)
#         if main_content:
#             extract_collapsible_sections(main_content, url, section_name)

#             # Step 3: Identify and visit internal links within the main content
#             scrape_internal_links(main_content, section_name)

#             # Step 4: Scrape all links in the sidebar
#             scrape_sidebar_links(url, section_name)

# finally:
#     # Close the browser
#     driver.quit()

# Create a DataFrame from the collected data and reorder columns
df_dil = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])

# Save the DataFrame to an CSV file
df_dil.to_csv('C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/dil_scraped_data.csv', index=False)


import re
import time
import logging
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Configure logging
logging.basicConfig(
    filename="C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/scraping_log.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def log_update_date(message):
    """Log a custom message with the current date."""
    current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.info(f"{message} - Date: {current_date}")

# Scraping logic with integrated logging
try:
    log_update_date("Scraping started")

    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # chrome_options.add_argument("--headless")  # Uncomment to run in headless mode
    
    # Initialize the WebDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # List of main sections with URLs
    sections = {
        "Courses": 'https://dil.umbc.edu/courses/',
        "Pathways & Certificates": 'https://dil.umbc.edu/pathways-and-certificates/',
        "Advising & Resources": 'https://dil.umbc.edu/resources/',
        "Policies": 'https://dil.umbc.edu/policies/',
        "Prospective Students": 'https://dil.umbc.edu/prospective-students/',
        "Faculty": 'https://dil.umbc.edu/faculty/'
    }
    
    # Initialize a list to store the results
    results = []
    visited_links = set()
    
    # Main scraping process for each section
    try:
        for section_name, url in sections.items():
            print(f"Scraping section: {section_name}")
            
            if section_name == "Courses":
                scrape_courses_section(driver, url,section_name)
    
            # Step 1: Extract content from the main page
            main_content = extract_main_content(url, section_name)
    
            # Step 2: Extract collapsible sections (if any)
            if main_content:
                extract_collapsible_sections(main_content, url, section_name)
    
                # Step 3: Identify and visit internal links within the main content
                scrape_internal_links(main_content, section_name)
    
                # Step 4: Scrape all links in the sidebar
                scrape_sidebar_links(url, section_name)
    
    finally:
        # Close the browser
        driver.quit()

    log_update_date("Scraping completed")

finally:
    # Save the DataFrame to a CSV file
    df_dil = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])
    df_dil.to_csv('C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/dil_scraped_data.csv', index=False)
    logging.info("CSV saved successfully.")


Scraping section: Courses
Error scraping https://professionalprograms.umbc.edu/data-science/masters-of-professional-studies-data-science/: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7FD3EE1F5+2972373]
	(No symbol) [0x00007FF7FD0855F0]
	(No symbol) [0x00007FF7FCF257FA]
	(No symbol) [0x00007FF7FCF75A3E]
	(No symbol) [0x00007FF7FCF75D2C]
	(No symbol) [0x00007FF7FCFBEAB7]
	(No symbol) [0x00007FF7FCF9BABF]
	(No symbol) [0x00007FF7FCFBB8CC]
	(No symbol) [0x00007FF7FCF9B823]
	(No symbol) [0x00007FF7FCF675E8]
	(No symbol) [0x00007FF7FCF68751]
	GetHandleVerifier [0x00007FF7FD4147BD+3129501]
	GetHandleVerifier [0x00007FF7FD464D00+3458528]
	GetHandleVerifier [0x00007FF7FD45B05D+3418429]
	GetHandleVerifier [0x00007FF7FD1E687B+844123]
	(No symbol) [0x00007FF7FD090AFF]
	(No symbol) [0x00007FF7FD08C6D4]
	(No symbol) [0x00007FF7FD08C86D]
	(No symbol) [0x00007FF7FD07BD79]
	BaseThreadInitThunk [0x00007FFECCF2259D+29]
	RtlUserThreadStart [0x00007FFECE6AAF38+40]

Scraping section: Pathways & Certi

In [11]:
df_dil.shape

(107, 4)

In [12]:
df_dil.head(5)

Unnamed: 0,Section,Link,Title,Text
0,Courses,https://dil.umbc.edu/courses/,Courses,Where can I find the\nSyllabi of the Data Scie...
1,Courses,https://dil.umbc.edu/syllabi-of-data-science-c...,Syllabi of Data Science Courses,Core Data Science Courses\nAll the masters stu...
2,Courses,https://dil.umbc.edu/courses/data-601-introduc...,DATA 601 Introduction to Data Science,Description: The goal of this class is to give...
3,Courses,https://dil.umbc.edu/courses/data-602-introduc...,DATA 602 Introduction to Data Analysis and Mac...,Description: This course provides a broad intr...
4,Courses,https://dil.umbc.edu/courses/data-603-platform...,DATA 603 Platforms for Big Data Processing,Description: The goal of this course is to int...


In [16]:
import logging
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Configure logging
logging.basicConfig(
    filename="C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/scraping_isss_log.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def log_update_date(message):
    """Log a custom message with the current date."""
    current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.info(f"{message} - Date: {current_date}")

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# chrome_options.add_argument("--headless")  # Uncomment to run in headless mode

# Initialize the WebDriver
logging.info("Initializing WebDriver")
# service = Service('path/to/chromedriver')  # Replace with the correct path
driver = webdriver.Chrome(service=service, options=chrome_options)
logging.info("WebDriver initialized successfully")

# Initialize a list to store the results
results = []
visited_links = set()

# Define the F-1 Students sub-sections with their URLs
sub_sections = {
    "Current Students: General": "https://isss.umbc.edu/international-students-f-1/current-students/",
    "Current Students: Employment": "https://isss.umbc.edu/f-1-students/current-students-employment/",
    "Working On-Campus":"https://isss.umbc.edu/international-students-f-1/current-students-employment/working-on-campus/",
    "Economoic Hardship Work Authorization":"https://isss.umbc.edu/international-students-f-1/current-students-employment/economic-hardship-work-authorization/",
    "Understanding Your Documents": "https://isss.umbc.edu/f-1-students/understanding-your-documents/",
    "Social Security Number (SSN)": "https://isss.umbc.edu/f-1-students/social-security-number/",
    "US Taxes": "https://isss.umbc.edu/international-students-f-1/current-students/understanding-your-tax-documents/",
    "Applying for a Maryland Driver’s License & Getting a State ID": "https://isss.umbc.edu/resources/transportation/driving-in-maryland-and-getting-a-state-id/",
    "Change of Immigration Status": "https://isss.umbc.edu/change-of-status/"
}

# List of URLs with dropdowns
dropdown_pages = {
    "Internships and International Students": "https://dil.umbc.edu/resources/internships-and-international-students/",
    "OPT and OPT STEM Information": "https://isss.umbc.edu/opt-and-opt-stem-information/",
    "Working Off-Campus": "https://isss.umbc.edu/international-students-f-1/current-students-employment/working-off-campus/"
}

try:
    # Scrape each specified sub-section and its sidebar links
    for sub_section_name, link in sub_sections.items():
        log_update_date(f"Scraping started for section: {sub_section_name}")
        
        # Step 1: Extract main content
        main_content = extract_main_content_notitle(link, sub_section_name)
        
        if main_content:
            logging.info(f"Successfully scraped main content for section: {sub_section_name}")
        else:
            logging.warning(f"No main content found for section: {sub_section_name}")

    # Scrape each page with dropdowns
    for section_name, url in dropdown_pages.items():
        log_update_date(f"Scraping dropdown section: {section_name}")
        
        # Step 1: Extract content from the main page
        main_content = extract_main_content_notitle(url, section_name)
        
        # Step 2: Extract collapsible sections (if any)
        if main_content:
            extract_dropdown_content(url, section_name)
            logging.info(f"Successfully extracted dropdown content for section: {section_name}")
        else:
            logging.warning(f"No dropdown content found for section: {section_name}")

finally:
    logging.info("Closing WebDriver")
    driver.quit()
    logging.info("WebDriver closed successfully")

# Save the DataFrame to a CSV file
df_isss = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])
output_file = 'C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/isss_scraped_data.csv'
df_isss.to_csv(output_file, index=False)
logging.info(f"Data saved to CSV file: {output_file}")


Error extracting main content from https://isss.umbc.edu/international-students-f-1/current-students-employment/working-off-campus/: Message: no such element: Unable to locate element: {"method":"css selector","selector":".entry-content"}
  (Session info: chrome=130.0.6723.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7FD3EE1F5+2972373]
	(No symbol) [0x00007FF7FD0855F0]
	(No symbol) [0x00007FF7FCF257FA]
	(No symbol) [0x00007FF7FCF75A3E]
	(No symbol) [0x00007FF7FCF75D2C]
	(No symbol) [0x00007FF7FCF6939C]
	(No symbol) [0x00007FF7FCF9BABF]
	(No symbol) [0x00007FF7FCF69266]
	(No symbol) [0x00007FF7FCF9BC90]
	(No symbol) [0x00007FF7FCFBB8CC]
	(No symbol) [0x00007FF7FCF9B823]
	(No symbol) [0x00007FF7FCF675E8]
	(No symbol) [0x00007FF7FCF68751]
	GetHandleVerifier [0x00007FF7FD4147BD+3129501]
	GetHandleVerifier [0x00007FF7FD464D00+3458528]
	GetHandleV

In [18]:
df_isss.shape

(31, 4)

In [20]:
df_isss.head(5)

Unnamed: 0,Section,Link,Title,Text
0,Current Students: General,https://isss.umbc.edu/international-students-f...,Current Students: General - Main Content,In this section you will find more information...
1,Current Students: Employment,https://isss.umbc.edu/f-1-students/current-stu...,Current Students: Employment - Main Content,International students have very specific kind...
2,Working On-Campus,https://isss.umbc.edu/international-students-f...,Working On-Campus - Main Content,F-1 International students are able to work up...
3,Economoic Hardship Work Authorization,https://isss.umbc.edu/international-students-f...,Economoic Hardship Work Authorization - Main C...,If you are an F-1 student experiencing severe ...
4,Understanding Your Documents,https://isss.umbc.edu/f-1-students/understandi...,Understanding Your Documents - Main Content,While you are a student in the US there are a ...


In [22]:
import logging
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Configure logging
logging.basicConfig(
    filename="C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/research_scraping_log.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def log_update_date(message):
    """Log a custom message with the current date."""
    current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.info(f"{message} - Date: {current_date}")

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# chrome_options.add_argument("--headless")  # Uncomment to run in headless mode

logging.info("Initializing WebDriver")
# service = Service('path/to/chromedriver')  # Replace with the correct path
driver = webdriver.Chrome(service=service, options=chrome_options)
logging.info("WebDriver initialized successfully")

# Define research links with proper formatting
Research_links = {
    "CSEE Research Areas": "https://www.csee.umbc.edu/csee-research-areas/",
    "Research Centers": "https://www.csee.umbc.edu/research-focus-areas-and-centers/",
    "Research Labs CSEE": "https://www.csee.umbc.edu/research/research-labs/",
    "AI Home": "https://ai.umbc.edu/",
    "Research faculty": "https://ai.umbc.edu/ai-faculty/",
    "Research Labs AI": "https://ai.umbc.edu/labs-groups/",
    "Cyber Security labs": "https://cybersecurity.umbc.edu/training/labs/"
}

# Initialize a list to store the results
results = []
visited_links = set()

# Define the main scraping process
try:
    log_update_date("Scraping started")

    for section_name, url in Research_links.items():
        log_update_date(f"Scraping started for section: {section_name}")
        
        # Extract content from the main page for other sections
        main_content = extract_main_content(url, section_name)
        
        if main_content:
            logging.info(f"Successfully scraped main content for section: {section_name}")
            
            # Step 2: Extract collapsible sections (if any)
            extract_collapsible_sections(main_content, url, section_name)
            logging.info(f"Collapsible sections extracted for section: {section_name}")

            # Step 3: Identify and visit internal links within the main content
            scrape_internal_links(main_content, section_name)
            logging.info(f"Internal links scraped for section: {section_name}")

            # Step 4: Scrape all links in the sidebar
            scrape_sidebar_links(url, section_name)
            logging.info(f"Sidebar links scraped for section: {section_name}")
        else:
            logging.warning(f"No main content found for section: {section_name}")

finally:
    logging.info("Closing WebDriver")
    driver.quit()
    logging.info("WebDriver closed successfully")

# Convert results to a DataFrame
df_csee = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])
output_file = "C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/csee_scraped_data.csv"
df_csee.to_csv(output_file, index=False)
logging.info(f"Data saved to CSV file: {output_file}")


Error extracting sidebar links from https://www.csee.umbc.edu/csee-research-areas/: Message: stale element reference: stale element not found
  (Session info: chrome=130.0.6723.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7FD3EE1F5+2972373]
	(No symbol) [0x00007FF7FD0855F0]
	(No symbol) [0x00007FF7FCF257FA]
	(No symbol) [0x00007FF7FCF37CDB]
	(No symbol) [0x00007FF7FCF2CF04]
	(No symbol) [0x00007FF7FCF2D089]
	(No symbol) [0x00007FF7FCF2AE09]
	(No symbol) [0x00007FF7FCF2E8FF]
	(No symbol) [0x00007FF7FCFBCBB0]
	(No symbol) [0x00007FF7FCF9BA7A]
	(No symbol) [0x00007FF7FCFBB8CC]
	(No symbol) [0x00007FF7FCF9B823]
	(No symbol) [0x00007FF7FCF675E8]
	(No symbol) [0x00007FF7FCF68751]
	GetHandleVerifier [0x00007FF7FD4147BD+3129501]
	GetHandleVerifier [0x00007FF7FD464D00+3458528]
	GetHandleVerifier [0x00007FF7FD45B05D+3418429]
	GetHandleVerifier

In [24]:
# Save the DataFrame to an CSV file
df_csee.to_csv('C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/csee_scraped_data.csv', index=False)

In [26]:
df_csee.shape

(11, 4)

In [28]:
# Set display options to show full content without truncation
pd.set_option('display.max_colwidth', None)

text = df_csee[df_csee['Title'] == 'CSEE Research Areas']['Text']
print(text.to_string(index=False))

# Reset the display option to default after printing, if desired
pd.reset_option('display.max_colwidth')


Communications and Photonics\nTopics: Cognitive Radio,  Nonlinear and Integrated Optics, Optical Fibers, Wireless Communications and Networking\nFaculty: Gary Carter, Fow-Sen Choa, Anthony Johnson, Seung-Jun Kim, Curtis Menyuk, Dmitri Perkins, Ergun Simsek, Li Yan, Mohamed Younis\nComputer Architecture and Hardware Systems\nTopics: Hardware Security, Low-power IC Design, Multicore Systems, Neuromorphic Computing, VLSI DesignTest\nFaculty: Riadul Islam, Naghmeh Karimi, Chenchen Liu, Dhananjay Phatak, Ryan Robucci\nCyber-Physical Systems\nTopics: IoT, Networking, Network Protocols and Management, Network Security, Secure Communication, Smart Power Grids\nFaculty: Nilanjan Banerjee, Riadul Islam, Anupam Joshi, Naghmeh Karimi, Seung-Jun Kim, Dong Li, Dmitri Perkins, Ryan Robucci, Deepinder Sidhu, Mohamed Younis, Roberto Yus\nGraphics and Visualization\nTopics: Character Animation, Computer Graphics, ScientificData Visualization\nFaculty: Adam Bargteil, Don Engel, Mark Olano, Rebecca Willia

In [30]:
pip install langchain_community

Note: you may need to restart the kernel to use updated packages.


In [36]:
import os
from langchain_groq import ChatGroq
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import OpenAIEmbeddings
# from langchain.vectorstores.cassandra import Cassandra
# import cassio
from dotenv import load_dotenv
load_dotenv()

False

In [38]:
# pip --version

In [40]:
# !python -m pip install --upgrade pip

In [42]:
df_isss.shape

(31, 4)

In [44]:
df_csee.shape

(11, 4)

In [46]:
df_dil.shape

(107, 4)

In [48]:
import pandas as pd

# df_csee = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])
# df_isss = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])
# df_dil = pd.DataFrame(results, columns=['Section', 'Link', 'Title', 'Text'])

# Combine all three DataFrames into one
df_combined = pd.concat([df_csee, df_isss, df_dil], ignore_index=True)

# Display the combined DataFrame
print(df_combined)


                 Section                                               Link  \
0    CSEE Research Areas     https://www.csee.umbc.edu/csee-research-areas/   
1    CSEE Research Areas                https://www.csee.umbc.edu/research/   
2       Research Centers  https://www.csee.umbc.edu/research-focus-areas...   
3       Research Centers  https://www.csee.umbc.edu/research/research-labs/   
4                AI Home                               https://ai.umbc.edu/   
..                   ...                                                ...   
144              Faculty       https://dil.umbc.edu/faculty/person/yn78817/   
145              Faculty       https://dil.umbc.edu/faculty/person/qa97378/   
146              Faculty       https://dil.umbc.edu/faculty/person/qo08734/   
147              Faculty       https://dil.umbc.edu/faculty/person/bd53800/   
148              Faculty                    mailto:datascience-mps@umbc.edu   

                                Title  \
0         

In [50]:
df_combined.tail(5)

Unnamed: 0,Section,Link,Title,Text
144,Faculty,https://dil.umbc.edu/faculty/person/yn78817/,John H Wan,Back to Directory List\n\nJohn H Wan\n\nInstr...
145,Faculty,https://dil.umbc.edu/faculty/person/qa97378/,Chaojie (Jay) Wang,Back to Directory List\n\nChaojie (Jay) Wang\...
146,Faculty,https://dil.umbc.edu/faculty/person/qo08734/,Xin Xue,Back to Directory List\n\nXin Xue\n\nInstruct...
147,Faculty,https://dil.umbc.edu/faculty/person/bd53800/,Waleed A Youssef,Back to Directory List\n\nWaleed A Youssef\n\...
148,Faculty,mailto:datascience-mps@umbc.edu,Faculty,Full-Time Faculty Members\nAjinkya Shishir Bor...


In [54]:
pip install langchain_groq

Note: you may need to restart the kernel to use updated packages.


In [55]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [56]:
pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [57]:
pip install langchain_openai

Note: you may need to restart the kernel to use updated packages.


In [64]:
%pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [52]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import pandas as pd

# # Example DataFrame (replace with your df_combined)
# df_combined = pd.DataFrame([
#     {"Section": "CSEE", "Link": "http://example.com/1", "Title": "Example 1", "Text": "This is a long text that needs splitting."},
#     {"Section": "ISSS", "Link": "http://example.com/2", "Title": "Example 2", "Text": "Another long text example for testing splitting functionality."}
# ])

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)

# List to store split documents
split_docs = []

# Iterate through each row of the DataFrame
for _, row in df_combined.iterrows():
    # Create a Document object
    text_document = Document(page_content=row["Text"], metadata={
        "Section": row["Section"],
        "Link": row["Link"],
        "Title": row["Title"]
    })
    
    # Split the Document into chunks
    docs = text_splitter.split_documents([text_document])
    
    # Append each chunk along with metadata to the result
    for doc in docs:
        split_docs.append({
            "Section": doc.metadata["Section"],
            "Link": doc.metadata["Link"],
            "Title": doc.metadata["Title"],
            "Chunk": doc.page_content
        })

# Create a new DataFrame with split chunks
df_split = pd.DataFrame(split_docs)

# Display the first few rows of the split DataFrame
print(df_split.head())


               Section                                            Link  \
0  CSEE Research Areas  https://www.csee.umbc.edu/csee-research-areas/   
1  CSEE Research Areas  https://www.csee.umbc.edu/csee-research-areas/   
2  CSEE Research Areas  https://www.csee.umbc.edu/csee-research-areas/   
3  CSEE Research Areas             https://www.csee.umbc.edu/research/   
4  CSEE Research Areas             https://www.csee.umbc.edu/research/   

                 Title                                              Chunk  
0  CSEE Research Areas  Communications and Photonics\nTopics: Cognitiv...  
1  CSEE Research Areas  Graphics and Visualization\nTopics: Character ...  
2  CSEE Research Areas  Security and Privacy\nTopics: Computer Securit...  
3             Research  Our department has a strong and diverse resear...  
4             Research  Find UMBC Faculty Experts\n\nAre you looking f...  


In [None]:
# from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
import os
from langchain_openai import OpenAIEmbeddings


# Set your OpenAI API Key
os.environ["OPENAI_API_KEY"] = ""  # Replace with your actual OpenAI API key

# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings()

# Convert the split DataFrame into Chroma-compatible format
documents = [
    Document(page_content=row["Chunk"], metadata={
        "Section": row["Section"],
        "Link": row["Link"],
        "Title": row["Title"]
    })
    for _, row in df_split.iterrows()
]

# Store embeddings in Chroma DB
db = Chroma.from_documents(documents, embeddings, persist_directory="chroma_data")

# Persist the database for reuse across sessions
db.persist()

print("Embeddings stored successfully in Chroma DB!")


Embeddings stored successfully in Chroma DB!


In [68]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Wrap the vector store for querying
chroma_index = VectorStoreIndexWrapper(vectorstore=db)

# # Example query
# query = "What pathways are available in the UMBC Data Science program?"
# results = chroma_index.query(query)

# print("\nQuery Results:")
# print(results)


In [70]:
groq_api_key=os.environ['groq_api_key']


In [72]:
llm=ChatGroq(groq_api_key=groq_api_key,
         model_name="mixtral-8x7b-32768")

from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.

<context>
{context}
</context>

Question: {input}""")

In [73]:
chroma_index.query("What are the different pathways available list out all",llm=llm)

'Based on the provided context, the following are the different pathways available for students:\n\n1. Cybersecurity Pathway\n2. Economics/Econometrics (Only available at the main campus)\n3. Healthcare Analytics (Only available at the main campus)\n4. Policy Analysis (Only available at the main campus)\n5. Project Management\n6. Management Sciences\n7. Aging Studies (Online)\n8. Advanced Computing and Analytics Pathway (Only available at the main campus)\n9. Bioinformatics (FAES NIH)\n10. Clinical Informatics (with UMB)\n\nThese pathways are designed to allow students who work in a particular domain to take classes specific to their industry.'

In [76]:
chroma_index.query("How early should i begin my opt application process",llm=llm)

"Based on the information provided, you should begin the OPT application process by requesting the OPT I-20 from the ISSS office early. Once you have the OPT I-20, you can then prepare and submit the OPT application to USCIS. It's important to note that OPT applications must be submitted to USCIS no earlier than 90 days before your program end date and no later than 60 days after your program end date. Therefore, you should aim to have your OPT I-20 ready and application prepared within the 90-day window before your program end date. Working with the ISSS office early in the process will help ensure that you meet these deadlines."

In [77]:
chroma_index.query("How do i apply for GA",llm=llm)

'To apply for the program, you should follow these steps:\n\n1. Go to the application website.\n2. Submit your online application.\n3. Include the following documents in your application:\n\t* A statement outlining your goals and expectations in the program, as well as your background and qualifications.\n\t* Your current resume.\n\t* Transcripts from each college or university you have attended.\n\t* Two reference letters.\n\t* If you are an international student, a TOEFL, IELTS, or PTE test score.\n\t* A non-refundable application fee of $50.\n\t* If you are a local student, a completed application including a residency form to determine in-state tuition eligibility.\n\nIncomplete applications will not be reviewed. If you have already started an application, you can finish it up.'

In [79]:
chroma_index.query("What is the syllabus of 603",llm=llm)

'Based on the provided information, I can tell you that DATA 603 is a course that requires some familiarity with big data platforms. If you have such familiarity, you can take it in your first semester, otherwise, it is recommended to take it after completing DATA 601. If you are taking one or two courses per semester, you might want to consider taking DATA 603 after DATA 602. However, the syllabus for DATA 603 is not provided in the information given. I would recommend checking the course catalog or contacting the department directly for detailed course information.'

In [86]:
chroma_index.query("Can I take 602 in my first semester",llm=llm)

'Based on the information provided, if you are familiar with data analysis and machine learning or have an undergraduate degree in Computer Science or Information Systems, you can take DATA 602 concurrently with DATA 601 in your first semester. However, if you are not familiar with these topics, it is recommended to take DATA 602 after completing DATA 601. Additionally, if you are not familiar with big data platforms, it is recommended to take DATA 603 after completing DATA 601. Therefore, taking DATA 602 in your first semester is possible under certain conditions.'

In [88]:
chroma_index.query("How do i apply for ggt",llm=llm)

'To apply for a GGT (graduate graduate stutor) position in the MPS Data Science program, you need to complete Data 601, 602, 603, and 604 and formally apply through the specified page. There is no need to get a recommendation from a faculty member for your application to be considered. Sending personal emails to faculty members, including the directors, will not increase your chances of receiving a GGT position.\n\nYou can find the link to apply on the page that contains the information you provided. Since I cannot share the link directly, I recommend searching for "MPS Data Science GGT application" or a similar query in your web browser. Make sure you are on the official program website before proceeding with your application.'

In [92]:
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain

# retriever=astra_vector_store.as_retriever()
# document_chain=create_stuff_documents_chain(llm,prompt)
# retrieval_chain=create_retrieval_chain(retriever,document_chain)


from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Assuming db is your Chroma instance
retriever = db.as_retriever()  # Use Chroma's retriever

# Create a documents chain with LLM and prompt
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Combine the retriever and document chain into a retrieval chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Example query
query = "How do i apply for GA?"
response = retrieval_chain.invoke({"input": query})

print("Response:")
print(response)

# # Example query
# query = "What pathways are available in the UMBC Data Science program?"

# # Call the chain
# response = retrieval_chain.invoke({"input": query})

# print("Response:")
# print(response)


Response:
{'input': 'How do i apply for GA?', 'context': [Document(metadata={'Link': 'https://dil.umbc.edu/prospective-students/master-of-professional-studies-in-data-science/', 'Section': 'Prospective Students', 'Title': 'Domestic Applicants'}, page_content='Your application should include the following documents: (Incomplete applications will not be reviewed. If you already have started an application, you can finish up)\n\nStatement outlining your goals and expectations in the program. This also is an opportunity for you to further explain your background and qualifications for the program and provide any additional information about you or your experience which will help with the evaluation of your application.\nCurrent resume (When applying online, upload your resume to the graduate school application.)\nTranscript(s) from each college  university attended\nLocal Students: Completed application including residency form (Residency form determines in-state tuition eligibility)\nInte

In [96]:
def ask_question(query):
    response = retrieval_chain.invoke({"input": query})
    print(f"Query: {query}")
    print(f"Response: {response}\n")

# Ask different questions
ask_question("What pathways are available in the UMBC Data Science program?")
ask_question("Can i take 602 in my first semester?")
ask_question("What electives can I take for Cybersecurity?")
ask_question("How do I apply for OPT?")

Query: What pathways are available in the UMBC Data Science program?
Response: {'input': 'What pathways are available in the UMBC Data Science program?', 'context': [Document(metadata={'Link': 'https://dil.umbc.edu/pathways-and-certificates/list-of-available-pathways-pathway-courses/', 'Section': 'Pathways & Certificates', 'Title': 'List of Available Pathways  Elective Courses'}, page_content='The pathways will allow students who work in a particular domain to take classes specific to their industry. Students are recommended to choose a pathway and take three courses from the same pathway. However, in case of class unavailability, students are welcome to choose courses from different pathways. We offer a bunch of DATA 690 Special Topics classes. Students, who meet the prerequisites of these Special Classes, can take up to 5 Special Topics classes.\n\nElective Data Science Courses\nCybersecurity Pathway\nEconomicsEconometrics (Only at the main campus)\nHealthcare Analytics (Only at the 

In [98]:
pip install fastapi uvicorn

Note: you may need to restart the kernel to use updated packages.


In [100]:
import re

def get_last_updated_date(log_file_path):
    """
    Extract the last updated date from the log file.
    """
    try:
        with open(log_file_path, 'r') as file:
            lines = file.readlines()
        
        # Look for the last date in the logs
        for line in reversed(lines):
            # Assuming log format: "2024-11-16 11:30:00 - INFO - Scraping completed"
            match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', line)
            if match:
                return match.group(0)  # Return the last matching timestamp
        return "Unknown"  # If no date is found
    except Exception as e:
        print(f"Error reading log file: {e}")
        return "Unknown"

# Example Usage
log_file_path = "C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/scraping_log.log"
last_updated_date = get_last_updated_date(log_file_path)
print(f"Last updated date: {last_updated_date}")


Last updated date: 2024-11-17 22:22:13


In [102]:
pip install FastAPI uvicorn

Note: you may need to restart the kernel to use updated packages.


In [None]:
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import json
import re
import nest_asyncio
import uvicorn

app = FastAPI()

# Mock retrieval_chain for testing
class MockRetrievalChain:
    def invoke(self, query):
        mock_responses = {
            "What electives can I take for Cybersecurity?": "You can take courses like Advanced Network Security, Secure Coding, and Cloud Security.",
            "What resources are available for current students?": "Current students have access to academic advising, library resources, and career services.",
            "Who are the faculty members in the Data Science program?": "Faculty members include Dr. Smith, Dr. Johnson, and Dr. Davis.",
            "What are the rules for OPT and CPT?": "OPT allows students to work in the U.S. for up to 12 months post-graduation, while CPT is for internships during the program.",
            "What should prospective students know about the UMBC Data Science program?": "Prospective students should know about program requirements, tuition fees, and career opportunities.",
            "What research opportunities are available in UMBC Data Science?": "Research opportunities include AI, cybersecurity, and health informatics projects."
        }
        return mock_responses.get(query["input"], "No information available.")

# Mock retrieval chain instance
retrieval_chain = MockRetrievalChain()

# Function to extract the last updated date from the log file
def get_last_updated_date(log_file_path):
    try:
        with open(log_file_path, 'r') as file:
            lines = file.readlines()
        for line in reversed(lines):
            match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', line)
            if match:
                return match.group(0)
        return "Unknown"
    except Exception as e:
        print(f"Error reading log file: {e}")
        return "Unknown"

# Function to dynamically handle queries using the retrieval chain
def ask_question(query):
    try:
        response = retrieval_chain.invoke({"input": query})
        print(f"Query: {query}")
        print(f"Response: {response}\n")
        return response
    except Exception as e:
        print(f"Error in ask_question: {e}")
        return "An error occurred while retrieving the response."

# Intent-to-query mapping for dynamic responses
intent_query_mapping = {
    "Courses_and_Electives_Information": "What electives can I take for Cybersecurity?",
    "Current_Student_Information": "What resources are available for current students?",
    "Faculty_Information": "Who are the faculty members in the Data Science program?",
    "OPT_and_CPT_Information": "What are the rules for OPT and CPT?",
    "Prospective_Student_Information": "What should prospective students know about the UMBC Data Science program?",
    "Research_Information": "What research opportunities are available in UMBC Data Science?",
}

@app.post("/webhook")
async def webhook(request: Request):
    try:
        req = await request.json()
        print("Incoming Request:", json.dumps(req, indent=2))

        # Extract the intent name
        intent_name = req.get("queryResult", {}).get("intent", {}).get("displayName", "")
        print(f"Triggered Intent: {intent_name}")

        # Handle the Default Welcome Intent
        if intent_name == "Default Welcome Intent":
            log_file_path = "C:/Users/almas/OneDrive/Desktop/Fall 2024/webscraping/scraping_log.log"
            last_updated_date = get_last_updated_date(log_file_path)
            response_text = (
                f"Hello! I was last updated on {last_updated_date}. "
                "Information might have changed after this date. How can I assist you today?"
            )
            return JSONResponse(content={"fulfillmentText": response_text})

        # Handle intents dynamically using the retrieval chain
        if intent_name in intent_query_mapping:
            query = intent_query_mapping[intent_name]
            response_text = ask_question(query)
            return JSONResponse(content={"fulfillmentText": response_text})

        # Fallback response for unknown intents
        return JSONResponse(content={"fulfillmentText": "I'm sorry, I couldn't process your request."})

    except Exception as e:
        print(f"Error in webhook: {e}")
        return JSONResponse(content={"fulfillmentText": "An error occurred while processing your request."})

# Apply nest_asyncio for Jupyter Notebook compatibility
nest_asyncio.apply()

# Run the FastAPI app
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=5000)


INFO:     Started server process [48416]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:5000 (Press CTRL+C to quit)


Incoming Request: {
  "responseId": "1f1680f3-0359-421f-aa25-683593eec036-c2998605",
  "queryResult": {
    "queryText": "can i take 602 in my first semester",
    "parameters": {
      "unit-length": {
        "amount": 602.0,
        "unit": "inch"
      },
      "ordinal": 1.0
    },
    "allRequiredParamsPresent": true,
    "fulfillmentMessages": [
      {
        "text": {
          "text": [
            ""
          ]
        }
      }
    ],
    "outputContexts": [
      {
        "name": "projects/retreiverbot-sheg/agent/sessions/2a91254e-7727-9427-3440-bb4ec4a68ba6/contexts/__system_counters__",
        "parameters": {
          "no-input": 0.0,
          "no-match": 0.0,
          "unit-length": {
            "amount": 602.0,
            "unit": "inch"
          },
          "unit-length.original": "602 in",
          "ordinal": 1.0,
          "ordinal.original": "first"
        }
      }
    ],
    "intent": {
      "name": "projects/retreiverbot-sheg/agent/intents/3d9f8f75-

In [None]:
# # Run the FastAPI app using nest_asyncio
# nest_asyncio.apply()
# # Run the FastAPI app
# if __name__ == "__main__":
#     uvicorn.run(app, host="0.0.0.0", port=5000)

In [None]:
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

app = FastAPI()

# Initialize LLM (e.g., OpenAI GPT-3)
llm = OpenAI(model="text-davinci-003", temperature=0.7)

# Prompt template for generating responses
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are an assistant for the UMBC Data Science program. Answer questions accurately based on the provided context.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
)

# Retrieval logic (example)
def retrieve_context(query):
    # Replace with retrieval logic (e.g., querying a vector database)
    # This is a placeholder example
    context = {
        "Courses_and_Electives_Information": "Courses available include DATA 601, DATA 602, and DATA 603.",
        "OPT_and_CPT_Information": "OPT allows international students to work after graduation. CPT is for internships during your studies.",
        "Research_Information": "Research areas include AI, Cybersecurity, and Data Science."
    }
    return context.get(query, "No specific context available.")

# Webhook endpoint
@app.post("/webhook")
async def dialogflow_webhook(request: Request):
    req = await request.json()

    # Extract query and intent from Dialogflow
    query = req.get("queryResult", {}).get("queryText", "")
    intent_name = req.get("queryResult", {}).get("intent", {}).get("displayName", "")

    print(f"Received query: {query}")
    print(f"Detected Intent: {intent_name}")

    # Retrieve context based on the intent
    context = retrieve_context(intent_name)

    # Fill the prompt with the retrieved context and query
    filled_prompt = prompt_template.format(context=context, question=query)

    # Use LLM to generate a response
    response = llm(filled_prompt)

    # Return response to Dialogflow
    return JSONResponse(content={"fulfillmentText": response.strip()})


# Run the FastAPI server
nest_asyncio.apply()
uvicorn.run(app, host="0.0.0.0", port=5000)


In [None]:
<script src="https://www.gstatic.com/dialogflow-console/fast/messenger/bootstrap.js?v=1"></script>
<df-messenger
  intent="WELCOME"
  chat-title="RetreiverBot"
  agent-id="ab6278c2-9910-44c2-8601-b908c78f403c"
  language-code="en"
></df-messenger>