# Web Scraping and Data Preprocessing

This notebook contains code related to web scraping and preparing data for export to CSV. The primary focus is on gathering and cleaning the data needed for use in the main application. Most of the work here revolves around preprocessing steps.


In [10]:

"""
This works the best so far in terms of going through a random set of links and getting the content. It captures the overview page twice, but I just went in manually and removed the duplicated content.
"""
import os
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urljoin

def setup_driver():
    """Create a headless Chrome WebDriver.

    Returns:
        The started ``webdriver.Chrome`` instance, or ``None`` when the driver
        could not be installed or launched (the error is printed).
    """
    print("Setting up headless Chrome browser...")

    # Command-line switches passed to Chrome, applied in this order.
    browser_flags = (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    try:
        chromedriver_path = ChromeDriverManager().install()
        return webdriver.Chrome(service=Service(chromedriver_path), options=opts)
    except Exception as err:
        print(f"Error setting up Chrome driver: {err}")
        return None

def _collect_program_links(page_source, base_url):
    """Return ``(name, absolute_url)`` pairs for every program link in the HTML.

    A link qualifies when its ``href`` contains ``ViewProgram`` (which also
    covers ``ViewProgramAngular``) and is not a ``javascript:`` pseudo-link.
    The name is the anchor text, falling back to the absolute URL when the
    anchor has no text.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    found = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if 'ViewProgram' not in href or href.startswith('javascript:'):
            continue
        full_url = urljoin(base_url, href)
        found.append((anchor.get_text(strip=True) or full_url, full_url))
    return found


def get_program_links(driver, url):
    """Get all program links from the featured programs page.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        url: Address of the featured-programs listing; also used as the base
            for resolving relative hrefs.

    Returns:
        A list of ``(name, url)`` tuples, de-duplicated by URL (first
        occurrence wins, page order preserved). Returns ``[]`` if loading or
        parsing the page fails; the error is printed, not raised.
    """
    print(f"Loading featured programs page: {url}")
    try:
        driver.get(url)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(3)  # extra settle time after <body> appears

        # First attempt - directly get links from the page
        program_links = _collect_program_links(driver.page_source, url)

        # Second attempt - if no links found, try search functionality
        if not program_links:
            print("No program links found directly. Attempting to use search...")
            try:
                search_button = driver.find_element(
                    By.XPATH, "//button[contains(text(), 'Search') or @type='submit']"
                )
                search_button.click()
                time.sleep(3)
                program_links = _collect_program_links(driver.page_source, url)
            except Exception as e:
                print(f"Search attempt failed: {e}")

        # De-duplicate by URL, keeping the first name seen for each URL.
        seen = set()
        unique = []
        for name, link_url in program_links:
            if link_url not in seen:
                seen.add(link_url)
                unique.append((name, link_url))
        print(f"Found {len(unique)} unique program links")
        return unique

    except Exception as e:
        print(f"Error getting program links: {e}")
        return []

def extract_tab_content(driver):
    """Extract content from all available tabs on a program page.

    The page URL, title and initial body text are recorded first. Tab
    candidates are then located with up to three strategies (selector based,
    structure based, generic clickables), stopping at the first strategy that
    returns anything. Each candidate is clicked and the resulting text is
    appended under a ``TAB: <label>`` header.

    Args:
        driver: Selenium WebDriver already positioned on the program page.

    Returns:
        The accumulated text. If a failure occurs outside the per-tab loop,
        returns a string starting with ``"Error extracting tab content: "``
        instead; per-tab failures are printed and skipped.
    """
    all_content = ""
    visited_tabs = set()  # identifiers of tabs that were already processed

    try:
        # Record the page URL and title before interacting with anything.
        page_url = driver.current_url
        page_title = driver.title
        all_content += f"PAGE URL: {page_url}\nPAGE TITLE: {page_title}\n\n"

        # Initial page content without clicking anything
        initial_content = driver.find_element(By.TAG_NAME, "body").text
        all_content += f"INITIAL PAGE CONTENT:\n{'-'*40}\n{initial_content}\n{'-'*40}\n\n"

        # Strategies are tried in order, from most to least specific.
        tab_finding_attempts = [
            lambda: find_tabs_by_selectors(driver),
            lambda: find_tabs_by_structure(driver),
            lambda: find_potential_interactive_elements(driver),
        ]

        tabs = []
        for attempt_func in tab_finding_attempts:
            if not tabs:  # Only try the next strategy if we haven't found tabs yet
                try:
                    tabs = attempt_func()
                    if tabs:
                        print(f"Found {len(tabs)} tabs using strategy")
                except Exception as e:
                    print(f"Tab finding strategy failed: {e}")

        if not tabs:
            print("Warning: No tabs found on this page")
            return all_content

        # Iterate by index rather than `for tab in tabs`: when a stale element
        # forces a refresh, `tabs` is replaced and the remaining iterations
        # must read from the NEW list. Rebinding the name inside a for-loop
        # over the old list would leave the loop on the stale elements.
        i = 0
        while i < len(tabs):
            tab = tabs[i]
            try:
                # Identifier built from text, href and id to avoid revisiting
                tab_id = driver.execute_script("""
                    return arguments[0].textContent + '_' + 
                           (arguments[0].getAttribute('href') || '') + '_' + 
                           (arguments[0].getAttribute('id') || '');
                """, tab)

                if tab_id in visited_tabs:
                    print(f"Skipping already visited tab: {tab_id}")
                    i += 1
                    continue

                visited_tabs.add(tab_id)

                # Tab label, falling back to a positional name.
                try:
                    label = tab.text.strip()
                except Exception:
                    label = f"Tab {i+1}"

                if not label:
                    label = f"Tab {i+1}"

                print(f"Processing tab: {label}")

                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
                time.sleep(1)

                # Snapshot the page so we can tell whether the click changed it.
                before_click = driver.find_element(By.TAG_NAME, "body").text

                try:
                    tab.click()
                    time.sleep(2)  # Wait for content to load
                except Exception as e:
                    # Native click failed (e.g. not interactable); click via JS.
                    print(f"  Failed to click tab: {e}")
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(2)

                after_click = driver.find_element(By.TAG_NAME, "body").text

                if before_click == after_click:
                    print(f"  No visible content change after clicking tab {label}")
                    tab_content = get_tab_section_content(driver, tab)
                else:
                    tab_content = after_click

                all_content += (
                    f"\n\n{'='*40}\n"
                    f"TAB: {label}\n"
                    f"{'='*40}\n\n"
                    f"{tab_content}\n"
                )

            except StaleElementReferenceException:
                print(f"Tab element became stale, refreshing tabs list")
                # Re-run the strategies; the first non-empty result replaces
                # `tabs`. Processing resumes at the next index, and
                # `visited_tabs` prevents re-processing handled tabs.
                for attempt_func in tab_finding_attempts:
                    try:
                        refreshed = attempt_func()
                    except Exception as e:
                        print(f"Tab refresh failed: {e}")
                        continue
                    if refreshed:
                        tabs = refreshed
                        print(f"Refreshed tab list, found {len(tabs)} tabs")
                        break

            except Exception as e:
                print(f"Error processing tab {i+1}: {e}")

            i += 1

        return all_content

    except Exception as e:
        print(f"Error in extract_tab_content: {e}")
        return f"Error extracting tab content: {str(e)}"

def find_tabs_by_selectors(driver):
    """Find tabs using common XPath selectors.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        A list of matching elements in selector order with duplicates
        removed. The selectors overlap, so the same element can match several
        of them; only its first occurrence is kept. A selector that raises is
        reported and skipped rather than aborting the search.
    """
    tab_selectors = [
        "//ul[contains(@class, 'nav-tabs')]/li/a",
        "//div[contains(@class, 'tabs')]/a",
        "//div[contains(@class, 'tab')]/a",
        "//a[contains(@class, 'tab')]",
        "//button[contains(@class, 'tab')]",
        "//a[contains(@data-toggle, 'tab')]",
        "//div[@role='tablist']/button",
        "//div[@role='tab']",
        "//li[contains(@class, 'tab')]/a",
        "//div[contains(@class, 'nav')]/a",
        # Common tab label texts
        "//a[contains(text(), 'Overview') or contains(text(), 'Academics') or contains(text(), 'Requirements') " +
        "or contains(text(), 'Costs') or contains(text(), 'Housing') or contains(text(), 'Dates') " +
        "or contains(text(), 'Locations') or contains(text(), 'Application')]"
    ]

    all_tabs = []
    seen = set()  # relies on Selenium WebElement equality/hash (by element id)
    for selector in tab_selectors:
        try:
            elements = driver.find_elements(By.XPATH, selector)
        except Exception as e:
            # Best-effort: keep going, but say which selector failed. Only the
            # exception type is printed because Selenium messages embed long
            # driver stack traces.
            print(f"Tab selector failed ({selector}): {type(e).__name__}")
            continue
        for element in elements:
            if element not in seen:
                seen.add(element)
                all_tabs.append(element)

    return all_tabs

def find_tabs_by_structure(driver):
    """Collect tab candidates from common structural layouts.

    Two XPath queries are run in order: links inside any ``ul``/``ol`` that
    has more than one ``li`` child, then ``a``/``button`` elements that share
    a ``div`` parent with at least one sibling of the same tag.

    Returns:
        The concatenated matches. If a query raises, the error is printed and
        whatever was collected before the failure is returned.
    """
    structural_xpaths = (
        # Links inside multi-item lists (typical navigation bars)
        "//ul[count(./li) > 1]//a | //ol[count(./li) > 1]//a",
        # Sibling links/buttons directly under the same div
        "//div[count(./a) > 1]/a | //div[count(./button) > 1]/button",
    )

    candidates = []
    try:
        for xpath in structural_xpaths:
            candidates += driver.find_elements(By.XPATH, xpath)
    except Exception as e:
        print(f"Error finding tabs by structure: {e}")

    return candidates

def find_potential_interactive_elements(driver, top_fraction=0.5):
    """Find any clickable elements that might change page content.

    Fallback strategy for pages without obvious tabs: after scrolling to the
    top, every ``a``, ``button`` and ``div[role='button']`` whose top edge
    lies within the upper part of the viewport and that is displayed is
    returned.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.
        top_fraction: Portion of the viewport height, measured from the top,
            in which an element's ``y`` coordinate must fall. Defaults to
            ``0.5`` (top half).

    Returns:
        A list of matching elements; empty if the lookup fails (the error is
        printed).
    """
    potential_elements = []

    try:
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(0.5)

        viewport_height = driver.execute_script("return window.innerHeight;")
        cutoff_y = viewport_height * top_fraction

        clickables = driver.find_elements(By.XPATH, "//a | //button | //div[@role='button']")

        for element in clickables:
            try:
                if element.location['y'] < cutoff_y and element.is_displayed():
                    potential_elements.append(element)
            except Exception:
                # Skip elements that cannot be inspected (e.g. went stale).
                # `Exception` rather than a bare except so Ctrl-C still works.
                continue

    except Exception as e:
        print(f"Error finding interactive elements: {e}")

    return potential_elements

def _find_element_by_id_or_none(driver, element_id):
    """Return the element with the given id, or ``None`` if the lookup fails."""
    try:
        return driver.find_element(By.ID, element_id)
    except Exception:
        return None


def get_tab_section_content(driver, tab):
    """Try to find the content section associated with a tab.

    Candidate pane ids are tried in this order, and the first one that
    resolves to an element wins:

    1. the tab's ``aria-controls`` attribute,
    2. the fragment after ``#`` in the tab's ``href``,
    3. the tab's ``id`` with ``'tab'`` replaced by ``'content'``,
    4. the tab's ``id`` with ``'-content'`` appended.

    Args:
        driver: Selenium WebDriver positioned on the page.
        tab: The tab element whose content pane should be located.

    Returns:
        The text of the matched pane, or the whole body text if no pane was
        found. If an unexpected error occurs, returns the literal string
        ``"Failed to extract tab content specifically"``.
    """
    tab_content = ""

    try:
        tab_id = tab.get_attribute('id')
        tab_href = tab.get_attribute('href')
        aria_controls = tab.get_attribute('aria-controls')

        candidate_ids = []
        if aria_controls:
            candidate_ids.append(aria_controls)
        if tab_href and '#' in tab_href:
            candidate_ids.append(tab_href.split('#')[-1])
        if tab_id:
            swapped_id = tab_id.replace('tab', 'content')
            # When the id has no 'tab' substring the replacement is a no-op
            # and would resolve to the tab itself, so it is skipped.
            if swapped_id != tab_id:
                candidate_ids.append(swapped_id)
            candidate_ids.append(tab_id + '-content')

        content_element = None
        for candidate_id in candidate_ids:
            content_element = _find_element_by_id_or_none(driver, candidate_id)
            if content_element is not None:
                break

        if content_element is not None:
            tab_content = content_element.text
        else:
            # No associated pane found: fall back to the visible body content
            tab_content = driver.find_element(By.TAG_NAME, "body").text

    except Exception as e:
        print(f"Error finding tab content section: {e}")
        tab_content = "Failed to extract tab content specifically"

    return tab_content

def safe_filename(text, max_length=100):
    """Convert text to a safe filename.

    Each of the characters ``< > : " / \\ | ? *`` is replaced with an
    underscore, then the result is truncated.

    Args:
        text: The string to sanitize.
        max_length: Maximum number of characters to keep (default 100).

    Returns:
        The sanitized, truncated string. No file extension is added.
    """
    unsafe_to_underscore = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
    return text.translate(unsafe_to_underscore)[:max_length]

def main(test_mode=True, test_limit=5,
         output_dir="cal_poly_programs_improved",
         featured_url="https://abroad.calpoly.edu/index.cfm?FuseAction=Programs.FeaturedPrograms"):
    """Scrape every featured program page and save its text to disk.

    For each program link found on ``featured_url`` the page is loaded, its
    tab content extracted, and the text written both to an individual
    ``<program name>.txt`` file and appended to ``all_programs.txt``.
    Failures for individual programs are appended to ``errors.txt`` and do
    not stop the run.

    Args:
        test_mode: When true, only the first ``test_limit`` links are processed.
        test_limit: Number of links to process in test mode.
        output_dir: Directory that receives all output files (created if missing).
        featured_url: Listing page from which program links are collected.

    Returns:
        None. Returns early if the browser cannot be started or no program
        links are found.
    """
    os.makedirs(output_dir, exist_ok=True)
    driver = setup_driver()
    if not driver:
        return

    try:
        program_links = get_program_links(driver, featured_url)
        if not program_links:
            return

        # If in test mode, only process the specified number of links
        if test_mode:
            print(f"TEST MODE: Only processing {test_limit} links")
            program_links = program_links[:test_limit]

        all_file = os.path.join(output_dir, "all_programs.txt")
        errors_file = os.path.join(output_dir, "errors.txt")
        with open(all_file, "w", encoding="utf-8") as f:
            f.write(f"{'TESTING MODE' if test_mode else 'FULL RUN'} - {len(program_links)} programs\n\n")

        # Links are de-duplicated by URL, not by name, and names are truncated
        # when sanitized, so two programs can map to the same filename. Track
        # names already written so a later program cannot overwrite an earlier one.
        used_filenames = set()

        for index, (program_name, program_url) in enumerate(program_links, start=1):
            print(f"\nProcessing program {index}/{len(program_links)}: {program_name}")
            try:
                driver.get(program_url)
                WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                time.sleep(3)

                content = extract_tab_content(driver)
                header = f"PROGRAM: {program_name}\nURL: {program_url}\n\n"
                full_text = header + content

                fname = safe_filename(program_name) + ".txt"
                if fname in used_filenames:
                    # Disambiguate with the program's position in the run.
                    fname = f"{safe_filename(program_name)}_{index}.txt"
                used_filenames.add(fname)

                path = os.path.join(output_dir, fname)
                with open(path, "w", encoding="utf-8") as f:
                    f.write(full_text)
                with open(all_file, "a", encoding="utf-8") as f:
                    f.write("\n\n" + "="*80 + "\n\n" + full_text)

                print(f"Saved program to {path}")
                time.sleep(2)  # pause between programs
            except Exception as e:
                print(f"Error processing program {program_name}: {e}")
                with open(errors_file, "a", encoding="utf-8") as f:
                    f.write(f"Failed on {program_name} ({program_url}): {str(e)}\n")

        print(f"\nDone! Check {output_dir}/ and {all_file}")
    finally:
        # Always release the browser, even on early return or error.
        driver.quit()

if __name__ == "__main__":
    # Set test_mode to True to process only the first `test_limit` links;
    # False processes every link that is found.
    test_mode = False
    test_limit = 5

    # Report the mode that is actually in effect.
    if test_mode:
        print(f"Running in TEST MODE - processing {test_limit} links")
        print("To process all links, change test_mode to False in the code")
    else:
        print("Running FULL RUN - processing all links")

    main(test_mode=test_mode, test_limit=test_limit)

Running in TEST MODE - processing 5 links
To process all links, change test_mode to False in the code
Setting up headless Chrome browser...
Loading featured programs page: https://abroad.calpoly.edu/index.cfm?FuseAction=Programs.FeaturedPrograms
Found 91 unique program links

Processing program 1/91: CSUIP Canada: Concordia University
Found 6 tabs using strategy
Processing tab: Overview
  No visible content change after clicking tab Overview
Processing tab: Academics
Processing tab: Cal Poly Minimum Requirements
Processing tab: Costs
Processing tab: Tab 5
  Failed to click tab: Message: element not interactable
  (Session info: chrome=136.0.7103.48)
Stacktrace:
0   chromedriver                        0x0000000102ebe924 cxxbridge1$str$ptr + 2825408
1   chromedriver                        0x0000000102eb6b6c cxxbridge1$str$ptr + 2793224
2   chromedriver                        0x00000001029f58d0 cxxbridge1$string$len + 89728
3   chromedriver                        0x0000000102a3dc9c cxxbri

Removing redundant text from each .txt file

In [3]:
import os
import re

def remove_last_tab_section(text):
    """
    Drop the final ``TAB: Tab N`` section from the text.

    The last header of the form ``=== / TAB: Tab <digits> / ===`` is located
    and everything from the start of that header onward is discarded. The
    result is stripped of surrounding whitespace. If no such header exists,
    the stripped input is returned.
    """
    tab_header = re.compile(r"(=+\s*\nTAB: Tab \d+\s*\n=+\s*)")

    # Walk all headers; the last one seen marks the cut position.
    cut_at = None
    for hit in tab_header.finditer(text):
        cut_at = hit.start()

    kept = text if cut_at is None else text[:cut_at]
    return kept.strip()

# === Paths ===
input_folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cal_poly_programs_improved"
output_folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs"

# The scraper's main() writes these two files into its output directory next
# to the per-program files. They are an aggregate and an error log, not
# programs, so they must not be carried forward as if they were.
non_program_files = {"all_programs.txt", "errors.txt"}

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# === Clean and save ===
for filename in os.listdir(input_folder):
    if not filename.endswith(".txt") or filename in non_program_files:
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    with open(input_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Drop the trailing "TAB: Tab N" section and surrounding whitespace.
    cleaned_text = remove_last_tab_section(raw_text)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

print("✅ All cleaned files saved to:", output_folder)


✅ All cleaned files saved to: /Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs


Next step:
Embeddings

First get all majors accepted from the 91 programs in a dataframe

In [94]:
import os
import re
import pandas as pd
import numpy as np

def extract_areas_of_study_to_df(folder_path):
    """Build a DataFrame of the "Areas of Study" listed in each program file.

    Every ``.txt`` file in ``folder_path`` is searched for the text
    ``Areas of Study`` followed by a comma-separated list. The pattern
    requires at least one comma, so a file listing a single area (or none)
    gets ``NaN``.

    Args:
        folder_path: Directory containing the program ``.txt`` files.

    Returns:
        A DataFrame with columns ``filename`` and ``areas_of_study``, one row
        per ``.txt`` file in ``os.listdir`` order. Areas are normalized to a
        single ``", "``-separated string. The two columns are present even
        when the folder has no ``.txt`` files.
    """
    areas_pattern = re.compile(
        r"Areas of Study\s*([A-Za-z0-9,&\-\/ ]+(?:,\s*[A-Za-z0-9,&\-\/ ]+)+)"
    )
    data = []

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        match = areas_pattern.search(content)
        if match:
            # Normalize spacing around each comma-separated entry.
            cleaned_string = ", ".join(area.strip() for area in match.group(1).split(","))
        else:
            cleaned_string = np.nan

        data.append({
            "filename": filename,
            "areas_of_study": cleaned_string,
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(data, columns=["filename", "areas_of_study"])

# === Run It ===
# Build the filename -> areas-of-study table from the cleaned program files.
# Files where the pattern does not match get NaN in `areas_of_study`.
folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs"
areas_of_study = extract_areas_of_study_to_df(folder)

Manually mapping the majors to some of the programs which didn't work above

In [96]:

# Hand-written "<filename> : <areas of study>" mapping, one entry per line,
# for programs whose areas could not be extracted automatically.
data = """
Cal Poly Global Program_ Cal Poly in Japan_ Architecture (Fall).txt : Architecture
CSUIP Germany_ Biberach University of Applied Sciences.txt : Architecture
Cal Poly Exchange with Hochschule München University of Applied Sciences (Munich) - BUSINESS.txt : Business Administration
Cal Poly Exchange with Lucerne University of Applied Sciences (ARCHITECTURE).txt : Architecture
CSUIP Germany_ Trossingen University of Music.txt : Music
CSUIP Italy_ Accademia di Belle Arti di Firenze.txt : Italian
Cal Poly Global Program_ Cal Poly in Costa Rica_ Agriculture & Conservation (Summer).txt : Not Specified
Cal Poly Global Program_ Cal Poly in Rome_ Software Engineering (Fall).txt : Software Engineering
Cal Poly Global Program_ Cal Poly in Lithuania_ Architectural Engineering (Summer).txt : Architectural Engineering
Cal Poly Global Program_ Cal Poly in Japan_ Architecture (Spring).txt : Architecture
CSUIP France_ Language & Culture at Aix-Marseille Université.txt : French
CSUIP Germany_ Freiburg University of Education.txt : German
CSUIP Mexico_ Tecnológico de Monterrey.txt : Not Specified
Cal Poly in South Korea_ Architecture (Spring).txt : Architecture
Cal Poly Global Program_ Cal Poly in Europe_ Wine & Viticulture (Summer).txt : Wine and Viticulture
Cal Poly Exchange with L'Ecole d'Architecture de Paris-Val-de-Seine (CAED).txt : Architecture, Other
Cal Poly Global Program_ Cal Poly in Switzerland & Finland_ Architecture (Summer).txt : Architecture
Cal Poly Global Program_ Cal Poly in Prague_ Construction Management (Summer).txt : Construction Management
Cal Poly Global Program_ Cal Poly in England & Scotland_ Literature (Summer).txt : English, Other
Cal Poly Exchange with Hochschule Munchen University of Applied Sciences (Munich) - ARCHITECTURE.txt : Architecture
Cal Poly Global Program_ Cal Poly in Rome_ Architecture (Fall).txt : Architecture
"""

# Skip blank lines and split on the FIRST " : " only, so each row has exactly
# two fields; both fields are stripped of stray whitespace.
lines = [line.strip() for line in data.strip().split("\n") if line.strip()]
parsed = [[part.strip() for part in line.split(" : ", 1)] for line in lines]

NaNs = pd.DataFrame(parsed, columns=["filename", "areas_of_study"])


In [97]:
# Fill the areas the automatic extraction left as NaN with the manual mapping.
# Written so the cell can be re-run: `NaNs` is indexed by filename only if it
# is not already (a second `set_index("filename")` would raise KeyError), and
# `areas_of_study` keeps `filename` as a regular column throughout.
if "filename" in NaNs.columns:
    NaNs = NaNs.set_index("filename")

areas_of_study["areas_of_study"] = areas_of_study["areas_of_study"].fillna(
    areas_of_study["filename"].map(NaNs["areas_of_study"])
)


This next cell produces a summary of all the information for each program, to be embedded for more accurate extraction.

In [113]:
import os
import pandas as pd
from langchain.chat_models import BedrockChat
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
# `load_dotenv` and `boto3` are used below but are not imported anywhere else
# in this notebook; import them here so the cell runs on a fresh kernel.
import boto3
from dotenv import load_dotenv

load_dotenv()  # load variables from a local .env file into the environment
aws_client = boto3.client("bedrock-runtime", region_name="us-west-2")

# === Claude model config ===
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
model_kwargs = {
    "max_tokens": 2048,
    "temperature": 0.0,
    "top_k": 250,
    "top_p": 0.9,
    "stop_sequences": ["\n\nHuman"],
}

# Initialize Claude via AWS Bedrock
model = BedrockChat(
    client=aws_client,
    model_id=model_id,
    model_kwargs=model_kwargs,
)

# Prompt template for summarizing; {program_text} is filled in per file.
messages = [
    ("system", "You are a helpful assistant that summarizes Cal Poly study abroad program information for students."),
    ("human", "Please summarize the following program into a helpful and complete overview of what a student should know:\n\n{program_text}")
]
prompt = ChatPromptTemplate.from_messages(messages)
chain = prompt | model | StrOutputParser()

# Path to your folder
folder_path = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cal_poly_programs5_text"

# Collect one {"filename", "summary"} record per .txt file
summaries = []

for file in os.listdir(folder_path):
    if file.endswith(".txt"):
        file_path = os.path.join(folder_path, file)
        with open(file_path, "r", encoding="utf-8") as f:
            program_text = f.read().strip()

        print(f"Summarizing: {file}")
        try:
            summary = chain.invoke({"program_text": program_text})
        except Exception as e:
            # A failed call is recorded in place of the summary so the loop
            # continues; such rows start with "Error: ".
            summary = f"Error: {e}"

        summaries.append({
            "filename": file,
            "summary": summary
        })

# Save to DataFrame
summary_df = pd.DataFrame(summaries)


  model = BedrockChat(


Summarizing: CSUIP France_ ESSCA School of Management.txt
Summarizing: CSUIP Taiwan_ National Taiwan University.txt
Summarizing: Cal Poly Exchange with Hochschule München University of Applied Sciences (Munich) - GRAPHIC COMMUNIC.txt
Summarizing: Cal Poly Exchange with KTH Royal Institute of Technology, Stockholm (CENG).txt
Summarizing: CSUIP Japan_ University of Tsukuba.txt
Summarizing: CSUIP United Kingdom_ Swansea University.txt
Summarizing: First Year GO (Global Opportunities) Program.txt
Summarizing: Cal Poly Global Program_ Cal Poly in Japan_ Architecture (Fall).txt
Summarizing: CSUIP Spain_ Universidad de Granada.txt
Summarizing: CSUIP Chile_ Pontificia Universidad Catolica de Chile.txt
Summarizing: CSUIP Germany_ Biberach University of Applied Sciences.txt
Summarizing: CSUIP Germany_ Reutlingen University.txt
Summarizing: CSUIP Italy_ CSU Florence Center.txt
Summarizing: CSUIP Germany_ Ludwigsburg University of Education.txt
Summarizing: Cal Poly Exchange with Rikkyo University

This function splits each study abroad program into its sections (costs, academics, etc.). This makes embedding possible, because each .txt file is too large to be embedded as a single chunk. Each section now acts as a separate chunk to be embedded.

In [125]:
import os
import pandas as pd

def parse_tabs_from_txt(file_path):
    """Extract tab name and content sections from a single .txt file.

    A section starts at a line beginning with ``TAB:`` and runs until the
    next such line or the end of the file. Lines beginning with ``===`` are
    treated as separators and dropped. Text before the first ``TAB:`` line is
    ignored.

    Args:
        file_path: Path to the program text file (read as UTF-8).

    Returns:
        A list of ``(tab_name, content)`` tuples in file order, both stripped
        of surrounding whitespace. Every ``TAB:`` header yields an entry, even
        when its content is empty.
    """
    marker = "TAB:"
    tabs = []
    current_tab = None  # None means no TAB: header has been seen yet
    current_content = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith(marker):
                if current_tab is not None:
                    tabs.append((current_tab, ''.join(current_content).strip()))
                # Strip only the leading marker so a label that itself
                # contains "TAB:" is left intact.
                current_tab = line[len(marker):].strip()
                current_content = []
            elif line.startswith("==="):
                continue
            elif current_tab is not None:
                current_content.append(line)

    # Flush the last section with the same rule as every other section.
    if current_tab is not None:
        tabs.append((current_tab, ''.join(current_content).strip()))

    return tabs

def create_long_format_df_from_folder(folder_path):
    """Create a long-format DataFrame with one row per (program, tab).

    Each ``.txt`` file in ``folder_path`` is parsed with
    ``parse_tabs_from_txt``; a file that fails to parse is reported and
    skipped.

    Args:
        folder_path: Directory containing the program ``.txt`` files.

    Returns:
        A DataFrame with columns ``program_name`` (the file name),
        ``tab_name`` and ``content``. The columns are present even when no
        rows were produced.
    """
    records = []

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, file_name)
        try:
            for tab, content in parse_tabs_from_txt(full_path):
                records.append({
                    "program_name": file_name,
                    "tab_name": tab,
                    "content": content
                })
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=["program_name", "tab_name", "content"])

# Example usage (you can replace with your folder):
# df_long = create_long_format_df_from_folder("/your/folder/path/here")


In [126]:
# One row per (program file, tab) from the cleaned program files.
df_long = create_long_format_df_from_folder("/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs")

In [134]:
# Rename the key column to `filename`, the key name used by `areas_of_study` and `summary_df`.
df_long.rename(columns={"program_name": "filename"}, inplace=True)

In [130]:
# Inner join on `filename`: only programs present in BOTH frames are kept.
merge = areas_of_study.merge(summary_df, on='filename', how='inner')

Unnamed: 0,filename,areas_of_study,summary
0,CSUIP France_ ESSCA School of Management.txt,"Business Administration, French","Here is a summary of the Cal Poly CSUIP France: ESSCA School of Management study abroad program:\n\nOverview:\n- This is a study abroad program through the CSU International Programs (CSUIP) system, allowing Cal Poly students to study at ESSCA School of Management in Aix-en-Provence, France.\n- ESSCA is one of France's leading business schools with 10 campuses worldwide. The Aix-en-Provence campus is newly opened.\n- Students can earn credit towards their major, minor, and GE requirements by taking business fundamentals courses like management, finance, marketing, etc. French language courses are also available.\n- Instruction is in English and French.\n- Students pay regular Cal Poly tuition and fees.\n\nAcademics:\n- Courses can fulfill GE, major, and minor requirements with appropriate approval. Pre-approved courses are listed, others require petitioning.\n- Students should verify prerequisites before enrolling in courses.\n\nEligibility:\n- Cal Poly students must meet a minimum 2.75 GPA requirement in addition to university-wide eligibility requirements for study abroad.\n\nCosts:\n- Students pay regular Cal Poly tuition to CSUIP.\n- Additional costs like housing, meals, etc. are paid directly - refer to the CSUIP website for estimates.\n- All approved students must pay a non-refundable Cal Poly international center fee.\n- CSU-mandated international insurance is required and automatically charged.\n\nLet me know if you need any other details about this ESSCA business program in the south of France!"
1,CSUIP Taiwan_ National Taiwan University.txt,"Anthropology and Geography, Asian Studies, Biology, Business Administration, Chemistry, Chinese, Economics, Engineering, Environmental Studies, Liberal Arts, Linguistics, Literature, Physics, Political Science, Psychology","Here is a summary of the Cal Poly study abroad program at National Taiwan University in Taiwan:\n\nOverview:\n- This is a study abroad program through the CSU International Programs system, allowing Cal Poly students to study at National Taiwan University (NTU) in Taipei.\n- NTU is ranked as one of the top 100 universities globally and top 25 in Asia. It has over 31,000 students.\n- The program allows students to earn major, minor, and GE credits while paying Cal Poly tuition.\n- Courses are offered in Chinese and English. All students must take a Mandarin Chinese language course.\n- Students with less than 1 year of college-level Mandarin can take courses in English in their major field.\n- Students with near-native Mandarin fluency can take regular university courses in Chinese or English.\n\nAcademics:\n- Courses can fulfill GE, major, and minor requirements with appropriate approval through petition process.\n- See the program website for list of pre-approved course equivalencies.\n\nRequirements:\n- Students must meet Cal Poly and CSU International Programs minimum eligibility requirements, including 3.0 GPA.\n- No language prerequisite required.\n\nCosts:\n- Students pay Cal Poly tuition rate.\n- Additional costs for housing, meals, airfare, books, etc. Apply CSU-mandated study abroad insurance.\n- Non-refundable Cal Poly international center fee required.\n\nThe program highlights NTU's top global ranking, Taipei's safety, and the opportunity to gain international experience valued by employers. Students can take major classes in English while immersing in Chinese language and culture."
2,Cal Poly Exchange with Hochschule München University of Applied Sciences (Munich) - GRAPHIC COMMUNIC.txt,"Graphic Communication, Liberal Arts","Here is a helpful summary of the Cal Poly study abroad program at Hochschule München University of Applied Sciences (Munich) for Graphic Communications students:\n\nOverview:\n- Exchange program where Cal Poly students directly enroll at the Munich university for up to 1 year\n- Take Graphic Communications courses taught in English\n- Can study abroad for a semester (fall or spring) or full academic year\n\nAcademics:\n- Take courses in the Graphic Communications department \n- Get courses pre-approved for major, minor, or GE credit before applying\n- 1 ECTS credit at Munich = 0.75 Cal Poly quarter units\n\nAdmissions:\n- Open to 3rd or 4th year Graphic Communications students\n- Minimum 2.5 GPA required\n- Two-part application process - apply to Cal Poly first, then to Munich if approved\n\nLocation:\n- Study in the beautiful and safe city of Munich, Germany\n- Hochschule München is located in the heart of Munich\n- Immerse yourself in the rich culture, activities, and travel opportunities\n\nProgram Details:\n- Housing arranged separately, limited student housing available\n- Fall semester runs Oct-Jan, spring semester Mar-July\n- Get involved with student clubs, activities on campus\n\nCosts:\n- Pay Cal Poly tuition and fees \n- Additional costs for housing, meals, travel, insurance, etc.\n- Estimated total around $12,000 for a semester\n\nStudent Testimonials:\n- Hear firsthand from Cal Poly students about their amazing experiences studying abroad in Munich!\n\nLet me know if you need any other details about this graphic communications exchange program in Munich!"
3,"Cal Poly Exchange with KTH Royal Institute of Technology, Stockholm (CENG).txt","Biomedical Engineering, Civil Engineering, Computer Engineering, Computer Science, Electrical Engineering, General Engineering, Industrial Engineering, Manufacturing Engineering, Materials Engineering, Mechanical Engineering, Software Engineering","Here is a helpful summary of the Cal Poly Exchange Program with KTH Royal Institute of Technology in Stockholm for engineering students:\n\nOverview:\n- Exchange program allows engineering students to directly enroll at KTH in Stockholm for 1-2 quarters or a full year\n- Immersive study abroad experience, students take courses alongside Swedish students\n- Around 8-10 Cal Poly students participate each year\n- KTH is one of Europe's top technical universities, founded in 1827\n\nAcademics:\n- Students take engineering courses taught in English at KTH\n- 1 ECTS credit = 0.75 Cal Poly quarter units\n- Major/support courses can get pre-approved by CENG advisors\n- Limited GE course options due to KTH's course unit values\n- Students work with advisors to get courses approved to transfer\n\nAdmissions:\n- Open to CENG students with 2.5+ GPA, good academic/disciplinary standing\n- Two-part application - first to Cal Poly, then to KTH if approved\n- Yearlong applicants preferred\n\nDates/Housing:\n- Fall semester (Aug-Jan) or Spring semester (Jan-June) options\n- On-campus student housing available through KTH housing office\n- Activities/clubs offered through KTH's campuses\n\nCosts:\n- Pay Cal Poly tuition + program fees around $4,700\n- Additional estimated costs like housing, meals, flights around $10,300\n- Financial aid applies for eligible students\n\nLet me know if you need any other details summarized!"
4,CSUIP Japan_ University of Tsukuba.txt,"Anthropology and Geography, Asian Studies, Biology, Chemistry, Economics, Engineering, Japanese, Marine Science, Philosophy, Physics","Here is a summary of the Cal Poly CSUIP Japan: University of Tsukuba study abroad program:\n\nOverview:\n- This is a study abroad program through the CSU International Programs (CSUIP) system at the University of Tsukuba in Japan.\n- The University of Tsukuba is one of the oldest and largest national universities in Japan, known for scientific research. It has over 3,000 international students.\n- Students focus on taking Japanese language courses through the International Student Center, supplemented by elective courses taught in English in areas like culture, international relations, biology, psychology, etc.\n- Classes are available to earn credit towards major, minor, and GE requirements.\n- Tuition is the same as at the student's home CSU campus.\n\nAcademics:\n- Courses can fulfill GE, major, and minor requirements with appropriate approval. Pre-approved courses are listed that don't need petitions.\n- For non-approved courses, students must submit course substitution forms.\n\nCal Poly Requirements:\n- Students must meet Cal Poly's minimum eligibility requirements for study abroad in addition to any program-specific requirements set by CSUIP.\n\nCosts:\n- Students pay CSU tuition directly to CSUIP.\n- Additional costs like housing, meals, etc. are paid to the program - refer to CSUIP's website for full budget details.\n- All approved students must pay Cal Poly's non-refundable International Center fee.\n- CSU-mandated international insurance is required and automatically charged."
...,...,...,...
86,Cal Poly Exchange with Hochschule Munchen University of Applied Sciences (Munich) - ARCHITECTURE.txt,Architecture,"Here is a helpful summary of the Cal Poly study abroad program at Hochschule München University of Applied Sciences in Munich, Germany for architecture students:\n\nOverview:\n- This is a Cal Poly international exchange program where architecture students directly enroll at Hochschule München (HM) for an academic year (October-July).\n- Students take architecture courses at HM to fulfill major requirements while being immersed in Munich.\n- It is a highly independent program ideal for students who can take initiative in the study abroad process.\n\nAcademics:\n- Students take courses from HM's regularly offered courses in English, which are pre-approved to transfer to Cal Poly.\n- Most courses will likely apply to the architecture major, as most general education courses may not meet the 4-unit requirement.\n- Students must petition courses not already pre-approved by submitting course syllabi for evaluation.\n\nAdmissions:\n- Open only to 4th year architecture majors nominated by Cal Poly's Architecture Department.\n- Minimum 2.5 GPA, good academic/disciplinary standing required.\n- Two-step application process - first apply to Cal Poly, then to HM if approved.\n\nLocation:\n- Study in the heart of Munich, ranked among the world's highest quality of living.\n- Safe city combining traditional charm with modern amenities and easy access to cultural activities.\n\nProgram Details:\n- Students arrange their own housing, often in dormitories or apartments.\n- Immersive experience with German student life and opportunities to travel in Europe.\n\nCosts:\n- Estimate around $27,000 for an academic year, including Cal Poly tuition, housing, meals, travel, insurance, and other living expenses.\n\nOverall, this program allows architecture students to directly enroll at a prestigious German university in the vibrant city of Munich for a full academic year 
cultural and academic immersion."
87,Cal Poly Global Program_ Cal Poly in Rome_ Architecture (Fall).txt,Architecture,"Here is a summary of the Cal Poly Global Program: Cal Poly in Rome: Architecture (Fall) for students:\n\nOverview:\n- This is a study abroad program offered by Cal Poly where students can study architecture in Rome, Italy during the fall quarter/semester.\n\nLocation:\n- The program takes place in Rome, the capital city of Italy known for its rich history, architecture, art, and culture.\n\nAcademics:\n- Architecture students take Cal Poly courses taught by Cal Poly faculty while abroad in Rome. Courses focus on topics like architectural design, history, theory, and more with an emphasis on experiencing and studying the architecture of Rome first-hand.\n- Students are integrated into the local culture through activities, field trips, etc.\n\nAdmissions:\n- The program is open to Cal Poly architecture students who meet certain requirements regarding class standing, GPA, prerequisites, etc.\n- There is an application process to be admitted to the study abroad program.\n\nCosts:\n- In addition to normal Cal Poly tuition, there are study abroad fees that cover housing, some meals, field trips, insurance, etc.\n- Financial aid can apply towards the program costs.\n- Additional personal expenses like airfare, food, personal travel, etc. are the responsibility of the student.\n\nOverall, this rigorous architecture program allows Cal Poly students to live and study in the architecturally rich city of Rome for an immersive learning experience."
88,CSUIP Ghana_ University of Ghana.txt,"Agriculture, Anthropology and Geography, Biochemistry, Biology, Business Administration, Chemistry, Earth Science, Economics, Engineering, English, Environmental Studies, Health Science, History, Liberal Arts, Linguistics, Literature, Marine Science, Music, Philosophy, Physics, Political Science, Psychology, Public Health, Religion/Theology, Social Sciences, Sociology, Zoology","Here is a summary of the Cal Poly CSUIP Ghana: University of Ghana study abroad program:\n\nOverview:\n- This is a study abroad program through the CSU International Programs (CSUIP) system at the University of Ghana in Accra.\n- The University of Ghana is one of the top universities in West Africa with around 40,000 students.\n- Students can study for a semester or full academic year.\n- All instruction is in English.\n- Areas of study include agriculture, business, sciences, liberal arts, and more.\n- Housing is in university dormitories.\n\nAcademics:\n- Students can earn credits towards their major, minor, and GE requirements.\n- Required courses include Society and Culture of Ghana, Twi language, and a service-learning course.\n- Other courses are taken from the regular university offerings.\n\nRequirements:\n- Cal Poly minimum 2.75 GPA requirement\n- No language prerequisite\n- Must meet general Cal Poly and CSUIP eligibility requirements\n\nCosts:\n- Students pay standard CSU tuition to CSUIP\n- Additional costs for housing, meals, etc. - check CSUIP website for details\n- Required to purchase CSU international health insurance\n\nThe program allows students to gain international experience and understanding of Ghanaian culture and society through integrated coursework, service-learning, and living in Accra. Students should review all requirements and costs carefully when applying."
89,CSUIP Japan_ Waseda University.txt,"Asian Studies, Communication Studies, Economics, History, Japanese, Liberal Arts, Linguistics, Literature, Political Science, Sociology","Here is a summary of the Cal Poly CSUIP Japan: Waseda University study abroad program:\n\nOverview:\n- This is a CSU system-wide international exchange program where students study at Waseda University in Tokyo, Japan for an academic year (September-August).\n- Students enroll in one of three schools at Waseda: Center for Japanese Language (CJL), School of International Liberal Studies (SILS), or School of Political Science and Economics (PSE).\n- The CJL program focuses primarily on intensive Japanese language study. SILS offers courses in English across liberal arts subjects plus Japanese language. PSE has courses for political science, international relations, and economics majors.\n- A CSU program coordinator provides support services on-site in Tokyo.\n\nAcademics:\n- Students are required to take the equivalent of 15 CSU units per semester.\n- Courses can potentially fulfill GE, major, or minor requirements with appropriate approval through Cal Poly's course substitution process.\n- Students should refer to the program website for specific course offerings each term.\n\nEligibility:\n- Minimum 3.0 GPA required for PSE and SILS programs, 2.75 GPA for CJL program.\n- Must be sophomore, junior or senior standing.\n- 1 term of college-level Japanese language recommended.\n\nCosts:\n- Students pay standard CSU campus tuition.\n- Additional costs for housing, meals, airfare, etc. Apply standard Cal Poly international program fees.\n\nHousing:\n- Homestays with Japanese families or off-campus dorms arranged through Waseda University.\n\nLocation:\n- Program is based at Waseda's campus in central Tokyo, providing access to Japan's capital and largest city.\n\nLet me know if you need any other details summarized!"


In [131]:
# Reshape `merge` to long form: the "summary" and "areas_of_study" columns become
# rows, labelled by `tab_name`, with their text in `content`.
long_merge = merge.melt(
    id_vars=["filename"],
    value_vars=["summary", "areas_of_study"],
    var_name="tab_name",
    value_name="content",
)

In [137]:
# Stack the melted rows on top of `df_long` and renumber the index from 0.
frames_to_stack = [long_merge, df_long]
final_merge = pd.concat(frames_to_stack, ignore_index=True)


Extracting a hyperlink for each program so that students can click it and go directly to the Cal Poly site if they want to see more information.

In [140]:
import os

def extract_urls_from_second_line(folder_path):
    """Map each ``.txt`` file in ``folder_path`` to the URL on its second line.

    A file contributes a URL only when its second line starts with ``URL:``;
    every other ``.txt`` file (including files with fewer than two lines)
    maps to ``None``. Entries that do not end in ``.txt`` are ignored.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict: ``{filename: url_or_None}``.
    """
    prefix = "URL:"
    urls = {}

    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            file.readline()                # line 1 is not needed
            second_line = file.readline()  # '' when the file has fewer than two lines
        if second_line.startswith(prefix):
            # Cut off only the leading label; a blanket replace would also
            # delete any "URL:" that appears inside the address itself.
            urls[filename] = second_line[len(prefix):].strip()
        else:
            urls[filename] = None  # No URL found on line 2

    return urls

In [141]:
# Collect {filename: url or None} for every .txt file in the local cleaned_programs folder.
# NOTE(review): absolute, machine-specific path; it must be changed to run elsewhere.
urls = extract_urls_from_second_line("/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs")

In [329]:
# Remove every literal ".txt" from the filename column (regex=False -> plain substring).
# NOTE(review): this cell's execution count (329) is later than the cells further down
# that turn `urls` into a DataFrame (In [143]) and melt it (In [148]); on a
# top-to-bottom run `urls` is still the dict returned by extract_urls_from_second_line.
urls["filename"] = urls["filename"].str.replace(".txt", "", regex=False)

In [331]:
# Drop the `tab_name` column (added by the melt in In [148]) in place.
# NOTE(review): executed as In [331], i.e. after the concat in In [171]; if run before
# that concat, the url rows would presumably reach it without a tab_name — confirm.
urls.drop(columns=["tab_name"], inplace=True)

In [332]:
# Write the url table to urls.csv in the working directory, without the index column.
urls.to_csv("urls.csv", index=False)

In [143]:
# Turn the {filename: url} dict into a two-column DataFrame.
# NOTE(review): rebinds `urls` from dict to DataFrame, so this cell is not safe to re-run.
urls = pd.DataFrame(urls.items(), columns=["filename", "url"])

In [148]:
# Long form to match the other frames: tab_name is "url" and content holds the link.
urls = urls.melt(
    id_vars=["filename"],
    value_vars=["url"],
    var_name="tab_name",
    value_name="content",
)

In [171]:
# Append the url rows below the existing long-form rows; ignore_index renumbers from 0.
final_merge2 = pd.concat(
    [final_merge, urls],
    ignore_index=True,
)


In [172]:
# Remove every literal ".txt" from the filename column (regex=False -> plain substring).
final_merge2["filename"] = final_merge2["filename"].str.replace(".txt", "", regex=False)

Embedding function using Titan

In [275]:
import boto3

# Module-level Bedrock runtime client (region us-west-2) used by get_titan_embedding.
bedrock = boto3.client('bedrock-runtime', region_name='us-west-2')

def get_titan_embedding(text):
    """Return the Titan embedding for ``text``.

    Sends ``text`` to the ``amazon.titan-embed-text-v1`` model through the
    module-level ``bedrock`` client and returns the ``embedding`` field of
    the JSON response.

    Args:
        text: String to embed.

    Returns:
        The response's ``embedding`` value, or ``None`` when ``text`` is not
        a non-blank string (``None``, NaN, ``""``); no request is sent in
        that case.
    """
    import json  # local import: this cell does not import json itself

    # Missing cells (e.g. programs without a URL line) reach this function as
    # None/NaN when it is applied over a DataFrame column; skip them instead
    # of sending an invalid request.
    if not isinstance(text, str) or not text.strip():
        return None

    response = bedrock.invoke_model(
        modelId='amazon.titan-embed-text-v1',
        body=json.dumps({
            "inputText": text
        }),
        contentType='application/json'
    )
    result = json.loads(response['body'].read())
    return result['embedding']


In [175]:
# Embed every row's content; get_titan_embedding is called once per row.
# NOTE(review): url rows can carry content=None (files with no "URL:" second line).
final_merge2['embedding'] = final_merge2['content'].apply(get_titan_embedding)


In [177]:
# Keep every row except the "areas_of_study" ones.
final_merge2 = final_merge2[final_merge2['tab_name'] != "areas_of_study"]

In [178]:
# Remove ".txt" from the filenames so they line up with final_merge2's filename column.
# NOTE(review): `areas_of_study` is created outside this section of the notebook.
areas_of_study["filename"] = areas_of_study["filename"].str.replace(".txt", "", regex=False)

In [179]:
# Attach the areas_of_study columns to every row by filename.
# how='inner' keeps only filenames present in both frames.
final_merge2 = final_merge2.merge(areas_of_study, on='filename', how='inner')

In [181]:
# Write the embedded rows to cal_poly_embeddings.csv in the working directory, without the index.
final_merge2.to_csv("cal_poly_embeddings.csv", index=False)

In [163]:
def get_unique_majors(df):
    """Return the sorted, de-duplicated majors listed in the areas-of-study rows.

    Rows are selected where ``tab_name`` equals ``areas_of_study``
    (case-insensitive); their ``content`` is split on commas, newlines and
    semicolons, each piece is stripped, and empty pieces are discarded.

    Args:
        df: DataFrame with ``tab_name`` and ``content`` columns.

    Returns:
        list[str]: unique major names in ascending order.
    """
    is_area_row = df['tab_name'].str.lower() == 'areas_of_study'
    pieces_per_row = df[is_area_row]['content'].str.split(',|\n|;').dropna()

    unique_majors = set()
    for pieces in pieces_per_row:
        for piece in pieces:
            cleaned = piece.strip()
            if cleaned:
                unique_majors.add(cleaned)
    return sorted(unique_majors)

# NOTE(review): this ran as In [163], before the areas_of_study rows were filtered out of
# final_merge2 (In [177]); run top-to-bottom it would presumably return [] — confirm.
all_majors = get_unique_majors(final_merge2)

All code from this point on is not relevant to the working dashboard that was submitted. It is a possible future implementation of the study abroad assistant: building a knowledge base of as many pre-generated question-and-answer pairs as possible. This would significantly reduce latency, because when a user queries, the only backend work would be embedding their query, computing cosine similarity against the pre-existing Q/A pairs, and returning the answer to the most similar question. This is highly efficient because it does not call any LLM at query time, so both costs and latency would be significantly reduced.

In [None]:
import os
import shutil
import boto3
import pandas as pd
from pathlib import Path
import json

# === Claude Call ===
def call_claude(client, program_content, filename, num_questions):
    """Ask Claude (through Bedrock) to invent Q&A pairs for one program.

    Args:
        client: Object exposing ``invoke_model`` (a Bedrock runtime client).
        program_content: Full text of the program file.
        filename: Name of the program file, included in the prompt.
        num_questions: How many Q&A pairs to request.

    Returns:
        Whatever ``parse_response`` returns for the model's text reply.
    """
    prompt = f"""Generate {num_questions} realistic questions and answers about this Cal Poly study abroad program.

Program file: {filename}
Program content: {program_content}

For each question, provide:
1. A natural question a student would ask
2. A detailed answer based on the content
3. Metadata: a short 2–4 word descriptor (e.g., "Cost Info", "Housing Options", "Language Requirements") to describe the question
4. Make sure to put the program namne in the question, so it is clear which program the question is about. Do this for every question.

Format as: Q: [question] | M: [metadata] | A: [answer]

Generate {num_questions} Q&A pairs:"""

    body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2048,
        # The key must be "content"; it was misspelled, so the prompt was never
        # sent under the key the Messages API reads.
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3
    })

    response = client.invoke_model(
        body=body,
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        accept="application/json",
        contentType="application/json"
    )

    response_body = json.loads(response.get('body').read())
    # `or [{}]` also covers an explicitly empty content list, not just a missing key.
    content_blocks = response_body.get('content') or [{}]
    content = content_blocks[0].get('text', '')
    return parse_response(content)

# === Parse Claude's Response ===
def _text_after_label(segment, label):
    """Return the stripped text following the first ``label`` in ``segment``."""
    _, found, remainder = segment.partition(label)
    return remainder.strip() if found else segment.strip()


def parse_response(text):
    """Parse ``Q: ... | M: ... | A: ...`` lines into question/answer records.

    Only lines containing all three labels are considered. Each is split on
    the first two ``|`` characters, so a ``|`` inside the answer is kept.
    Only the leading label of each field is removed, so text such as
    ``GPA: 3.0`` inside an answer is left intact.

    Args:
        text: Raw model reply.

    Returns:
        list[dict]: one ``{'question', 'answer'}`` dict per parsed line; the
        question has the metadata appended as ``"<question>  # <metadata>"``.
        Lines that carry the labels but not three ``|``-separated fields are
        reported with ``print`` and skipped.
    """
    qa_pairs = []
    for line in text.split('\n'):
        if not ('Q:' in line and 'M:' in line and 'A:' in line):
            continue

        # maxsplit=2: everything after the second '|' belongs to the answer.
        parts = line.split('|', 2)
        if len(parts) < 3:
            print(f"Failed to parse: {line} - expected 3 '|'-separated fields, got {len(parts)}")
            continue

        question = _text_after_label(parts[0], 'Q:')
        metadata = _text_after_label(parts[1], 'M:')
        answer = _text_after_label(parts[2], 'A:')

        # Append metadata tag at the end of the question
        question_with_tag = f"{question}  # {metadata}"

        qa_pairs.append({
            'question': question_with_tag,
            'answer': answer
        })
    return qa_pairs

# === Process One File ===
def process_file(file_path, bedrock_client, questions_per_program):
    """Generate Q&A records for a single program file.

    Reads the file as UTF-8, passes its text to ``call_claude`` and labels
    every returned pair with the file's name. Any exception raised by
    ``call_claude`` is printed and results in an empty list.

    Args:
        file_path: Path-like object with a ``name`` attribute.
        bedrock_client: Client forwarded to ``call_claude``.
        questions_per_program: Number of pairs to request.

    Returns:
        list[dict]: records with ``filename``, ``question`` and ``answer``.
    """
    name = file_path.name
    print(f"Processing: {name}")
    with open(file_path, 'r', encoding='utf-8') as handle:
        program_text = handle.read()

    try:
        generated = call_claude(bedrock_client, program_text, name, questions_per_program)
    except Exception as err:
        print(f"Error generating Q&A for {name}: {err}")
        return []

    rows = []
    for pair in generated:
        rows.append({
            'filename': name,
            'question': pair['question'],
            'answer': pair['answer'],
        })
    return rows

# === Main Function (No CSV) ===
def process_batch_to_dataframe(source_dir, processed_dir, questions_per_program=20, batch_size=10):
    """Generate Q&A pairs for up to ``batch_size`` program files.

    Files are taken from ``source_dir`` in sorted name order. A file is
    moved to ``processed_dir`` only after it produced at least one record;
    a file that produced nothing (for example because the model call
    failed) stays in ``source_dir`` so a later run can retry it.

    Args:
        source_dir: Folder holding the unprocessed ``*.txt`` files.
        processed_dir: Folder that receives successfully processed files
            (created if missing).
        questions_per_program: Number of pairs requested per file.
        batch_size: Maximum number of files to attempt in this call.

    Returns:
        pandas.DataFrame: one row per generated pair; empty when there is
        nothing to process.
    """
    bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2")
    os.makedirs(processed_dir, exist_ok=True)

    # glob order is not guaranteed; sort so every run selects the same files.
    batch_files = sorted(Path(source_dir).glob("*.txt"))[:batch_size]

    if not batch_files:
        print("No unprocessed files found.")
        return pd.DataFrame()

    all_records = []

    for file_path in batch_files:
        records = process_file(file_path, bedrock_client, questions_per_program)
        if not records:
            # Moving the file here would mark a failed program as done and lose it.
            print(f"No Q&A generated for {file_path.name}; leaving it in place for a retry.")
            continue
        all_records.extend(records)
        shutil.move(str(file_path), os.path.join(processed_dir, file_path.name))

    return pd.DataFrame(all_records)

# === Run and Assign to `batch6` ===
# One small batch per run; the result is kept in `batch6` for the later concat.
if __name__ == "__main__":
    # Absolute, machine-specific folders: files are read from processed_programs
    # and moved into cleaned_programs by process_batch_to_dataframe.
    source_folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/processed_programs"
    processed_folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs"

    batch6 = process_batch_to_dataframe(
        source_dir=source_folder,
        processed_dir=processed_folder,
        questions_per_program=20,
        batch_size=2
    )


Processing: Cal Poly Exchange with TECNUN, University of Navarra, Spain (CENG).txt
Processing: Cal Poly Global Program_ Cal Poly in Rome_ Mechanics.txt


The above function does not ask some common questions, so the function below calls Claude to ask and answer specific, highly common questions that students would ask when exploring various study abroad programs.

In [296]:
import os
import shutil
import boto3
import pandas as pd
from pathlib import Path
import json

# === CUSTOM QUESTIONS WITH METADATA ===
# Each entry is (question template, metadata tags). The template's {program_name}
# placeholder is filled in by call_claude when it builds the prompt; process_file
# attaches the tags to the parsed answers. Keep the entries in the order the
# questions should be asked.
CUSTOM_QUESTIONS_WITH_METADATA = [
    ("What courses can I take in the {program_name} program?", ["Courses", "Academics", "Subjects"]),
    ("What's the housing situation like for the {program_name} program? Do I live on campus or off campus?", ["Housing", "Living", "Logistics"]),
    ("What language are the classes taught in for the {program_name} program?", ["Language", "Instruction", "Academics"]),
    ("How long can I study abroad in the {program_name} program - just a semester or a full year?", ["Duration", "Term Length", "Timeline"]),
    ("What's the estimated total cost for the {program_name} program?", ["Cost", "Budget", "Finances"]),
    ("What are the basic requirements to be eligible for the {program_name} program?", ["Eligibility", "Requirements", "Admission"]),
    ("What kind of activities and student life are available in the {program_name} program?", ["Student Life", "Activities", "Experience"]),
    ("Where exactly is the {program_name} program located and what's the area like?", ["Location", "Environment", "City Info"]),
    ("Will my credits from the {program_name} program transfer back to Cal Poly?", ["Credits", "Transferability", "Academics"]),
    ("What majors or areas of study does the {program_name} program support?", ["Majors", "Departments", "Fields of Study"]),
    ("Do I need to get any special visas or documentation for the {program_name} program?", ["Visas", "Travel Docs", "Requirements"]),
    ("What's included in the {program_name} program fee vs. what do I pay extra for?", ["Fees", "Inclusions", "Finances"]),
    ("When does the academic year/semester start and end for the {program_name} program?", ["Calendar", "Dates", "Timeline"]),
    ("Are there any faculty coordinators I can talk to about the {program_name} program?", ["Advisors", "Contacts", "Support"]),
    ("What's the campus like for the {program_name} program - facilities, technology, recreation?", ["Campus", "Facilities", "Environment"])
]

# === Claude Call with Fixed Questions ===
def call_claude(client, program_content, filename):
    """Have Claude answer the fixed question list for one program.

    The program name is the file stem with underscores replaced by spaces.
    Every template in ``CUSTOM_QUESTIONS_WITH_METADATA`` is formatted with
    that name and appended to the prompt as a ``- `` bullet.

    Args:
        client: Object exposing ``invoke_model`` (a Bedrock runtime client).
        program_content: Full text of the program file.
        filename: Name of the program file.

    Returns:
        Whatever ``parse_response_fixed`` returns for the model's text reply.
    """
    program_name = Path(filename).stem.replace("_", " ")

    instructions = f"""You are a helpful assistant answering questions about Cal Poly study abroad programs.

Program file: {filename}
Program name: {program_name}
Program content:
{program_content}

Below are specific questions students might ask. Answer each one using only the content of the program. If there isn't enough information to answer accurately, respond with "Information not available."

For each question, respond with:
Q: [question]
A: [detailed answer]

Questions:"""

    bullets = "".join(
        f"\n- {template.format(program_name=program_name)}"
        for template, _tags in CUSTOM_QUESTIONS_WITH_METADATA
    )
    prompt = instructions + bullets

    request_payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4096,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3
    }

    raw_response = client.invoke_model(
        body=json.dumps(request_payload),
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        accept="application/json",
        contentType="application/json"
    )

    decoded = json.loads(raw_response.get('body').read())
    reply_text = decoded.get('content', [{}])[0].get('text', '')
    return parse_response_fixed(reply_text)

# === Response Parser ===
def parse_response_fixed(text):
    """Parse a reply made of ``Q:`` / ``A:`` lines into question/answer pairs.

    A line starting with ``Q:`` begins a new pair and a line starting with
    ``A:`` begins its answer; any other line is appended to the current
    answer, separated by a space. Only the leading label is removed from a
    line, so text such as ``GPA: 3.0`` inside a question or answer is kept.
    A pair is emitted only when both its question and answer are non-empty.

    Args:
        text: Raw model reply.

    Returns:
        list[dict]: ``{'question', 'answer'}`` dicts in reply order.
    """
    qa_pairs = []
    current_question = ""
    current_answer = ""

    for line in text.split('\n'):
        stripped = line.strip()
        if stripped.startswith("Q:"):
            # A new question closes the previous pair, if it is complete.
            if current_question and current_answer:
                qa_pairs.append({
                    "question": current_question.strip(),
                    "answer": current_answer.strip()
                })
            current_question = stripped[len("Q:"):].strip()
            current_answer = ""
        elif stripped.startswith("A:"):
            current_answer = stripped[len("A:"):].strip()
        else:
            current_answer += " " + stripped

    if current_question and current_answer:
        qa_pairs.append({
            "question": current_question.strip(),
            "answer": current_answer.strip()
        })

    return qa_pairs

# === Process One File ===
def process_file(file_path, bedrock_client):
    """Answer the fixed question list for one program file and tag each answer.

    Reads the file as UTF-8, passes its text to ``call_claude`` and appends
    ``[TAGS: ...]`` to each returned question. Tags are looked up by the
    question text; when the model reworded a question, the tags of the
    template at the same position are used instead. Any exception raised by
    ``call_claude`` is printed and results in an empty list.

    Args:
        file_path: Path-like object with a ``name`` attribute.
        bedrock_client: Client forwarded to ``call_claude``.

    Returns:
        list[dict]: records with ``filename``, ``question`` and ``answer``.
    """
    filename = file_path.name
    print(f"Processing: {filename}")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        qa_pairs = call_claude(bedrock_client, content, filename)
    except Exception as e:
        print(f"Error generating Q&A for {filename}: {e}")
        return []

    # Same program-name derivation call_claude applies when formatting the templates.
    program_name = Path(filename).stem.replace("_", " ")
    tags_by_question = {
        template.format(program_name=program_name): tags
        for template, tags in CUSTOM_QUESTIONS_WITH_METADATA
    }
    expected_count = len(CUSTOM_QUESTIONS_WITH_METADATA)
    if len(qa_pairs) != expected_count:
        print(f"Warning: expected {expected_count} answers for {filename}, got {len(qa_pairs)}")

    # Merge question with metadata tags inline
    records = []
    for position, qa in enumerate(qa_pairs):
        # Pairing purely by position mis-tags every later answer as soon as
        # the model skips one question, so match on the question text first.
        metadata_tags = tags_by_question.get(qa['question'])
        if metadata_tags is None:
            if position >= expected_count:
                continue  # no template left to pair this extra answer with
            metadata_tags = CUSTOM_QUESTIONS_WITH_METADATA[position][1]
        tagged_question = f"{qa['question']} [TAGS: {', '.join(metadata_tags)}]"
        records.append({
            'filename': filename,
            'question': tagged_question,
            'answer': qa['answer']
        })

    return records

# === Batch Processor ===
def process_batch_to_dataframe(source_dir, processed_dir, batch_size=10):
    """Answer the fixed question list for up to ``batch_size`` program files.

    Files are taken from ``source_dir`` in sorted name order. A file is
    moved to ``processed_dir`` only after it produced at least one record;
    a file that produced nothing (for example because the model call
    failed) stays in ``source_dir`` so a later run can retry it.

    Args:
        source_dir: Folder holding the unprocessed ``*.txt`` files.
        processed_dir: Folder that receives successfully processed files
            (created if missing).
        batch_size: Maximum number of files to attempt in this call.

    Returns:
        pandas.DataFrame: one row per answered question; empty when there is
        nothing to process.
    """
    bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2")
    os.makedirs(processed_dir, exist_ok=True)

    # glob order is not guaranteed; sort so every run selects the same files.
    batch_files = sorted(Path(source_dir).glob("*.txt"))[:batch_size]

    if not batch_files:
        print("No unprocessed files found.")
        return pd.DataFrame()

    all_records = []

    for file_path in batch_files:
        records = process_file(file_path, bedrock_client)
        if not records:
            # Moving the file here would mark a failed program as done and lose it.
            print(f"No Q&A generated for {file_path.name}; leaving it in place for a retry.")
            continue
        all_records.extend(records)
        shutil.move(str(file_path), os.path.join(processed_dir, file_path.name))

    return pd.DataFrame(all_records)

# === Run Script ===
# Runs one batch of the fixed-question pipeline and keeps the result in `v2batch5`.
if __name__ == "__main__":
    # Absolute, machine-specific folders: files are read from processed_programs
    # and moved into cleaned_programs by process_batch_to_dataframe.
    source_folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/processed_programs"
    processed_folder = "/Users/ethanschultz/Documents/GSB_570_GEN_AI/Code:Project/cleaned_programs"

    v2batch5 = process_batch_to_dataframe(
        source_dir=source_folder,
        processed_dir=processed_folder,
        batch_size=11
    )

Processing: CSUIP Germany_ University of Konstanz.txt
Processing: Cal Poly Global Program_ Cal Poly in London_ Campuswide (Summer).txt
Processing: Cal Poly Global Program_ Cal Poly in Thailand_ Campuswide (Summer).txt
Processing: Cal Poly Global Program_ Cal Poly in Rome_ Mechanics.txt
Processing: CSUIP United Kingdom_ University of Birmingham.txt
Processing: Cal Poly Exchange with University of Milan, Italy (OCOB).txt
Processing: Cal Poly Exchange with Hochschule Munchen University of Applied Sciences (Munich) - ARCHITECTURE.txt
Processing: Cal Poly Global Program_ Cal Poly in Rome_ Architecture (Fall).txt
Processing: CSUIP Ghana_ University of Ghana.txt
Processing: CSUIP Japan_ Waseda University.txt
Processing: Cal Poly Exchange with Lucerne University of Applied Sciences (SCIENCE & MATH).txt


Due to throttling limitations I had to do the programs in batches.

In [308]:
# Stack the six per-batch frames into a single table with a fresh 0..n-1 index.
batch_frames = [batch1, batch2, batch3, batch4, batch5, batch6]
questions_answers = pd.concat(batch_frames, ignore_index=True)


In [311]:
# Embed the question text (not the answer); get_titan_embedding is called once per row.
questions_answers['embedding'] = questions_answers['question'].apply(get_titan_embedding)


In [313]:
# Remove every literal ".txt" from the filename column (regex=False -> plain substring).
questions_answers['filename'] = questions_answers['filename'].str.replace('.txt', '', regex=False)

In [314]:
# Attach the areas_of_study columns by filename; how='inner' keeps only
# filenames present in both frames.
questions_answers = questions_answers.merge(areas_of_study, on='filename', how='inner')

In [316]:
# Stack both Q&A tables and renumber the index.
# NOTE(review): `questions_answers2` is not created in this section — presumably the
# fixed-question batches (v2batch*) after the same embedding/merge steps; confirm.
questions_answers_final = pd.concat([questions_answers, questions_answers2], axis=0, ignore_index=True)

In [320]:
import json

# Serialise list embeddings to JSON text before writing; non-list values pass through unchanged.
questions_answers_final['embedding'] = questions_answers_final['embedding'].apply(
    lambda value: json.dumps(value) if isinstance(value, list) else value
)


In [322]:
# Write the Q&A table to qa_pairs_with_embeddings.csv in the working directory, without the index.
questions_answers_final.to_csv("qa_pairs_with_embeddings.csv", index=False)