In [140]:
# imports
import csv
import random
import requests
import time
import uuid
import webbrowser
import os
from pathlib import Path
from os import PathLike
from typing import Iterable
from urllib.parse import urlparse

import pandas as pd
import undetected_chromedriver as uc
import yaml
from bs4 import BeautifulSoup
from rich import print
from selenium import webdriver
from tqdm import tqdm


In [141]:
# Notebook configuration: every path, URL and template the later cells read.

# directory where scraped/scored CSV caches are stored (relative to the
# notebook's working directory); created on first run
csv_dir = Path('csv_files')
csv_dir.mkdir(exist_ok=True, parents=True)

# search-results URL used as the scraping entry point; the query string
# filters the listing and selects the sort order
masters_entry_url = r'https://www.mastersportal.com/search/master/msc/physics?de=fulltime&mh=face2face&sorting=tuition'

# Obsidian vault location under the user's home directory; one markdown note
# per university is written to `research_dir`
obsidian_dir = Path(os.path.expanduser('~'), 'obsnotes', 'gradnotes')
research_dir = obsidian_dir / 'options'

# extra (initially empty) columns added for manual research, as
# {column/front-matter key: human readable description};
# commented-out entries are currently disabled
research_tasks = {
    'uni-url': 'University homepage URL',
    'dept-url': 'Department homepage URL',
    'program-url': 'Program specific URL',
    'funding-url': 'Funding information URL',
    'scholarship-url': 'Additional funding sources URL',
    # 'research-url': 'Homepage of department research URL',
    'app-url': 'Link to the application page',
    'app-fee': 'Application fee in USD$',
    # 'tuition-est': 'Estimated cost of attendance in USD$/yr',
    # 'funding-est': 'Estimated annual funding in USD$/yr',
    # 'research-topic': 'Enter the most preferred research topic available',
    # 'veto': 'True/False for automatic veto for any reason',
    # 'completed': 'Mark True when completed analysis',
}

# default sections appended to every generated note, as
# {markdown heading: placeholder text written under it}
document_prompts = {
    'Research Preferences': '*Describe potential research groups and topics*',
    'Funding Practices': '*Describe the costs of living and potential funding sources*',
    'Application Requirements': '*Describe application requirements*',
}


# Functions

## Program Loading

In [142]:
def open_with_cloudflare(
    url: str,
    driver: "webdriver.Remote",
    timeout: float = None,
    poll_interval: float = 1.0,
    ) -> None:
    """
    Navigate ``driver`` to ``url`` and block while a Cloudflare interstitial is shown.

    The interstitial is recognised by the text "Just a moment" in the page
    title; the function polls the title until that text disappears.

    Args:
        url: Address to open.
        driver: Selenium-compatible driver (needs ``get``, ``title`` and
            ``implicitly_wait``).
        timeout: Maximum number of seconds to wait for the interstitial to
            clear. ``None`` (default) waits indefinitely, which allows a
            manual verification of any length.
        poll_interval: Seconds to sleep between title checks.

    Raises:
        TimeoutError: If ``timeout`` is given and the interstitial is still
            displayed once it has elapsed.
    """
    driver.get(url)

    # nothing to wait for when the real page loaded straight away
    if "Just a moment" not in driver.title:
        return

    print("[yellow]Cloudflare verification detected, waiting for it to complete...[/yellow]")

    # NOTE: implicitly_wait() does not pause here; it only configures how long
    # later element lookups on this driver keep retrying. It is kept so that
    # lookups performed after the challenge behave as before.
    driver.implicitly_wait(10)

    # poll the title until the challenge page is gone (or the deadline passes)
    deadline = None if timeout is None else time.monotonic() + timeout
    while "Just a moment" in driver.title:
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Cloudflare verification for '{url}' did not complete within {timeout} seconds"
            )
        time.sleep(poll_interval)


In [143]:
def parse_search_results(soup: "BeautifulSoup") -> list[dict]:
    """
    Extract the program cards from one parsed search-results page.

    Args:
        soup: Parsed page. Exactly one ``<ul class="SearchResultsList">`` is
            expected; if several are present the first one is used, and if
            none is present an empty list is returned (an error is printed in
            both cases).

    Returns:
        One dict per complete card with the keys ``url-sp``, ``degree``,
        ``university``, ``location`` and ``uuid`` (a freshly generated UUID4
        string), in that order. List items missing any of the four scraped
        fields (including the link) are skipped.
    """
    result_lists = soup.find_all('ul', class_='SearchResultsList')
    if len(result_lists) != 1:
        print("[red]Error: Expected exactly one SearchResultList, found {}.[/red]".format(len(result_lists)))
        # nothing to parse: return instead of failing on result_lists[0]
        if not result_lists:
            return []

    def _clean_text(tag):
        # stripped text of a tag, or None when the tag was not found
        return str(tag.text).strip() if tag else None

    results = []
    for li in result_lists[0].find_all('li'):
        # the card link; a missing tag or missing href must stay None so the
        # completeness check below can reject the item
        a_tag = li.find('a', class_='SearchStudyCard')
        href = a_tag.get('href') if a_tag else None

        result_dict = {
            'url-sp': str(href).strip() if href else None,
            'degree': _clean_text(li.find('h2', class_='StudyName')),
            'university': _clean_text(li.find('strong', class_='OrganisationName')),
            'location': _clean_text(li.find('strong', class_='OrganisationLocation')),
        }

        # keep only items where every scraped field was found
        if all(value is not None for value in result_dict.values()):
            result_dict['uuid'] = str(uuid.uuid4())
            results.append(result_dict)

    return results


In [144]:
def get_all_programs(driver: "webdriver.Remote", entry_url: str) -> list[dict]:
    """
    Scrape every page of the search results that start at ``entry_url``.

    The page count is read from the pagination label (text of the form
    "<current> of <total>"), then each page is parsed with
    ``parse_search_results`` and the "Next page" link is clicked, with a
    randomised 5-10 second pause between pages.

    Args:
        driver: Selenium-compatible driver used for navigation.
        entry_url: URL of the first results page.

    Returns:
        The concatenated per-page results.

    Raises:
        ValueError: If the pagination label does not have the expected
            "<current> of <total>" form. The driver is quit before raising.
    """
    open_with_cloudflare(entry_url, driver)

    # first find how many pages exist
    nav_tag = driver.find_element("css selector", "nav.PagNavigationContainer")
    see_more_label = nav_tag.find_element("css selector", "p.SeeMoreLabelVar1")
    page_count_text = see_more_label.text.split(' of ')

    # abort instead of continuing with an unusable label / closed driver
    if len(page_count_text) != 2:
        print(f"[red]Error: Expected two elements in page_count_text, found {len(page_count_text)}.[/red]")
        driver.quit()
        raise ValueError(
            f"Unexpected pagination label {see_more_label.text!r}; cannot determine the page count"
        )

    total_pages = int(page_count_text[1].strip())
    print(f"[green]Total pages found: {total_pages}[/green]")

    results = []
    for page in tqdm(range(1, total_pages + 1), desc="Scraping programs"):
        # parse whatever page the driver currently shows
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results.extend(parse_search_results(soup))

        # advance to the next page, except after the last one
        if page < total_pages:
            nav_tag = driver.find_element("css selector", "nav.PagNavigationContainer")
            next_page = nav_tag.find_element("css selector", 'a[title="Next page"]')
            next_page.click()
            # randomised pause between page loads
            time.sleep(5 + (5*random.random()))

    return results


In [145]:
def attempt_load_all(file: PathLike, entry_url: str = None, delimiter: str = ';') -> pd.DataFrame:
    """
    Load the full programs table from ``file``, scraping it first if needed.

    When ``file`` does not exist, a Chrome session is started, all programs
    reachable from ``entry_url`` are scraped and written to ``file`` as CSV.
    The file is then read back with pandas.

    Args:
        file: Path of the CSV cache.
        entry_url: First search-results page; required only when ``file``
            does not exist yet.
        delimiter: Field delimiter used for both writing and reading.

    Returns:
        The contents of ``file`` as a DataFrame.

    Raises:
        ValueError: If ``file`` is missing and ``entry_url`` is ``None``, or
            if scraping produced no results (no file is written then).
        Exception: Any error raised by ``pd.read_csv`` is printed and
            re-raised unchanged.
    """
    file = Path(file)

    # generate file if it does not exist
    if not file.exists():
        print(f"[yellow]File {file} does not exist, creating a new DataFrame.[/yellow]")

        if entry_url is None:
            raise ValueError("entry_url must be provided if the file does not exist")

        # always close the browser, even when scraping fails part-way
        driver = uc.Chrome(use_subprocess=True, headless=False)
        try:
            results = get_all_programs(driver, entry_url)
        finally:
            driver.quit()

        # refuse to create an empty cache file that later runs would pick up
        if not results:
            raise ValueError(f"No programs were scraped from {entry_url}; nothing written to {file}")

        with file.open('w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(
                csvfile,
                fieldnames=list(results[0].keys()),
                delimiter=delimiter,
            )
            writer.writeheader()
            writer.writerows(results)

    try:
        return pd.read_csv(file, delimiter=delimiter)
    except Exception as e:
        print(f"[red]Error loading CSV: {e}[/red]")
        raise


## Program Scoring

In [146]:
def score_by_string(
    df: pd.DataFrame,
    col: str,
    key: Iterable[str],
    adjust: int,
    exact_match: bool = False,
    only_one: bool = False,
    ) -> pd.DataFrame:
    """
    Adjust the ``score`` column of ``df`` for rows whose ``col`` matches ``key``.

    ``df`` is modified in place (scores updated, rows sorted by descending
    score, index reset) and also returned.

    Args:
        df: Table to score. A ``score`` column is created (all zeros) when
            it is missing.
        col: Column to search. If it is not present, ``df`` is returned
            untouched.
        key: A single string or an iterable of strings to look for.
        adjust: Amount added to ``score`` per matching key (may be negative).
        exact_match: If True a key matches only when the cell equals it;
            otherwise a case-insensitive ``str.contains`` is used (keys are
            therefore interpreted as regular expressions).
        only_one: If True a row is adjusted at most once, no matter how many
            keys match it.

    Returns:
        The same ``df`` object, scored and sorted.

    Raises:
        TypeError: If ``key`` is neither a string nor an iterable.
    """
    # check that col in df
    if col not in df.columns:
        return df

    # accept a single string as a one-element key list
    if isinstance(key, str):
        key = [key]
    elif not isinstance(key, Iterable):
        raise TypeError("key must be a string or an iterable of strings")

    if 'score' not in df.columns:
        df['score'] = 0

    # count, per row, how many keys match
    hits = pd.Series(0, index=df.index)
    for k in key:
        if exact_match:
            matched = df[col] == k
        else:
            matched = df[col].str.contains(k, case=False, na=False)
        hits = hits + matched.astype(int)

    # with only_one, several matching keys count as a single match; this is
    # correct for negative adjustments too, unlike clamping the final score
    if only_one:
        hits = hits.clip(upper=1)

    df['score'] = df['score'] + adjust * hits

    # sort by score descending
    df.sort_values(by='score', ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df


In [147]:
def dedupe_by_col(df: pd.DataFrame, group_col: str, compare_col: str) -> pd.DataFrame:
    """
    Keep, for every distinct value of ``group_col``, the row with the highest ``compare_col``.

    The input frame is not modified. Rows whose ``group_col`` is missing are
    dropped. When several rows of a group share the highest value, the one
    that appears first in ``df`` is kept.

    Args:
        df: Table to de-duplicate; its index does not need to be unique.
        group_col: Column whose values must be unique in the result.
        compare_col: Column used to pick the row to keep and to order the
            result.

    Returns:
        A new DataFrame sorted by ``compare_col`` in descending order with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If either column is not present in ``df``.
    """
    if group_col not in df.columns:
        raise ValueError(f"Column '{group_col}' not found in DataFrame")
    if compare_col not in df.columns:
        raise ValueError(f"Column '{compare_col}' not found in DataFrame")

    # Sorting first and keeping the first row of each group selects by
    # position rather than by index label, so duplicate index labels cannot
    # pull in extra rows, and NaN compare values sort last instead of failing.
    return (
        df.dropna(subset=[group_col])
        .sort_values(by=compare_col, ascending=False, kind='stable')
        .drop_duplicates(subset=group_col, keep='first')
        .reset_index(drop=True)
    )


In [148]:
def first_score_masters(df: pd.DataFrame, reset: bool = True, score_cut: int = 0, dedupe: bool = True) -> pd.DataFrame:
    """
    Apply the first-pass keyword scoring to the programs table.

    Each rule below is handed to ``score_by_string`` in order; afterwards rows
    at or below ``score_cut`` are removed and, optionally, only the best row
    per university is kept.

    Args:
        df: Programs table; the rules read its 'degree' and 'location' columns.
        reset: If True, set every 'score' to 0 before applying the rules.
        score_cut: Keep only rows with 'score' strictly greater than this
            value; ``None`` disables the cut.
        dedupe: If True, keep a single row per 'university' (the one with the
            highest 'score').

    Returns:
        The scored (and possibly filtered / de-duplicated) table.
    """
    if reset:
        df['score'] = 0

    # keyword rules, applied top to bottom; every entry holds the keyword
    # arguments of one score_by_string call
    scoring_rules = (
        # disqualifying: degree keywords
        dict(
            col='degree',
            key=[
                'Appl', 'Engineering', 'Life', 'Photonics', 'Bio', 'Modelling', 'Material',
                'Molecular', 'Atmos', 'Medical', 'Solid', 'Catalys', 'Comp', 'Nano', 'Simulation',
                'Communication', 'Education', 'Climate', 'Neuro', 'Tech', 'Data Science',
                'Architecture', 'Information', 'Computer', 'Sustainability', 'Business', 'Manage',
                'Optics', 'Systems', 'Instrument', 'Science', 'Geo', 'Chemistry', 'Env',
                'Experimental',
            ],
            adjust=-10,
        ),
        # disqualifying: locations
        dict(
            col='location',
            key=[
                'Singapore', 'South Africa', 'Malaysia', 'India', 'Pakistan', 'United Arab Emirates',
                'Hungary', 'Russia', 'Israel', 'Egypt', 'Saudi Arabia', 'Turkey', 'Oman', 'Thailand',
                'United States', 'Australia', 'Kazakhstan', 'Multiple locations', 'China'
            ],
            adjust=-10,
        ),
        # non-preferred locations
        dict(
            col='location',
            key=[
                'Spain', 'Italy', 'United Kingdom', 'Finland', 'Bulgaria', 'Kenya', 'Uganda', 'Taiwan',
                'Poland', 'Namibia', 'Nepal', 'Iceland'
            ],
            adjust=-1,
        ),
        # preferred: degree named exactly 'Physics'
        dict(col='degree', key='Physics', adjust=1, exact_match=True),
        # preferred: theory / mathematics flavoured degrees (counted once)
        dict(col='degree', key=['theor', 'math'], adjust=4, only_one=True),
        # preferred: topic keywords (counted once)
        dict(
            col='degree',
            key=['particle', 'nuclear', 'grav', 'cosmology'],
            adjust=3,
            only_one=True,
        ),
        # preferred locations
        dict(
            col='location',
            key=['Germany', 'Denmark', 'Netherlands', 'Estonia', 'New Zealand', 'Austria'],
            adjust=1,
        ),
    )
    for rule in scoring_rules:
        df = score_by_string(df=df, **rule)

    # drop programs whose score does not exceed the cut
    if score_cut is not None:
        above_cut = df['score'] > score_cut
        df = df[above_cut]

    # several entries of one university collapse to the best scoring one
    if dedupe:
        df = dedupe_by_col(df, 'university', 'score')

    return df


## Research Vault

In [149]:
def name_to_filename(name: str) -> str:
    """
    Turn a display name into a string that is safe to use as a file name.

    Apostrophes are removed; spaces, dots, hyphens, parentheses and the
    characters ``/ \\ : * ? " < > |`` are replaced by an underscore; every run
    of underscores is then collapsed to a single one. Leading or trailing
    underscores are kept.

    Args:
        name: Name to convert (case is left unchanged).

    Returns:
        The sanitised name, without any file extension.
    """
    # one pass over the string: map separators to '_' and drop apostrophes
    replacements = {ch: '_' for ch in ' .-()/\\:*?"<>|'}
    replacements["'"] = None
    cleaned = name.translate(str.maketrans(replacements))

    # a single replace('__', '_') leaves doubles behind for runs of three or
    # more, so repeat until no run is left
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    return cleaned


In [150]:
def read_yaml(file: PathLike) -> dict:
    """
    Read the YAML front matter of a markdown note.

    The front matter must be the first non-blank content of the file and be
    enclosed by two lines that consist of ``---`` only. Occurrences of
    ``---`` inside values or in the note body are ignored.

    Args:
        file: Path of the note.

    Returns:
        Whatever ``yaml.safe_load`` produces for the front matter (a dict for
        the notes written by this notebook).

    Raises:
        FileNotFoundError: If ``file`` does not exist.
        ValueError: If the file is empty, has no properly delimited front
            matter, or the front matter is blank.
    """
    # check that file exists
    if not (file := Path(file)).exists():
        raise FileNotFoundError(f"File '{file}' does not exist")

    content = file.read_text(encoding='utf-8')
    if not content:
        raise ValueError(f"File '{file}' is empty")

    # tolerate a byte-order mark and blank lines before the opening fence
    lines = content.lstrip('\ufeff').splitlines()
    start = next((i for i, line in enumerate(lines) if line.strip()), None)
    if start is None or lines[start].strip() != '---':
        raise ValueError(f"File '{file}' does not contain valid YAML header")

    # the closing fence is the next line made up of '---' only
    end = next((i for i in range(start + 1, len(lines)) if lines[i].strip() == '---'), None)
    if end is None:
        raise ValueError(f"File '{file}' does not contain valid YAML header")

    if not (yaml_str := '\n'.join(lines[start + 1:end]).strip()):
        raise ValueError(f"File '{file}' does not contain valid YAML content")

    return yaml.safe_load(yaml_str)


In [151]:
def create_vault(
    df: pd.DataFrame,
    dir: PathLike,
    doc_prompts: dict[str, str] = None,
    overwrite: bool = False,
    ) -> None:
    """
    Write one markdown note per row of ``df`` into the vault directory.

    Each note starts with a YAML front matter block holding every column of
    the row, followed by one heading + placeholder per ``doc_prompts`` entry.
    The file name is derived from the lower-cased 'university' value.

    Args:
        df: Programs table; needs a 'university' column. Columns named in
            ``research_tasks`` that are missing are added to ``df`` in place
            as empty strings.
        dir: Existing directory that receives the notes.
        doc_prompts: Optional ``{heading: placeholder text}`` sections.
        overwrite: If False (default), notes that already exist are kept.

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
    """
    # check that vault_dir exists
    dir = Path(dir)
    if not dir.exists():
        raise FileNotFoundError(f"Vault directory '{dir}' does not exist")

    # add research task keys to the table
    for key in research_tasks.keys():
        if key not in df.columns:
            df[key] = ''

    def _plain(value):
        # YAML-friendly value: missing -> None, numpy scalar -> Python scalar
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value.item() if hasattr(value, 'item') else value

    for _, row in df.iterrows():
        # a note cannot be named without a university
        university = row['university']
        if not isinstance(university, str) or not university.strip():
            print("[yellow]Skipping row without a university name.[/yellow]")
            continue

        filepath = dir / (name_to_filename(university.strip().lower()) + '.md')
        if filepath.exists() and not overwrite:
            continue

        # Let the YAML library quote/escape values; formatting "key: value"
        # by hand breaks on values containing ': ', '#', quotes, etc.
        front_matter = yaml.safe_dump(
            {str(k): _plain(v) for k, v in row.to_dict().items()},
            sort_keys=False,          # keep the column order
            allow_unicode=True,       # keep non-ASCII names readable
            default_flow_style=False,
            width=float('inf'),       # never wrap long values over several lines
        )

        with filepath.open('w', encoding='utf-8') as f:
            # safe_dump output already ends with a newline
            f.write('---\n' + front_matter + '---')
            if doc_prompts:
                for doc_title, doc_prompt in doc_prompts.items():
                    f.write(f"\n# {doc_title}\n{doc_prompt}\n")


# Masters Programs

In [152]:
# Build the research vault once: everything below is skipped when the vault
# directory already exists (in that case `masters_df` is NOT defined here).
if not research_dir.exists():
    # reuse the scored programs CSV if it is cached, otherwise (re)create it
    if not (scored_masters_file := csv_dir / 'masters_scored.csv').exists():
        # load the full program list (scraped on first use, then cached as CSV)
        masters_df = attempt_load_all(
            file=(all_masters_file := csv_dir / 'masters_all.csv'),
            entry_url=masters_entry_url,
        )

        # score from scratch, keep only score > 0, one program per university
        masters_df = first_score_masters(
            df=masters_df,
            reset=True,
            score_cut=0,
            dedupe=True,
        )

        # cache the scored table; same ';' separator as the full table
        masters_df.to_csv(
            scored_masters_file,
            index=False,
            encoding='utf-8',
            sep=';',
        )

    # always continue from the cached CSV so both paths yield the same frame
    masters_df = pd.read_csv(scored_masters_file, delimiter=';')

    # add the (empty) columns the user fills in while researching
    for key in research_tasks.keys():
        if key not in masters_df.columns:
            masters_df[key] = ''

    # masters_df['completed'] = False
    # masters_df['veto'] = False
    # masters_df['score'] = 0
    # constant 'icon' value written to every note's front matter
    masters_df['icon'] = 'LiUniversity'
    # masters_df['started'] = 'False'

    # create the vault directory (parents=False: its parent `obsidian_dir`
    # must already exist) and write one note per program
    research_dir.mkdir(exist_ok=True, parents=False)
    create_vault(
        df=masters_df,
        dir=research_dir,
        doc_prompts=document_prompts,
    )
# else:
#     # iterate over each file, construct dict from yaml header, turn to df, sort by score
#     dict_list = []
#     for f in research_dir.glob('*.md'):
#         f_dict = read_yaml(f)
#         f_dict['veto'] = bool(f_dict['veto'])
#         f_dict['completed'] = bool(f_dict['completed'])
#         dict_list.append(f_dict)

#     # convert to masters_df and cut
#     masters_df = pd.DataFrame(dict_list)
#     masters_df = masters_df[(masters_df['completed']) & (~masters_df['veto'])]

# sort and re-index
# masters_df.sort_values(by='score', ascending=False, inplace=True)
# masters_df.reset_index(drop=True, inplace=True)


In [153]:
# masters_df[[
#     'degree', 'university', 'location', 'research-topic', 'score',
# ]]
