In [1]:
from bs4 import BeautifulSoup

def get_individual_job(soup):
    """
    Extract the key fields from a single job-listing table.

    The listing is looked up as a ``<table role="presentation">`` that is a
    direct child of ``soup`` and has at least five direct ``<tr>`` children,
    read positionally:

      - row 0: job title and link, taken from ``<h2><a href=...>``
      - row 1: nested table whose first ``<td>`` holds the company name
      - row 2: location, optionally followed by "• <extra info>"
      - row 3: not read (the sample markup has a description snippet here)
      - row 4: posting-age text, e.g. "1 day ago"

    Rows are matched as *direct* children of the table, so markup in which the
    rows are wrapped in a ``<tbody>`` element will not be recognised.

    Args:
        soup: A BeautifulSoup object (or Tag) whose direct children include
            the listing table.

    Returns:
        dict: Keys ``job_title``, ``link``, ``company_name``, ``location`` and
        ``days_posted``. Every value is a string; any field that cannot be
        located is left as "". If no suitable table is found, all fields
        are "".
    """
    # Rows 0..4 are indexed below, so a usable table needs at least 5 rows.
    min_rows = 5

    job_data = {
        "job_title": "",
        "link": "",
        "company_name": "",
        "location": "",
        "days_posted": ""
    }

    # 1) Candidate tables: direct children only, so the nested company/rating
    #    table inside row 1 is never mistaken for a listing.
    candidate_tables = soup.find_all("table", {"role": "presentation"}, recursive=False)

    # 2) Take the first candidate that really has enough rows. Selecting on
    #    the same threshold that the indexing needs means a shorter table
    #    appearing first can no longer hide a valid listing that follows it.
    all_rows = None
    for table in candidate_tables:
        rows = table.find_all("tr", recursive=False)
        if len(rows) >= min_rows:
            all_rows = rows
            break

    if all_rows is None:
        return job_data

    # ----------------------------
    # Row 0: Job Title & Link
    # ----------------------------
    h2_tag = all_rows[0].find("h2")
    a_tag = h2_tag.find("a") if h2_tag is not None else None
    if a_tag is not None:
        job_data["job_title"] = a_tag.get_text(strip=True)
        job_data["link"] = a_tag.get("href", "")

    # ----------------------------
    # Row 1: Company Name (first cell of the nested table)
    # ----------------------------
    nested_table = all_rows[1].find("table")
    if nested_table is not None:
        company_cells = nested_table.find_all("td", recursive=True)
        if company_cells:
            job_data["company_name"] = company_cells[0].get_text(strip=True)

    # ----------------------------
    # Row 2: Location
    # ----------------------------
    location_text = all_rows[2].get_text(strip=True)
    # Keep only the part before the first bullet,
    # e.g. "Toronto, ON • Hybrid remote" -> "Toronto, ON".
    if "•" in location_text:
        location_text = location_text.split("•", 1)[0].strip()
    job_data["location"] = location_text

    # ----------------------------
    # Row 4: Days Posted (row 3 is skipped)
    # ----------------------------
    job_data["days_posted"] = all_rows[4].get_text(strip=True)

    return job_data


# ----------------------------
# SAMPLE USAGE
# Runs get_individual_job() against one hard-coded job-listing table and
# prints the resulting dict.
# ----------------------------
if __name__ == "__main__":

    # Fixture: a single listing table with five top-level rows, in order:
    # title/link, company (nested table with rating), location, description
    # snippet, and posting age.
    sample_html = """
    <table align="left" border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
    <tr>
      <td align="left" valign="top">
        <h2 style="color:#2d2d2d;font-family:'Indeed Sans', 'Noto Sans', Helvetica, Arial, sans-serif;font-size:16px;font-weight:bold;line-height:24px;Margin:0;padding:0">
          <a href="https://ca.indeed.com/rc/clk/dl?jk=fc622f9c1dbca63e..."
             style="color:#2d2d2d;font-weight:bold;text-decoration:underline">
            Senior Risk Analyst, Regulatory Reporting &amp; Analytics, Capital Market Risk Management
          </a>
        </h2>
      </td>
    </tr>
    <tr>
      <td align="left" valign="top">
        <table border="0" cellpadding="0" cellspacing="0" role="presentation">
          <tr>
            <td style="padding:0 12px 0 0;color:#2d2d2d;font-size:14px;line-height:21px">
              CIBC
            </td>
            <td style="color:#2d2d2d;font-size:16px;line-height:24px">
              <strong>3.7</strong>
            </td>
            <td style="padding:2px 0 0 2px">
              <img alt="3.7/5 rating" src="https://prod.statics.indeed.com/eml/assets/images/icons/Star_neutral_whitebg.png" width="20"/>
            </td>
          </tr>
        </table>
      </td>
    </tr>
    <tr>
      <td align="left" style="color:#2d2d2d;font-size:14px;line-height:21px" valign="top">
        Toronto, ON • Hybrid remote
      </td>
    </tr>
    <tr>
      <td align="left" style="padding:0;color:#767676;font-size:14px;line-height:21px" valign="top">
        You also have some experience in the areas of data warehousing...
      </td>
    </tr>
    <tr>
      <td align="left" style="padding:8px 0 0;color:#767676;font-size:12px;line-height:18px" valign="top">
        1 day ago
      </td>
    </tr>
    </table>
    """

    # Parse the fixture with the "html.parser" backend and extract the fields.
    soup = BeautifulSoup(sample_html, "html.parser")
    info = get_individual_job(soup)
    print(info)
    # Expected output (the "&amp;" entity is decoded to "&" and the
    # "• Hybrid remote" suffix is dropped from the location):
    # {
    #   'job_title': 'Senior Risk Analyst, Regulatory Reporting & Analytics, Capital Market Risk Management',
    #   'link': 'https://ca.indeed.com/rc/clk/dl?jk=fc622f9c1dbca63e...',
    #   'company_name': 'CIBC',
    #   'location': 'Toronto, ON',
    #   'days_posted': '1 day ago'
    # }


{'job_title': 'Senior Risk Analyst, Regulatory Reporting & Analytics, Capital Market Risk Management', 'link': 'https://ca.indeed.com/rc/clk/dl?jk=fc622f9c1dbca63e...', 'company_name': 'CIBC', 'location': 'Toronto, ON', 'days_posted': '1 day ago'}


In [2]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [9]:
import os

def list_files(startpath):
    """
    Print the directory tree rooted at ``startpath`` as an indented listing.

    Each directory is printed as ``name/`` and each file beneath it on its own
    line, indented four spaces per level of depth below ``startpath``.

    Depth is derived from the path of each directory *relative to*
    ``startpath`` rather than by deleting the ``startpath`` text from the
    path, so the result is correct when ``startpath`` has a trailing
    separator or when its text happens to recur deeper in the tree.

    Args:
        startpath: Path of the directory to list.

    Returns:
        None. The listing is written to standard output.
    """
    for root, _dirs, files in os.walk(startpath):
        rel_root = os.path.relpath(root, startpath)
        # relpath yields "." for the start directory itself (depth 0);
        # "a" is depth 1, "a/b" is depth 2, and so on.
        level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath strips any trailing separator so basename is not empty.
        dir_name = os.path.basename(os.path.normpath(root))
        print('{}{}/'.format(indent, dir_name))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))

# list_files(".")

'cat' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
docker run -d \
    --name my-python-app \
    -v /path/to/project/logs:/app/logs \
    -v /path/to/project/data:/app/data \
    my-python-app
