Beautiful Soup is a Python library used for parsing HTML and XML documents. Here is the general syntax for using Beautiful Soup:

1. Importing the library:
```
from bs4 import BeautifulSoup
```
2. Creating a BeautifulSoup object:
```
soup = BeautifulSoup(html_string, 'html.parser')
```
* `html_string` is the string containing the HTML code.
* `'html.parser'` is Python's built-in parser, used here to parse the HTML code. You can also use the `'lxml'` (HTML) or `'xml'` (XML) parsers, both of which require the `lxml` package to be installed.

3. Finding elements:
```
soup.find('tag_name')  # finds the first occurrence of the tag
soup.find_all('tag_name')  # finds all occurrences of the tag
soup.find('tag_name', {'attribute_name': 'attribute_value'})  # finds the first occurrence of the tag with the specified attribute
soup.find_all('tag_name', {'attribute_name': 'attribute_value'})  # finds all occurrences of the tag with the specified attribute
```
* `tag_name` is the name of the HTML tag you want to find.
* `attribute_name` and `attribute_value` are the name and value of the attribute you want to filter by.

4. Navigating the tree:
```
soup.parent  # returns the parent element
soup.children  # returns an iterator over the direct child elements (wrap it in list() to get a list)
soup.next_sibling  # returns the next sibling element
soup.previous_sibling  # returns the previous sibling element
```
5. Reading and modifying the tree:
```
soup.tag_name.string  # returns the tag's text when it contains a single string (None if it has several children)
soup.tag_name.text  # returns the text content of the tag, including child elements
soup.tag_name.append(new_tag)  # adds a new tag to the end of the tag
soup.tag_name.insert(0, new_tag)  # inserts a new tag at the beginning of the tag
soup.tag_name.replace_with(new_tag)  # replaces the tag with a new tag
```
* `new_tag` is the new tag you want to add or replace.

6. Extracting data:
```
soup.get_text()  # returns the text content of the entire document
soup.find('tag_name').get_text()  # returns the text content of the specified tag
soup.find('tag_name').attrs  # returns a dictionary of the tag's attributes
```
These are the basic syntax and methods for using Beautiful Soup. You can find more information and examples in the official Beautiful Soup documentation.

In [1]:
from bs4 import BeautifulSoup

# Read the saved page; the file handle is only needed for the read itself.
with open('home.html', 'r') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'lxml')

# Each course is looked up as a <div class="card-body"> element.
course_cards = soup.find_all('div', class_='card-body')

for course in course_cards:
    # The price is the last whitespace-separated word of the card's link
    # text; its final character is dropped before printing (presumably a
    # trailing currency sign -- confirm against home.html).
    course_price = course.a.text.split()[-1]
    print(f'The course {course.h5.text} cost ${course_price[:-1]}')

The course Python for beginners cost $20
The course Python Web Development cost $50
The course Python Machine Learning cost $100


For practice, I will scrape data from a job listing website.
The extracted details will include:

- Company name
- Job title
- Posting date 
- Skills
- More details: the URL where the job posting can be found.


This is just a simple extraction process.

In [8]:
from bs4 import BeautifulSoup
import requests
import re
import time
import pandas as pd
import os



# Function to extract skill required from job post
def extract_skills(skills_container):
    """Return the cleaned skill names found in a job post's skills container.

    The container is converted to its string form and the text between every
    ``<span ...>`` / ``</span>`` pair is collected.

    Args:
        skills_container: Object whose ``str()`` is the HTML of the skills
            section (a BeautifulSoup tag in this script, but a plain string
            works too), or ``None``/empty when the post has no such section.

    Returns:
        list[str]: One entry per non-blank span, stripped, with ``' / '``
        collapsed to ``'/'``, the ``**`` markers removed and HTML entities
        decoded.  ``[]`` when the container is missing or empty.
    """
    from html import unescape  # function-scope import keeps the cell self-contained

    if not skills_container:
        return []

    raw_spans = re.findall(r'<span[^>]*>(.*?)</span>', str(skills_container), re.DOTALL)

    skills_list = []
    for raw in raw_spans:
        text = raw.strip()
        if not text:
            continue
        text = text.replace(' / ', '/').replace('**   ', '').replace('  **', '')
        # Decode entities such as "&amp;" properly.  Deleting the substring
        # "amp" left "&;" behind and also mangled ordinary words
        # ("Sampling" -> "Sling").
        skills_list.append(unescape(text))
    return skills_list


# function to extract data from website html
def find_jobs():
    """Fetch the TimesJobs search results page and print each job posting.

    For every ``<li class="clearfix job-bx wht-shd-bx">`` element on the page
    the company name, job title, posting date, skills (joined with ``|``) and
    detail URL are printed.  Postings that lack one of the expected elements
    are skipped.  Request failures are reported on stdout and followed by a
    10-second pause.  The function always returns ``None``.
    """
    search_url = (
        'https://www.timesjobs.com/candidate/job-search.html?searchType=Home_Search&from=submit&asKey=OFF&txtKeywords=&cboPresFuncArea=&cboWorkExp1=0&clusterName=CLUSTER_EXP'
    )

    try:
        # Without a timeout a stalled connection would block the polling
        # loop indefinitely.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()

        # raise_for_status() only rejects 4xx/5xx responses, so any other
        # non-200 status is still reported here.
        if response.status_code != 200:
            print("failed to connect")
            return

        soup = BeautifulSoup(response.text, 'lxml')
        job_boxes = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')

        for job_box in job_boxes:
            title_link = job_box.find('a')
            posted = job_box.find('span', class_='sim-posted')
            company = job_box.find('h3', class_='joblist-comp-name')
            if title_link is None or posted is None or company is None:
                # One incomplete posting must not abort the rest of the page.
                continue

            job_title = title_link.text.strip()
            posting_date = posted.text.strip()
            company_name = company.text.strip()
            more_details = title_link.get('href', '')

            # Extract skills
            skills_container = job_box.find('div', class_='more-skills-sections')
            skills = extract_skills(skills_container)

            print(f"Company: {company_name}")
            print(f"Job Title: {job_title}")
            print(f"Posting Date: {posting_date}")
            print(f"Skills: {'|'.join(skills)}")
            print(f"More Details: {more_details}\n")

    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Timeouts are grouped with connection failures: both are transient.
        print("Connection error. Retrying in 10 seconds...")
        time.sleep(10)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}. Retrying in 10 seconds...")
        time.sleep(10)
    except Exception as e:
        # Last-resort guard so the caller's polling loop keeps running.
        print(f"An error occurred: {e}")

# Run find_jobs repeatedly with time intervals
if __name__ == '__main__':
    time_wait = 10  # seconds to pause between consecutive scrapes
    # Poll indefinitely; the cell is stopped by interrupting the kernel.
    while True:
        find_jobs()
        print(f'Waiting {time_wait} seconds...')
        time.sleep(time_wait)


Company: Dinab Recruitment Solutions
Job Title: Apply for Airport jobs - HR Executive
Posting Date: Posted today
Skills: Airport Management|Ground Staff|Cabin Crew|Air Hostess|Cargo Operations|Cargo Handling|Logistics Coordinator|Logistics Manager|Air Ticketing
More Details: https://www.timesjobs.com/job-detail/apply-for-airport-jobs-hr-executive-airports-airlines-bengaluru-bangalore-hyderabad-secunderabad-mumbai-cochin-kochi-ernakulam-thiruvananthapuram-0-to-3-yrs-jobid-TYPzI__SLASH__gOTw9zpSvf__PLUS__uAgZw==&source=srp

Company: Dinab Recruitment Solutions
Job Title: Apply for Airport jobs - Logistics Manager
Posting Date: Posted today
Skills: Airport Management|Ground Staff|Cabin Crew|Air Hostess|Cargo Operations|Cargo Handling|Logistics Coordinator|Logistics Manager|Air Ticketing
More Details: https://www.timesjobs.com/job-detail/apply-for-airport-jobs-logistics-manager-airports-airlines-bengaluru-bangalore-hyderabad-secunderabad-mumbai-cochin-kochi-ernakulam-thiruvananthapuram-0-t

KeyboardInterrupt: 

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import time
import pandas as pd
import os

# Ask for the job keyword once, up front; find_jobs() reads the module-level
# job_keyword on every polling pass.
print('Input Job keyword')
# Strip surrounding whitespace so a stray space cannot prevent title matches.
job_keyword = input('>> ').strip()
print(f'Searching for keyword: {job_keyword}')

# Function to extract skills required from job post
def extract_skills(skills_container):
    """Return the skill names listed in a job post's skills container.

    Args:
        skills_container: Element exposing ``find_all('span')`` (a
            BeautifulSoup tag in this script), or ``None`` when the post has
            no skills section.

    Returns:
        list[str]: One cleaned entry per non-empty ``<span>``, with ``' / '``
        collapsed to ``'/'``; ``[]`` when the container is missing.
    """
    from html import unescape  # function-scope import keeps the cell self-contained

    if not skills_container:
        return []

    skills_list = []
    for span in skills_container.find_all('span'):
        text = span.get_text(strip=True).replace(' / ', '/')
        if not text:
            # Empty spans would only add blank entries to the joined string.
            continue
        # Decode any leftover entities instead of deleting the substring
        # "amp", which mangled ordinary names ("Sampling" -> "Sling").
        skills_list.append(unescape(text))
    return skills_list

# Function to extract data from website HTML
def find_jobs():
    """Scrape postings whose title contains ``job_keyword`` into job_data.csv.

    The TimesJobs search page is downloaded and every
    ``<li class="clearfix job-bx wht-shd-bx">`` element whose title contains
    the module-level ``job_keyword`` (case-insensitively) is collected.  The
    rows replace the contents of ``job_data.csv`` unless they are identical to
    what the file already holds.  When nothing matches, the file is left
    untouched.  Request failures are reported on stdout and followed by a
    10-second pause.  The function always returns ``None``.
    """
    csv_path = 'job_data.csv'
    search_url = (
        'https://www.timesjobs.com/candidate/job-search.html?searchType=Home_Search&from=submit&asKey=OFF&txtKeywords=&cboPresFuncArea=&cboWorkExp1=0&clusterName=CLUSTER_EXP'
    )

    try:
        # Without a timeout a stalled connection would block the polling
        # loop indefinitely.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()

        # Only a plain 200 response is processed.
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.text, 'lxml')
        job_boxes = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')
        keyword = job_keyword.casefold()

        # Rows scraped during this pass
        current_job_data = []
        for job_box in job_boxes:
            title_link = job_box.find('a')
            if title_link is None:
                continue
            job_title = title_link.text.strip()
            if keyword not in job_title.casefold():
                continue

            posted = job_box.find('span', class_='sim-posted')
            company = job_box.find('h3', class_='joblist-comp-name')
            if posted is None or company is None:
                # One incomplete posting must not abort the rest of the page.
                continue

            skills = extract_skills(job_box.find('div', class_='more-skills-sections'))
            current_job_data.append({
                'company_name': company.text.strip(),
                'job_title': job_title,
                'posting_duration': posted.text.strip(),
                'skills': '|'.join(skills),
                'more_details': title_link.get('href', ''),
            })

        if not current_job_data:
            # An empty frame is written without a header row; pandas cannot
            # read such a file back, so every later pass would fail.  It
            # would also throw away previously saved results.
            print(f"No jobs matching '{job_keyword}' found; CSV left unchanged.")
            return

        current_data_df = pd.DataFrame(current_job_data)

        if os.path.isfile(csv_path):
            try:
                # Read every column as text and keep empty strings, so that
                # unchanged rows normally compare equal to the fresh frame
                # (type inference would turn '' into NaN).
                previous_data = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
            except pd.errors.EmptyDataError:
                # Empty or header-less file: nothing to compare against.
                previous_data = None

            if previous_data is not None and previous_data.equals(current_data_df):
                print("No new data found; skipping CSV update.")
                return

        # Write new data to CSV if different
        current_data_df.to_csv(csv_path, index=False)
        print("Data updated in CSV file.")

    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Timeouts are grouped with connection failures: both are transient.
        print("Connection error. Retrying in 10 seconds...")
        time.sleep(10)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}. Retrying in 10 seconds...")
        time.sleep(10)
    except Exception as e:
        # Last-resort guard so the caller's polling loop keeps running.
        print(f"An error occurred: {e}")

# Run find_jobs repeatedly with time intervals
if __name__ == '__main__':
    time_wait = 10  # Time to wait before the next search
    # Poll indefinitely; the cell is stopped by interrupting the kernel.
    while True:
        find_jobs()
        print(f'Waiting {time_wait} seconds...')
        time.sleep(time_wait)


In [None]:
from bs4 import BeautifulSoup
import requests
import re
import time
import pandas as pd
import os

# Ask for the job keyword once, up front; find_jobs() reads the module-level
# job_keyword on every polling pass.
print('Input Job keyword')
# Strip surrounding whitespace so a stray space cannot prevent title matches.
job_keyword = input('>> ').strip()
print(f'Searching for keyword: {job_keyword}')

# Function to extract skills required from job post
def extract_skills(skills_container):
    """Collect the skill names from a job post's skills container.

    Args:
        skills_container: Element exposing ``find_all('span')`` (a
            BeautifulSoup tag in this script), or ``None`` when the post has
            no skills section.

    Returns:
        list[str]: The text of each non-empty ``<span>``, with ``' / '``
        collapsed to ``'/'`` and HTML entities decoded; ``[]`` when the
        container is missing.
    """
    from html import unescape  # function-scope import keeps the cell self-contained

    if not skills_container:
        return []

    span_texts = (
        span.get_text(strip=True).replace(' / ', '/')
        for span in skills_container.find_all('span')
    )
    # Decode leftover entities rather than deleting the substring "amp",
    # which mangled ordinary names ("Sampling" -> "Sling").  Blank spans are
    # dropped so they do not add empty entries.
    return [unescape(text) for text in span_texts if text]

# Function to extract data from website HTML
def find_jobs():
    """Scrape postings whose title contains ``job_keyword`` into job_datas.csv.

    The TimesJobs search page is downloaded and every
    ``<li class="clearfix job-bx wht-shd-bx">`` element whose title contains
    the module-level ``job_keyword`` (case-insensitively) is collected.  The
    rows replace the contents of ``job_datas.csv`` unless they are identical
    to what the file already holds.  When nothing matches, the file is left
    untouched.  Request failures are reported on stdout and followed by a
    10-second pause.  The function always returns ``None``.
    """
    csv_path = 'job_datas.csv'
    search_url = (
        'https://www.timesjobs.com/candidate/job-search.html?searchType=Home_Search&from=submit&asKey=OFF&txtKeywords=&cboPresFuncArea=&cboWorkExp1=0&clusterName=CLUSTER_EXP'
    )

    try:
        # Without a timeout a stalled connection would block the polling
        # loop indefinitely.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()

        # Only a plain 200 response is processed.
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.text, 'lxml')
        job_boxes = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')
        keyword = job_keyword.casefold()

        # Rows scraped during this pass
        current_job_data = []
        for job_box in job_boxes:
            title_link = job_box.find('a')
            if title_link is None:
                continue
            job_title = title_link.text.strip()
            if keyword not in job_title.casefold():
                continue

            posted = job_box.find('span', class_='sim-posted')
            company = job_box.find('h3', class_='joblist-comp-name')
            if posted is None or company is None:
                # One incomplete posting must not abort the rest of the page.
                continue

            skills = extract_skills(job_box.find('div', class_='more-skills-sections'))
            current_job_data.append({
                'company_name': company.text.strip(),
                'job_title': job_title,
                'posting_duration': posted.text.strip(),
                'skills': '|'.join(skills),
                'more_details': title_link.get('href', ''),
            })

        if not current_job_data:
            # An empty frame is written without a header row; pandas cannot
            # read such a file back, so every later pass would fail.  It
            # would also throw away previously saved results.
            print(f"No jobs matching '{job_keyword}' found; CSV left unchanged.")
            return

        current_data_df = pd.DataFrame(current_job_data)

        previous_data = None
        if os.path.isfile(csv_path) and os.stat(csv_path).st_size > 0:
            try:
                # Read every column as text and keep empty strings, so that
                # unchanged rows normally compare equal to the fresh frame
                # (type inference would turn '' into NaN).
                previous_data = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
            except pd.errors.EmptyDataError:
                # A non-empty file can still lack a header row (e.g. a lone
                # newline); treat it like a missing file.
                previous_data = None

        if previous_data is None:
            print("No existing data found, creating a new CSV file.")
        elif previous_data.equals(current_data_df):
            print("No new data found; skipping CSV update.")
            return

        # Write new data to CSV if different
        current_data_df.to_csv(csv_path, index=False)
        print("Data updated in CSV file.")

    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Timeouts are grouped with connection failures: both are transient.
        print("Connection error. Retrying in 10 seconds...")
        time.sleep(10)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}. Retrying in 10 seconds...")
        time.sleep(10)
    except Exception as e:
        # Last-resort guard so the caller's polling loop keeps running.
        print(f"An error occurred: {e}")

# Run find_jobs repeatedly with time intervals
if __name__ == '__main__':
    time_wait = 10  # Time to wait before the next search
    # Poll indefinitely; the cell is stopped by interrupting the kernel.
    while True:
        find_jobs()
        print(f'Waiting {time_wait} seconds...')
        time.sleep(time_wait)


In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import time
from datetime import datetime
from typing import List, Dict, Optional
import logging
from urllib.parse import quote

# Configure the root logger: INFO and above, timestamped, written both to
# job_scraper.log and to the console stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler('job_scraper.log'), logging.StreamHandler()],
)

class TimesJobsScraper:
    """Scrape TimesJobs search results for a keyword and accumulate them in a CSV.

    ``output_file`` is the CSV path and ``session`` is a ``requests.Session``
    that sends a browser-like User-Agent header; both are set in ``__init__``.
    """

    BASE_URL = 'https://www.timesjobs.com/candidate/job-search.html'

    def __init__(self, output_file: str = 'job_data.csv'):
        """Store the output path and open an HTTP session.

        Args:
            output_file: Path of the CSV file that collects the scraped jobs.
        """
        self.output_file = output_file
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def _get_search_url(self, keyword: str) -> str:
        """Return the search URL for *keyword* with percent-encoded parameters."""
        params = {
            'searchType': 'personalizedSearch',
            'from': 'submit',
            'txtKeywords': keyword,
            'cboPresFuncArea': '',
        }
        query_string = '&'.join(f"{k}={quote(str(v))}" for k, v in params.items())
        return f"{self.BASE_URL}?{query_string}"

    def _extract_skills(self, skills_container) -> List[str]:
        """Return the text of every ``<span>`` inside *skills_container*.

        A missing container yields ``[]``.  ``' / '`` is collapsed to ``'/'``.
        """
        if not skills_container:
            return []
        skills = skills_container.find_all('span')
        # NOTE(review): the 'amp;' removal only affects text that literally
        # contains "amp;" -- presumably leftover double-escaped markup; confirm.
        return [skill.get_text(strip=True).replace(' / ', '/').replace('amp;', '')
                for skill in skills]

    def _parse_job_box(self, job_box, keyword: str) -> Optional[Dict]:
        """Turn one job ``<li>`` element into a row dict.

        Returns ``None`` when the title does not contain *keyword*
        (case-insensitively) or when an expected element or attribute is
        missing from the posting.
        """
        try:
            title_link = job_box.find('a')
            job_title = title_link.text.strip()
            if keyword.casefold() not in job_title.casefold():
                return None

            return {
                'company_name': job_box.find('h3', class_='joblist-comp-name').text.strip(),
                'job_title': job_title,
                'posting_duration': job_box.find('span', class_='sim-posted').text.strip(),
                'skills': '|'.join(self._extract_skills(job_box.find('div', class_='more-skills-sections'))),
                'more_details': title_link['href'],
                'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
        except (AttributeError, KeyError) as e:
            # AttributeError: an element was not found (``None.text``).
            # KeyError: the link has no ``href``.  Either way only this
            # posting is skipped, not the whole page.
            logging.warning(f"Error parsing job box: {e}")
            return None

    def _update_csv(self, new_data: List[Dict]) -> bool:
        """Merge *new_data* into the CSV file, keeping one row per posting.

        Rows are identified by company name, job title and detail URL; when a
        posting is seen again its stored row is replaced by the fresh one.

        Returns:
            ``True`` only when at least one posting that was not already in
            the file has been written.  ``False`` when *new_data* is empty,
            when every posting was already known, or when the update failed.
        """
        if not new_data:
            logging.info("No new data to update")
            return False

        # Compare only relevant columns for duplicates
        comparison_columns = ['company_name', 'job_title', 'more_details']
        new_df = pd.DataFrame(new_data)

        try:
            existing_df = None
            if os.path.exists(self.output_file):
                try:
                    # Read every column as text so stored keys compare equal
                    # to freshly scraped strings (no type inference, no NaN
                    # for empty cells).
                    existing_df = pd.read_csv(self.output_file, dtype=str, keep_default_na=False)
                except pd.errors.EmptyDataError:
                    # Empty or header-less file: start afresh instead of
                    # failing on every subsequent update.
                    existing_df = None

            if existing_df is None:
                merged_df = new_df
                added = len(new_df)
            else:
                merged_df = pd.concat([existing_df, new_df]).drop_duplicates(
                    subset=comparison_columns,
                    keep='last'
                )
                # reindex tolerates a file that lacks some key columns.
                known = existing_df.reindex(columns=comparison_columns).drop_duplicates()
                added = len(merged_df) - len(known)

            merged_df.to_csv(self.output_file, index=False)
            return added > 0

        except Exception as e:
            logging.error(f"Error updating CSV: {e}")
            return False

    def scrape_jobs(self, keyword: str) -> None:
        """Download the search page for *keyword* and store the matching jobs.

        Request errors and unexpected exceptions are logged, not raised.
        """
        url = self._get_search_url(keyword)

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'lxml')
            job_boxes = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')

            # _parse_job_box returns None for postings to skip.
            parsed = (self._parse_job_box(job_box, keyword) for job_box in job_boxes)
            current_jobs = [job_data for job_data in parsed if job_data]

            if current_jobs:
                if self._update_csv(current_jobs):
                    logging.info(f"Found {len(current_jobs)} new jobs matching '{keyword}'")
                else:
                    logging.info("No new unique jobs found")
            else:
                logging.info(f"No jobs found matching '{keyword}'")

        except requests.exceptions.RequestException as e:
            logging.error(f"Request failed: {e}")
        except Exception as e:
            logging.error(f"Unexpected error: {e}")

def main():
    """Prompt for a keyword, then scrape matching jobs once a minute until stopped.

    An empty keyword is logged as an error and ends the run.  The loop ends
    on Ctrl-C or on any exception raised inside it.
    """
    scraper = TimesJobsScraper()
    keyword = input('Enter job keyword to search: ').strip()

    if keyword == '':
        logging.error("Keyword cannot be empty")
        return

    logging.info(f"Starting job search for keyword: {keyword}")

    wait_time = 60  # 1 minute between searches
    keep_running = True
    while keep_running:
        try:
            scraper.scrape_jobs(keyword)
            logging.info(f"Waiting {wait_time} seconds before next search...")
            time.sleep(wait_time)
        except KeyboardInterrupt:
            logging.info("Scraping stopped by user")
            keep_running = False
        except Exception as e:
            logging.error(f"Fatal error: {e}")
            keep_running = False


if __name__ == '__main__':
    main()

2024-11-05 01:31:42,302 - INFO - Starting job search for keyword: analyst
2024-11-05 01:31:48,249 - INFO - Found 25 new jobs matching 'analyst'
2024-11-05 01:31:48,251 - INFO - Waiting 60 seconds before next search...
