In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Sample listing page used to explore the HTML structure before scraping every page.
sample_page = 27
url = f"https://www.dataanalyst.com/data-analyst-jobs?c834b559_page={sample_page}"

In [14]:
# Download the sample listing page and parse it into a BeautifulSoup tree.
# An explicit timeout is required: without one, requests waits indefinitely
# on a stalled connection and the cell would never finish.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
soup = BeautifulSoup(response.text, 'html.parser')
type(soup)

In [9]:
# Locate every <div class="jobs-wrapper"> container in the parsed page.
job_elements = soup.find_all('div', attrs={'class': 'jobs-wrapper'})
len(job_elements)  # should be 1 because the page has just one listings table

1

In [10]:
# Sanity check: inspect what kind of object find_all() returned before indexing into it.
type(job_elements)

In [16]:
# Each job posting is an <a> card inside the single listings container.
listings_container = job_elements[0]
jobs = listings_container.find_all(
    'a',
    attrs={'class': 'card job cms-jobs-counter w-inline-block'},
)
len(jobs)  # expected to be 15 on a full page (one card per job)

15

In [17]:
# Sanity check: inspect the type of the collection of job cards before looping over it.
type(jobs)

In [20]:
# Dump the raw HTML of every job card so the markup of each field can be inspected.
for card_html in map(str, jobs):
    print(card_html)

<a class="card job cms-jobs-counter w-inline-block" href="/job/data-analyst-pricing-and-promotions"><div class="html-embed-5 w-embed"><input class="nocodelytics-list-item" type="hidden" value="data-analyst-pricing-and-promotions"/></div><div class="html-embed-4 w-embed"><input class="jetboost-list-item" type="hidden" value="data-analyst-pricing-and-promotions"/></div><div class="split-content card-job-left"><div class="image-wrapper card-job-mobile-image"><img alt=" Data Analyst, Pricing and Promotions at etrailer" class="image card-job" height="75" loading="lazy" src="https://cdn.prod.website-files.com/6357ed6d70d64dc6b11de190/65c49de240c072c471cb4eb5_BCwjpvAF9kKaEDC4VOd9-Mm1qBNoPr1WkCkbYAmXmzc.jpeg" width="75"/></div><div class="card-job-title-wrapper"><div class="card-link-wrapper-mobile-standard"><div class="card-link-home">etrailer</div></div><h3 class="title h6-size card-job"> Data Analyst, Pricing and Promotions</h3><div class="card-link-wrapper"><div class="card-link-home">etra

#### The job cards are now isolated, so the next step is to ask Gemini to write code that extracts every field for each `job` in `jobs`.

# Task
Scrape all data analyst job listings from the website "https://www.dataanalyst.com/data-analyst-jobs", iterating through all available pages, and save the collected information to a CSV file named 'job_listings.csv'.

## Iterate through pages

### Subtask:
Modify the web scraping script to iterate through the job listing pages, dynamically changing the 'page' parameter in the URL. This will likely involve a `while` loop that continues as long as job listings are found on a page.


**Reasoning**:
The subtask requires modifying the web scraping script to iterate through multiple pages, collect all job listings, and handle potential errors. This code block will implement a while loop to achieve this, dynamically constructing URLs and extracting data until no more job listings are found.



In [24]:
import requests
from bs4 import BeautifulSoup

import time  # stdlib; used only to pause between consecutive page requests

# --- Scraper configuration ---------------------------------------------------
BASE_URL = "https://www.dataanalyst.com/data-analyst-jobs"
PAGE_PARAM = "c834b559_page"     # pagination query parameter used in the listing URLs
REQUEST_TIMEOUT_SECONDS = 30     # requests never times out unless a timeout is passed
REQUEST_DELAY_SECONDS = 1.0      # polite pause between page requests
MAX_PAGES = 500                  # safety cap so the loop always terminates


def _text_or_na(tag):
    """Return the whitespace-stripped text of ``tag``, or 'N/A' if ``tag`` is None."""
    return tag.get_text(strip=True) if tag is not None else 'N/A'


def _category_text(job, wrapper_class):
    """Return the text of the 'card-job-category-text' div inside the div
    with class ``wrapper_class`` in ``job``; 'N/A' if either div is missing."""
    wrapper = job.find('div', class_=wrapper_class)
    value_tag = (
        wrapper.find('div', class_='card-job-category-text')
        if wrapper is not None else None
    )
    return _text_or_na(value_tag)


def _country_text(job):
    """Return the country of a job card, or 'N/A' if it cannot be found.

    The country has no dedicated wrapper class in this scraper: it is read
    from the 'card-job-category-text' sibling that follows the div whose
    text is exactly 'Country'.
    """
    label = job.find('div', string='Country')
    value_tag = (
        label.find_next_sibling('div', class_='card-job-category-text')
        if label is not None else None
    )
    return _text_or_na(value_tag)


def parse_job_card(job):
    """Convert one job-card ``<a>`` tag into a flat dict of listing fields.

    Every field falls back to 'N/A' when its element is absent, so the
    returned dict always has the same nine keys in the same order.
    """
    return {
        # CSS selectors match the classes in any order; a multi-class
        # ``class_='a b c'`` string only matches that exact attribute value.
        'Job Title': _text_or_na(job.select_one('h3.title.h6-size.card-job')),
        'Company': _text_or_na(job.find('div', class_='card-link-home')),
        'Location': _category_text(job, 'card-job-category-wrapper-location'),
        'Country': _country_text(job),
        'Workplace': _category_text(job, 'card-job-category-wrapper-workplace'),
        'Experience': _category_text(job, 'card-job-category-wrapper-experience'),
        'Industry': _category_text(job, 'card-job-category-wrapper-industry'),
        'Salary': _category_text(job, 'card-job-category-wrapper-salary'),
        'Published Date': _category_text(job, 'card-job-category-wrapper-published'),
    }


# --- Page loop ---------------------------------------------------------------
# Walk the listing pages in order until a page has no job cards. Results are
# accumulated in a module-level list so that whatever was collected survives
# an early stop.
all_job_listings = []
scrape_interrupted = False  # set when a request error ends the loop early

for page_number in range(1, MAX_PAGES + 1):
    url = f"{BASE_URL}?{PAGE_PARAM}={page_number}"
    print(f"Scraping page: {page_number}, URL: {url}")

    # Only the network call is guarded. Parsing errors are deliberately not
    # swallowed: a traceback is more useful than a silently truncated dataset.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.HTTPError as errh:
        print(f"Http Error: {errh}")
        scrape_interrupted = True
        break
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
        scrape_interrupted = True
        break
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
        scrape_interrupted = True
        break
    except requests.exceptions.RequestException as err:
        print(f"Something went wrong: {err}")
        scrape_interrupted = True
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main container for all job listings
    jobs_wrapper = soup.find('div', class_='jobs-wrapper')
    if jobs_wrapper is None:
        print("Main job listings container ('jobs-wrapper') not found on the page. Breaking loop.")
        break

    # Find all individual job cards (<a> tags) within the wrapper
    job_cards = jobs_wrapper.select('a.card.job.cms-jobs-counter.w-inline-block')
    if not job_cards:
        print(f"No job listings found on page {page_number}. Breaking loop.")
        break  # No more job listings, break the loop

    all_job_listings.extend(parse_job_card(job) for job in job_cards)

    time.sleep(REQUEST_DELAY_SECONDS)
else:
    # The for-loop ran out of pages without ever seeing an empty page.
    print(f"Reached the safety cap of {MAX_PAGES} pages without finding an empty page. Stopping.")

# After the loop, print the results
print(f"\nTotal job listings collected: {len(all_job_listings)}")
if scrape_interrupted:
    print("Warning: scraping stopped early because of a request error; the results may be incomplete.")
if all_job_listings:
    print("First 5 collected job listings:")
    for i, listing in enumerate(all_job_listings[:5]):
        print(f"  {i+1}. {listing}")
else:
    print("No job listings were collected.")

Scraping page: 1, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=1
Scraping page: 2, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=2
Scraping page: 3, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=3
Scraping page: 4, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=4
Scraping page: 5, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=5
Scraping page: 6, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=6
Scraping page: 7, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=7
Scraping page: 8, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=8
Scraping page: 9, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=9
Scraping page: 10, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=10
Scraping page: 11, URL: https://www.dataanalyst.com/data-analyst-jobs?c834b559_page=11
Scraping page: 12, URL: https://www.dataanalyst.com/data-anal

**Reasoning**:
The previous step successfully scraped all job listings and stored them in a list of dictionaries named `all_job_listings`. The next step is to convert this list into a pandas DataFrame and then save it to a CSV file as specified in the main task.



In [26]:
import pandas as pd

# Destination file for the exported dataset.
csv_filename = 'job_listings.csv'

# One row per collected listing dict; the dict keys become the columns.
df_job_listings = pd.DataFrame(data=all_job_listings)

# Write the table without the positional index so the file holds only job fields.
df_job_listings.to_csv(path_or_buf=csv_filename, index=False)

row_count = df_job_listings.shape[0]
print(f"Successfully saved {row_count} job listings to {csv_filename}")
print("First 5 rows of the DataFrame:")
df_job_listings.head(n=5)

Successfully saved 418 job listings to job_listings.csv
First 5 rows of the DataFrame:


Unnamed: 0,Job Title,Company,Location,Country,Workplace,Experience,Industry,Salary,Published Date
0,Data Analyst,Thomson Reuters,"San Diego, CA",United States,In-office,0 - 3 years,Tech,"$72,100 - $133,900","Nov 3, 2025"
1,"Data Analyst, Insights and Reporting",Sony Music Entertainment,"New York, NY",United States,In-office,0 - 3 years,Entertainment,"$64,000 - $70,000","Aug 27, 2025"
2,"Senior Data Analyst, Insights and Analytics",Nintendo,"Redmond, WA",United States,Hybrid,5 - 10 years,Tech,"$105,300 - $168,500","Aug 27, 2025"
3,Lead Data Analyst,SeatGeek,"New York, NY",United States,In-office,5 - 10 years,Tech,"$125,000 - $180,000","Aug 27, 2025"
4,Data Analyst,Rent The Runway,"Brooklyn, NY",United States,In-office,3 - 5 years,Retail,"$100,000 - $125,000","Aug 27, 2025"


## Summary:

### Data Analysis Key Findings
*   The web scraping process successfully collected a total of 418 data analyst job listings from the website.
*   The script iterated through 28 pages of job listings, stopping when no further listings were found on page 29.
*   For each listing, the following details were extracted: 'Job Title', 'Company', 'Location', 'Country', 'Workplace', 'Experience', 'Industry', 'Salary', and 'Published Date'. Missing information was recorded as 'N/A'.
*   All collected job listings were successfully compiled into a Pandas DataFrame and saved to a CSV file named `job_listings.csv`.

### Insights or Next Steps
*   The `job_listings.csv` file now contains a comprehensive dataset of data analyst jobs, which can be used for further analysis such as market trends, salary benchmarking, and demand for specific skills or locations.
*   Given that web structures can change, regular maintenance and testing of the scraping script would be beneficial to ensure its continued functionality and accuracy in data collection.
