# **Scrape and Analyze Job Listings for Data Analyst Roles**

In [28]:
# Install BeautifulSoup and requests (usually already available in Colab)
!pip install beautifulsoup4

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from collections import Counter

print("Libraries imported successfully!")

Libraries imported successfully!


In [29]:
# URL of Internshala’s Data Analytics Internship listings
url = 'https://internshala.com/internships/data-analytics-internship'

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0'
}

print("URL and headers are set!")

URL and headers are set!


In [30]:
# Send request to the website
response = requests.get(url, headers=headers)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

print("Web page content loaded and parsed!")

Web page content loaded and parsed!


In [31]:
# Find all internship cards on the page
job_cards = soup.find_all('div', class_='individual_internship')

print(f"Found {len(job_cards)} internships on the page!")

Found 41 internships on the page!


In [52]:
internships = []

for i, job in enumerate(job_cards):
    # Re-inspecting the HTML structure for title, company, and location
    # It seems the title is within an h3 tag with class 'heading_4_5'
    title_element = job.find('h3', class_='heading_4_5')
    title = title_element.text.strip() if title_element else "N/A"
    print(f"Internship {i} Title: {title}") # Debug print

    # The company name is within an h4 tag with class 'heading_6'
    company_element = job.find('h4', class_='heading_6')
    company = company_element.text.strip() if company_element else "N/A"
    print(f"Internship {i} Company: {company}") # Debug print

    # The location is within a div with class 'item_body' and a tag with class 'location_link'
    location_div = job.find('div', class_='item_body')
    location_element = location_div.find('a', class_='location_link') if location_div else None
    location = location_element.text.strip() if location_element else "N/A"
    print(f"Internship {i} Location: {location}") # Debug print


    # The description is in a div with class 'other_detail_item' but the content seems inconsistent, let's try to get the first skill or key responsibility
    description_element = job.find('div', class_='other_detail_item')
    # Looking for a div with class 'item_body' which seems to contain the skills or responsibilities
    skills_element = description_element.find('div', class_='item_body') if description_element else None
    description = skills_element.text.strip() if skills_element else ""
    print(f"Internship {i} Description: {description[:50]}...") # Debug print

    internships.append({
        'Title': title,
        'Company': company,
        'Location': location,
        'Description': description
    })

print("Internship details extracted!")

Internship 0 Title: N/A
Internship 0 Company: N/A
Internship 0 Location: N/A
Internship 0 Description: ...
Internship 1 Title: N/A
Internship 1 Company: N/A
Internship 1 Location: N/A
Internship 1 Description: ...
Internship 2 Title: N/A
Internship 2 Company: N/A
Internship 2 Location: N/A
Internship 2 Description: ...
Internship 3 Title: N/A
Internship 3 Company: N/A
Internship 3 Location: N/A
Internship 3 Description: ...
Internship 4 Title: N/A
Internship 4 Company: N/A
Internship 4 Location: N/A
Internship 4 Description: ...
Internship 5 Title: N/A
Internship 5 Company: N/A
Internship 5 Location: N/A
Internship 5 Description: ...
Internship 6 Title: N/A
Internship 6 Company: N/A
Internship 6 Location: N/A
Internship 6 Description: ...
Internship 7 Title: N/A
Internship 7 Company: N/A
Internship 7 Location: N/A
Internship 7 Description: ...
Internship 8 Title: N/A
Internship 8 Company: N/A
Internship 8 Location: N/A
Internship 8 Description: ...
Internship 9 Title: N/A
Internship 9 

In [72]:
# Convert list of dictionaries to a DataFrame
df = pd.DataFrame(internships)

# Show the first 5 entries
display(df.head())

Unnamed: 0,Title,Company,Location,Description
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,


In [30]:
if df.empty:
    print("No internship data available.")

No internship data available.


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import time

In [4]:
url = "https://remoteok.com/remote-data-jobs"
headers = {'User-Agent': 'Mozilla/5.0'}

response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    print("Page fetched successfully!")
else:
    print("Failed to fetch the page.")

Page fetched successfully!


In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

# Find all job listings
jobs = soup.find_all('tr', class_='job')

print(f"Found {len(jobs)} job listings")

Found 9 job listings


In [6]:
internships = []

for job in jobs:
    title = job.find('h2', itemprop='title')
    company = job.find('h3', itemprop='name')
    location = job.find('div', class_='location')
    date = job.find('time')

    internships.append({
        "Title": title.text.strip() if title else "N/A",
        "Company": company.text.strip() if company else "N/A",
        "Location": location.text.strip() if location else "Worldwide",
        "Date": date['datetime'].strip() if date else "N/A"
    })

print(f"Extracted {len(internships)} job details")

Extracted 9 job details


In [7]:
df = pd.DataFrame(internships)

# Show the first few rows
df.head()

Unnamed: 0,Title,Company,Location,Date
0,,,Worldwide,
1,,,Worldwide,
2,,,Worldwide,
3,,,Worldwide,
4,,,Worldwide,


**Another example**

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import time

In [11]:
url = "https://remoteok.com/remote-data-jobs"
headers = {'User-Agent': 'Mozilla/5.0'}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    print("Page loaded successfully!")
else:
    print("Failed to load page.")

Page loaded successfully!


In [12]:
soup = BeautifulSoup(response.content, 'html.parser')

jobs = soup.find_all('tr', class_='job')

print(f"Found {len(jobs)} job listings (including placeholders)")

Found 9 job listings (including placeholders)


In [13]:
valid_jobs = [job for job in jobs if job.find('h2') and job.find('h3')]

print(f"Found {len(valid_jobs)} valid job listings")

Found 0 valid job listings


In [14]:
internships = []

for job in valid_jobs:
    title = job.find('h2', itemprop='title')
    company = job.find('h3', itemprop='name')
    location = job.find('div', class_='location')
    date = job.find('time')

    internships.append({
        "Title": title.text.strip() if title else "N/A",
        "Company": company.text.strip() if company else "N/A",
        "Location": location.text.strip() if location else "Worldwide",
        "Date": date['datetime'].strip() if date else "N/A"
    })

In [16]:
df = pd.DataFrame(internships)

print("Data extracted successfully!")

Data extracted successfully!


In [17]:
df.head()