In [1]:
import requests

url = "https://internshala.com/internships/data-science-internship"
response = requests.get(url)

print(response.status_code)   # should be 200 if successful
print(response.text[:500])    # print first 500 chars of HTML


200
<!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml"
        lang="en-US">

<head>
    <meta http-equiv="X-UA-Compatible" content="IE=9" />
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0 user-scalable=0" />
    <meta property="fb:app_id" content="702141670710132" />
    <meta property="og:type" content="website" />
            <meta property="og:image:width"


In [2]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "html.parser")

# Find all job cards
job_cards = soup.find_all("div", class_="individual_internship")

print("Number of jobs found:", len(job_cards))

# Look at first job
first_job = job_cards[0]
print(first_job.prettify()[:500])   # inspect structure

Number of jobs found: 51
<div class="container-fluid individual_internship view_detail_button visibilityTrackerItem" data-href="/internship/detail/part-time-social-media-internship-in-mumbai-at-griebs-music-pvtltd1756144471" data-source_page="search_page" employment_type="internship" id="individual_internship_2859683" internshipid="2859683" sequential_apply_referral="similar_internships">
 <div class="internship_meta duration_meta">
  <div class="internship-heading-container">
   <div class="company">
    <h3 class="job


In [3]:
jobs = []

for job in job_cards:
    try:
        # Check if elements exist before accessing them
        title_elem = job.find("h3", class_="heading_4_5")
        company_elem = job.find("h4", class_="heading_6")
        link_elem = job.find("a")
        
        if title_elem and company_elem and link_elem:
            title = title_elem.get_text(strip=True)
            company = company_elem.get_text(strip=True)
            link = "https://internshala.com" + link_elem["href"]

            jobs.append({
                "title": title,
                "company": company,
                "url": link
            })
        else:
            print("Skipping job due to missing elements")
            continue
            
    except Exception as e:
        print(f"Error processing job: {e}")
        continue

print(f"Successfully processed {len(jobs)} jobs")
for j in jobs[:5]:   # show first 5 jobs
    print(j)

Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
Skipping job due to missing elements
S

In [4]:
# Debug: Let's examine the actual HTML structure of the first job card
print("=== DEBUGGING HTML STRUCTURE ===")
first_job = job_cards[0]

# Find all h3 and h4 elements to see what classes are available
print("\nAll h3 elements:")
for h3 in first_job.find_all("h3"):
    print(f"  Class: {h3.get('class')}, Text: {h3.get_text(strip=True)[:50]}...")

print("\nAll h4 elements:")
for h4 in first_job.find_all("h4"):
    print(f"  Class: {h4.get('class')}, Text: {h4.get_text(strip=True)[:50]}...")

print("\nAll links:")
for link in first_job.find_all("a"):
    if link.get("href"):
        print(f"  href: {link.get('href')[:50]}...")


=== DEBUGGING HTML STRUCTURE ===

All h3 elements:
  Class: ['job-internship-name'], Text: Social Media...

All h4 elements:

All links:
  href: /internship/detail/part-time-social-media-internsh...


In [5]:
def scrape_internshala(job_title):
    query = job_title.replace(" ", "-")
    url = f"https://internshala.com/internships/{query}-internship"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    jobs = []
    for job in soup.find_all("div", class_="individual_internship"):
        try:
            # Use the correct class names based on debug output
            title_elem = job.find("h3", class_="job-internship-name")
            link_elem = job.find("a")
            
            if title_elem and link_elem:
                title = title_elem.get_text(strip=True)
                link = "https://internshala.com" + link_elem["href"]
                
                # Try to find company name - might be in different elements
                company_elem = job.find("h4") or job.find("div", class_="company") or job.find("a", class_="company")
                company = company_elem.get_text(strip=True) if company_elem else "Company not found"
                
                jobs.append({"title": title, "company": company, "url": link})
        except Exception as e:
            print(f"Error processing job: {e}")
            continue
            
    return jobs

# Test the function
jobs = scrape_internshala("data science")
print(f"Found {len(jobs)} jobs")
for job in jobs[:3]:  # Show first 3 jobs
    print(job)


Found 50 jobs
{'title': 'Social Media', 'company': 'Social MediaGriebs Music Private LimitedActively hiring', 'url': 'https://internshala.com/internship/detail/part-time-social-media-internship-in-mumbai-at-griebs-music-pvtltd1756144471'}
{'title': 'Market Research ( female only) - field work', 'company': 'Market Research ( female only) - field workVital  SynergieActively hiring', 'url': 'https://internshala.com/internship/detail/market-research-internship-in-delhi-at-vital-synergie1758521055'}
{'title': 'Motion Graphics (AI Tools)', 'company': 'Motion Graphics (AI Tools)Integral SolutionActively hiring', 'url': 'https://internshala.com/internship/detail/motion-graphics-ai-tools-internship-in-bangalore-at-integral-solution1757494564'}


In [6]:
# Enhanced scraper with better company detection and data export
import pandas as pd
import json

def scrape_internshala_enhanced(job_title, max_pages=3):
    """
    Enhanced scraper that tries multiple approaches to find company names
    and can handle multiple pages
    """
    all_jobs = []
    
    for page in range(1, max_pages + 1):
        query = job_title.replace(" ", "-")
        url = f"https://internshala.com/internships/{query}-internship"
        if page > 1:
            url += f"?page={page}"
            
        print(f"Scraping page {page}: {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        
        job_cards = soup.find_all("div", class_="individual_internship")
        print(f"Found {len(job_cards)} jobs on page {page}")
        
        for job in job_cards:
            try:
                # Get job title
                title_elem = job.find("h3", class_="job-internship-name")
                title = title_elem.get_text(strip=True) if title_elem else "Title not found"
                
                # Get job link
                link_elem = job.find("a")
                link = "https://internshala.com" + link_elem["href"] if link_elem else "Link not found"
                
                # Try multiple approaches to find company name
                company = "Company not found"
                company_selectors = [
                    "h4",
                    "div.company",
                    "a.company", 
                    "span.company-name",
                    ".company_name",
                    "[class*='company']"
                ]
                
                for selector in company_selectors:
                    company_elem = job.select_one(selector)
                    if company_elem:
                        company = company_elem.get_text(strip=True)
                        if company and company != title:  # Make sure it's not the same as title
                            break
                
                # Extract additional info if available
                location = "Location not specified"
                duration = "Duration not specified"
                
                # Try to find location
                location_elem = job.find("span", class_="location") or job.find("div", class_="location")
                if location_elem:
                    location = location_elem.get_text(strip=True)
                
                # Try to find duration
                duration_elem = job.find("span", class_="duration") or job.find("div", class_="duration")
                if duration_elem:
                    duration = duration_elem.get_text(strip=True)
                
                all_jobs.append({
                    "title": title,
                    "company": company,
                    "location": location,
                    "duration": duration,
                    "url": link
                })
                
            except Exception as e:
                print(f"Error processing job: {e}")
                continue
    
    return all_jobs

# Test the enhanced scraper
jobs = scrape_internshala_enhanced("data science", max_pages=2)
print(f"\nTotal jobs found: {len(jobs)}")

# Display first few jobs
for i, job in enumerate(jobs[:5]):
    print(f"\nJob {i+1}:")
    print(f"  Title: {job['title']}")
    print(f"  Company: {job['company']}")
    print(f"  Location: {job['location']}")
    print(f"  Duration: {job['duration']}")
    print(f"  URL: {job['url']}")


Scraping page 1: https://internshala.com/internships/data-science-internship
Found 51 jobs on page 1
Scraping page 2: https://internshala.com/internships/data-science-internship?page=2
Found 51 jobs on page 2

Total jobs found: 102

Job 1:
  Title: Social Media
  Company: Social MediaGriebs Music Private LimitedActively hiring
  Location: Location not specified
  Duration: Duration not specified
  URL: https://internshala.com/internship/detail/part-time-social-media-internship-in-mumbai-at-griebs-music-pvtltd1756144471

Job 2:
  Title: Market Research ( female only) - field work
  Company: Market Research ( female only) - field workVital  SynergieActively hiring
  Location: Location not specified
  Duration: Duration not specified
  URL: https://internshala.com/internship/detail/market-research-internship-in-delhi-at-vital-synergie1758521055

Job 3:
  Title: Motion Graphics (AI Tools)
  Company: Motion Graphics (AI Tools)Integral SolutionActively hiring
  Location: Location not specifi

In [7]:
# Data Export and Analysis
if jobs:
    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(jobs)
    
    print("=== JOB ANALYSIS ===")
    print(f"Total jobs scraped: {len(df)}")
    print(f"Unique companies: {df['company'].nunique()}")
    print(f"Jobs with company info: {len(df[df['company'] != 'Company not found'])}")
    
    print("\n=== TOP COMPANIES ===")
    top_companies = df['company'].value_counts().head(10)
    print(top_companies)
    
    print("\n=== SAMPLE DATA ===")
    print(df.head())
    
    # Save to files
    df.to_csv('internshala_jobs.csv', index=False)
    df.to_json('internshala_jobs.json', orient='records', indent=2)
    
    print(f"\nData saved to:")
    print("- internshala_jobs.csv")
    print("- internshala_jobs.json")
else:
    print("No jobs found to analyze")


=== JOB ANALYSIS ===
Total jobs scraped: 102
Unique companies: 51
Jobs with company info: 102

=== TOP COMPANIES ===
company
Social MediaGriebs Music Private LimitedActively hiring                    2
Creative DesignVijanX                                                      2
AI Researcher (Speech & Audio)Josh Talks                                   2
Social Media Manager & Content StrategistDuostones LLC                     2
Data AnalyticsEvoNexisActively hiring                                      2
Machine LearningEvoNexisActively hiring                                    2
Financial AnalystZean Lithos And Company Private LimitedActively hiring    2
Business AnalyticsEvoNexisActively hiring                                  2
AI Agent AutomationCalyco Paints                                           2
Data Science (Subject Matter Expert)MetaSphere Ventures                    2
Name: count, dtype: int64

=== SAMPLE DATA ===
                                         title  \
0       

In [23]:
from langchain_community.llms import Ollama
import json

# Initialize Ollama with the correct model name
ollama = Ollama(model="llama3:8b")

def clean_job_with_llm(job):
    """
    Use Llama 2 via Ollama to clean and standardize job information
    """
    raw_text = f"""
    Title: {job['title']}
    Company: {job['company']}
    Location: {job['location']}
    Duration: {job['duration']}
    URL: {job['url']}
    """

    prompt = f"""
    Clean and extract the following job info.
    - Remove extra text like 'Actively hiring'
    - Fix formatting
    - Return ONLY valid JSON with fields:
      title, company, location, duration, url

    Job info:
    {raw_text}
    """

    response = ollama.invoke(prompt)

    try:
        cleaned = json.loads(response)
        return cleaned
    except Exception:
        return {
            "error": "Could not parse JSON",
            "raw": response
        }

In [24]:
cleaned_jobs = []

for job in jobs:  # jobs from your scrape_internshala_enhanced
    cleaned = clean_job_with_llm(job)
    cleaned_jobs.append(cleaned)

# Save as CSV
import pandas as pd
df = pd.DataFrame([j for j in cleaned_jobs if isinstance(j, dict) and "error" not in j])
df.to_csv("cleaned_jobs.csv", index=False)

print("✅ Cleaned job data saved.")
print(df.head())


KeyboardInterrupt: 