<a href="https://colab.research.google.com/github/Ayush-0108/Chat-Bot/blob/institute/institute.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EVENTS

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Scrape the institute's events page and store one row per event card in
# events_data_with_images.csv.
URL = "https://iiitdwd.ac.in/events/"
headers = {
    "User-Agent": "Mozilla/5.0 (IIITDWD Bot; scraping for educational/demo purposes; contact: youremail@example.com)"
}
# requests has no default timeout; without one a stalled connection would
# block this cell (and "Run All") indefinitely.
REQUEST_TIMEOUT = 30  # seconds

resp = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

# NOTE: a class_ value containing spaces is compared against the complete
# class attribute string, so every selector below only matches elements whose
# class list is exactly this string (same classes, same order).
div_main = soup.find_all('div', class_='text-card-foreground flex flex-col overflow-hidden max-w-md group hover:-translate-y-2 transition-all duration-300 hover:shadow-xl border bg-white rounded-lg shadow-sm py-0 gap-0')
if not div_main:
    # Make an empty result visible instead of silently writing a header-only CSV.
    print("Warning: no event cards matched the selector; the page markup may have changed.")

event_data = []

# Event ids are simply the 1-based position of the card on the page.
for event_id, div in enumerate(div_main, start=1):
    # Image
    image_div = div.find('div', class_='relative h-64 flex-none w-full')
    image_tag = image_div.find('img') if image_div else None
    image_link = image_tag['src'].strip() if image_tag and image_tag.has_attr('src') else np.nan

    # Card content (holds title, date and venue)
    card_content = div.find('div', class_='px-4 py-6 justify-between flex flex-col h-full')

    # Title
    title_tag = card_content.find('h2') if card_content else None
    title = title_tag.text.strip() if title_tag else np.nan

    # Date: same classes as the venue row plus a trailing "mb-1"; because of
    # the exact-string matching noted above the two selectors do not overlap.
    date_div = card_content.find('div', class_='flex text-body font-medium text-gray-500 mb-1') if card_content else None
    event_date = date_div.text.strip() if date_div else np.nan

    # Venue
    venue_div = card_content.find('div', class_='flex text-body font-medium text-gray-500') if card_content else None
    venue = venue_div.text.strip() if venue_div else np.nan

    # The card carries no separate description, so the title is reused.
    description = title

    # Organizer & registration link are not present on the page.
    organizer = np.nan
    registration_link = np.nan

    event_data.append([
        event_id,
        title,
        event_date,
        description,
        venue,
        organizer,
        registration_link,
        image_link
    ])

# Save to CSV
columns = ['event_id', 'title', 'event_date', 'description', 'venue', 'organizer', 'registration_link', 'image_link']
df = pd.DataFrame(event_data, columns=columns)
df.to_csv("events_data_with_images.csv", index=False)
print("Saved to events_data_with_images.csv")


Saved to events_data_with_images.csv


RECRUITMENT

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Scrape the careers page's vacancy table and store one row per posting in
# recruitments_data.csv.
URL = "https://iiitdwd.ac.in/careers/"

headers = {
    "User-Agent": "Mozilla/5.0 (IIITDWD Bot; scraping for educational/demo purposes; contact: youremail@example.com)"
}
# requests has no default timeout; without one a stalled connection would
# block this cell (and "Run All") indefinitely.
REQUEST_TIMEOUT = 30  # seconds

resp = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

div_main = soup.find('div', class_='relative w-full overflow-x-auto rounded-lg border overflow-y-hidden')
table = div_main.find('table') if div_main is not None else None
if table is None:
    # Fail with a message that names the cause rather than an AttributeError
    # on None a few lines further down.
    raise RuntimeError(
        "Recruitment table not found on " + URL + "; the page markup may have changed."
    )

rows = []
recruitment_id = 1

# The first <tr> is skipped as the header row; rows with fewer than five
# <td> cells are ignored as well and do not consume an id.
for tr in table.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds) < 5:
        continue

    # A bs4 Tag is always truthy, so testing the tag itself can never detect
    # an empty cell; test the extracted text and map empty text to NaN.
    position, department, posting_date, closing_date = (
        td.get_text(strip=True) or np.nan for td in tds[:4]
    )

    # The fifth cell holds a link: its text becomes the job description and
    # its href the application link.
    link_tag = tds[4].find('a')
    if link_tag is not None:
        job_description = link_tag.get_text(strip=True) or np.nan
        application_link = link_tag.get('href') or np.nan
    else:
        job_description = np.nan
        application_link = np.nan

    rows.append([
        recruitment_id,
        position,
        department,
        posting_date,
        closing_date,
        job_description,
        application_link
    ])
    recruitment_id += 1

columns = [
    'recruitment_id',
    'position',
    'department',
    'posting_date',
    'closing_date',
    'job_description',
    'application_link'
]

df = pd.DataFrame(rows, columns=columns)
df.to_csv("recruitments_data.csv", index=False)
print("Saved to recruitments_data.csv")


Saved to recruitments_data.csv


CONTACTS

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Scrape the contact page into a single-row contact_page.csv.
URL = "https://iiitdwd.ac.in/contact/"

headers = {
    "User-Agent": "Mozilla/5.0 (IIITDWD Bot; scraping for educational/demo purposes; contact: youremail@example.com)"
}
# requests has no default timeout; without one a stalled connection would
# block this cell (and "Run All") indefinitely.
REQUEST_TIMEOUT = 30  # seconds

resp = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

div_main = soup.find('div', class_='w-full text-title-3 font-normal max-w-xl md:max-w-3xl xl:max-w-5xl mx-auto px-4 md:px-8')
if div_main is None:
    # Fail with a message that names the cause rather than an AttributeError
    # on None when the attributes below are read.
    raise RuntimeError(
        "Contact container not found on " + URL + "; the page markup may have changed."
    )

page_id = 1

# Tag.h1 / Tag.div return the first matching descendant, or None if absent.
heading_tag = div_main.h1
heading = heading_tag.text.strip() if heading_tag is not None else np.nan
description_tag = div_main.div
description = description_tag.text.strip() if description_tag is not None else np.nan

# The links are identified purely by position: the first underlined link is
# taken as the e-mail link and the second as the map link.
a_tags = div_main.find_all('a', class_='text-main underline')
email_link = a_tags[0].get('href') if len(a_tags) > 0 else np.nan
map_link = a_tags[1].get('href') if len(a_tags) > 1 else np.nan

data = [[page_id, heading, description, email_link, map_link]]

columns = ['page_id', 'heading', 'description', 'email_link', 'map_link']
df = pd.DataFrame(data, columns=columns)
df.to_csv("contact_page.csv", index=False)

print("Saved to contact_page.csv")


Saved to contact_page.csv


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Scrape the amenities page and store one row per infrastructure card in
# infrastructure.csv.
URL = "https://iiitdwd.ac.in/amenities/"
headers = {
    "User-Agent": "Mozilla/5.0 (IIITDWD Bot; scraping for educational/demo purposes; contact: youremail@example.com)"
}
# requests has no default timeout; without one a stalled connection would
# block this cell (and "Run All") indefinitely.
REQUEST_TIMEOUT = 30  # seconds

resp = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

divs_main = soup.find_all('div', class_='flex-1 outline-none space-y-4')

data = []
infra_id = 1

for div_main in divs_main:
    divs = div_main.find_all('div', class_='bg-white text-card-foreground flex flex-col gap-6 rounded-xl border py-6 shadow-sm')

    for div in divs:
        # Header: title -> name, sub-title -> type. A card without the
        # expected header yields NaN for both instead of aborting the run.
        card_header_tag = div.find('div', class_='@container/card-header grid auto-rows-min grid-rows-[auto_auto] items-start gap-1.5 px-6 has-[data-slot=card-action]:grid-cols-[1fr_auto] [.border-b]:pb-6')
        if card_header_tag is not None:
            card_title = card_header_tag.find('div', class_='leading-none !text-title-1 font-semibold text-main flex items-center gap-2')
            card_description = card_header_tag.find('div', class_='text-muted-foreground !text-title-3 font-normal')
        else:
            card_title = None
            card_description = None

        # Body: first paragraph is the description, followed by info boxes.
        # A card without the expected body gets a NaN description and no boxes.
        card_content_para = div.find('div', class_='px-6 space-y-6 text-title-3 font-medium')
        if card_content_para is not None:
            first_para = card_content_para.p
            description = first_para.text.strip() if first_para is not None else np.nan
            card_content = card_content_para.find_all('div', class_='bg-background p-4 rounded-lg')
        else:
            description = np.nan
            card_content = []

        facilities_list = []
        location = np.nan

        for content in card_content:
            header = content.find('h3', class_='font-semibold text-title-2 text-main mb-4')
            header_text = header.text.strip() if header else ''

            if "location" in header_text.lower():
                # Location box: keep the box text with the heading removed.
                location = content.text.replace(header_text, "").strip()
            else:
                # Any other box: collect its bullet items as facilities.
                ul_tag = content.find('ul')
                if ul_tag:
                    facilities_list.extend(li.text.strip() for li in ul_tag.find_all('li'))

        facilities = ", ".join(facilities_list) if facilities_list else np.nan

        # Attempt to extract image URL (first <img> anywhere inside the card)
        image_div = div.find('img')
        image_url = image_div['src'] if image_div and image_div.has_attr('src') else np.nan

        name = card_title.text.strip() if card_title else np.nan
        infra_type = card_description.text.strip() if card_description else np.nan

        data.append([
            infra_id,
            name,
            infra_type,
            description,
            location,
            facilities,
            image_url
        ])
        infra_id += 1

columns = ['infrastructure_id', 'name', 'type', 'description', 'location', 'facilities', 'image_url']
df = pd.DataFrame(data, columns=columns)
df.to_csv("infrastructure.csv", index=False)

print("Saved to infrastructure.csv")


Saved to infrastructure.csv
