# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import openpyxl

# Base URL of the Careers360 engineering-college listing.
url = "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india"

# Probe the listing once up front so connectivity problems show before the
# long crawl begins. requests waits forever unless given a timeout, so an
# explicit one keeps a stalled server from hanging the script indefinitely.
r = requests.get(url, timeout=30)
print(f"Status Code: {r.status_code} (200 indicates successful retrieval)")

# Accumulators filled while crawling the listing pages.
collegenames_list = []
locations_list = []
ratings_list = []
ownerships_list = []
# Raw text of each "snippet_block"; the regex pass further down parses it.
imp_list = []
# Values derived from imp_list by the regex pass (one entry per snippet).
courses_list = []
facilities_list = []
establishment_year_list = []
fees_list = []

# Crawl every listing page and collect the raw per-college fields.
# NOTE(review): each field comes from its own page-wide query, so the result
# lists stay row-aligned only if every college card yields exactly one element
# per query -- confirm against the live markup.
for page_number in range(1, 179):  # pages 1..178; the page count is hard-coded
    page_url = f"https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page={page_number}"
    try:
        # An explicit timeout keeps one stalled connection from hanging the crawl.
        r = requests.get(page_url, timeout=30)
    except requests.RequestException as exc:
        # Skip this page rather than lose everything scraped so far.
        print(f"Skipping page {page_number}: {exc}")
        continue
    if not r.ok:
        # 4xx/5xx responses are skipped instead of being parsed as a listing.
        print(f"Skipping page {page_number}: HTTP status {r.status_code}")
        continue

    soup = BeautifulSoup(r.text, 'lxml')

    # Extract college names
    colleges = soup.find_all('div', class_="card_block")
    for college_block in colleges:
        college_name_element = college_block.find("h3", class_="college_name d-md-none")
        collegenames_list.append(college_name_element.text.strip() if college_name_element else "NA")

    # Location and ownership are read from the same content blocks, so query
    # the page once and fill both lists in a single pass.
    content_blocks = soup.find_all('div', class_="content_block d-block d-md-none")
    for content_block in content_blocks:
        span_elements = content_block.find_all('span')

        # Extract college locations: the first <span> of the block
        locations_list.append(span_elements[0].text.strip() if span_elements else "NA")

        # Extract college ownership: 4th span when there are four or more,
        # 2nd span when there are exactly two, otherwise unknown.
        if len(span_elements) >= 4:
            ownerships_list.append(span_elements[3].text.strip())
        elif len(span_elements) == 2:
            ownerships_list.append(span_elements[1].text.strip())
        else:
            ownerships_list.append("NA")

    # Extract college ratings
    ratings = soup.find_all('div', class_='block_border')
    for rating_block in ratings:
        rating_element = rating_block.find('span')
        ratings_list.append(rating_element.text.strip() if rating_element else "NA")

    # Extract important information snippets
    imp = soup.find_all('div', class_="snippet_block")
    for imp_block in imp:
        # Look for the list inside *this* block. Searching the whole page
        # (soup.find) matches any <ul> anywhere, so the "NA" fallback would
        # never trigger for a snippet block that lacks its own list.
        imp_text_element = imp_block.find('ul')
        imp_list.append(imp_block.text.strip() if imp_text_element else "NA")

# Extract Fees, Courses, Facilities, and Establishment Year using Regex
fee_pattern = r'₹[\d\.]+ (?:Lakhs?|K)'

# NOTE(review): the loop below uses these three patterns, but the script never
# defined them, so it failed with NameError. They are a best-effort
# reconstruction that assumes "Label: value" style snippet text -- verify them
# against real snippet text and adjust. Each exposes the wanted value as group 1.
course_pattern = r'Courses?(?:\s+Offered)?\s*[:\-]\s*([^\n]+)'
facilities_pattern = r'Facilit(?:y|ies)\s*[:\-]\s*([^\n]+)'
establishment_pattern = r'(?:Estd\.?|Established)[^\d\n]*(\d{4})'

for info in imp_list:
    # Extract Fees: first fee-looking token in the snippet (whole match)
    fee_match = re.search(fee_pattern, info)
    fees_list.append(fee_match.group(0) if fee_match else "N/A")

    # Extract Courses
    course_match = re.search(course_pattern, info, flags=re.IGNORECASE)
    courses_list.append(course_match.group(1).strip() if course_match else "N/A")

    # Extract Facilities
    facilities_match = re.search(facilities_pattern, info, flags=re.IGNORECASE)
    facilities_list.append(facilities_match.group(1).strip() if facilities_match else "N/A")

    # Extract Establishment Year
    est_match = re.search(establishment_pattern, info, flags=re.IGNORECASE)
    establishment_year_list.append(est_match.group(1) if est_match else "N/A")

# Assemble the collected columns into one table. Headers and value lists are
# paired positionally, and that order is also the column order of the output.
column_headers = [
    "College_Name",
    "Locations",
    "Ownership",
    "Fees",
    "Courses",
    "Facilities",
    "Establishment Year",
    "Rating",
]
column_values = [
    collegenames_list,
    locations_list,
    ownerships_list,
    fees_list,
    courses_list,
    facilities_list,
    establishment_year_list,
    ratings_list,
]
df = pd.DataFrame(dict(zip(column_headers, column_values)))

# Show the first rows as a quick sanity check.
preview = df.head()
print(preview)

# Write the table to CSV without the index column.
df.to_csv('Engineering_colleges_india_8.csv', index=False)