In [2]:
import json
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [65]:
def scrape_pg_links(location_name, url, max_links=50, max_scrolls=30):
    """Collect PG listing URLs for one locality from a search-results page.

    Opens ``url`` in Chrome, scrolls to the bottom until the page height stops
    growing, then reads the ``href`` of every ``h2 a`` anchor and keeps the
    ones that contain the locality slug (``location_name`` lower-cased with
    spaces replaced by hyphens).

    Args:
        location_name: Locality label, e.g. ``"Banjara Hills"``.
        url: Search-results page to open.
        max_links: Upper bound on the number of URLs returned. ``None``
            disables the cap.
        max_scrolls: Maximum number of scroll-to-bottom attempts.

    Returns:
        list[str]: Unique matching URLs in page order, truncated to
        ``max_links``. ``[]`` if waiting, scrolling or extraction fails (the
        error is printed instead of raised).

    Raises:
        Exception: Whatever ``driver.get`` raises; the browser is closed
        before the error reaches the caller.
    """
    print(f"\nScraping {location_name} urls")

    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # Navigation sits inside the try/finally so a failed page load can no
        # longer leave a Chrome process behind; the error itself still
        # propagates exactly as before.
        driver.get(url)
        time.sleep(5)

        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h2 a"))
            )

            # Scroll until the document height stops changing (presumably the
            # page appends results on scroll) or the scroll budget runs out.
            last_height = driver.execute_script("return document.body.scrollHeight")
            for _ in range(max_scrolls):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    print("Reached end of page.")
                    break
                last_height = new_height

            # Read each href once; dict.fromkeys de-duplicates while keeping
            # page order, so the result is reproducible between runs.
            hrefs = (a.get_attribute("href") for a in driver.find_elements(By.CSS_SELECTOR, "h2 a"))
            unique_urls = list(dict.fromkeys(h for h in hrefs if h))

            locality_slug = location_name.lower().replace(" ", "-")
            filtered_urls = [u for u in unique_urls if locality_slug in u]

            # Honour the documented cap (it was previously accepted but ignored).
            if max_links is not None:
                filtered_urls = filtered_urls[:max(0, max_links)]

            print(f"Found {len(filtered_urls)} filtered PG URLs for {location_name}")
            return filtered_urls

        except Exception as e:
            print(f"Error scraping {location_name}: {e}")
            return []

    finally:
        driver.quit()


In [66]:
# Load the locality -> search-page URL mapping that drives the link scraper.
# The encoding is explicit so the read does not depend on the platform default.
with open("locations_url.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Indexed by locality name in the cells below (e.g. locations["Madhapur"]).
locations = data["locations_urls"]

In [67]:
# Accumulator: locality name -> list of listing URLs, filled one cell at a time.
all_pg_links = dict()

In [68]:
# Collect listing URLs for Madhapur and file them under the locality name.
all_pg_links["Madhapur"] = scrape_pg_links(
    location_name="Madhapur",
    url=locations["Madhapur"],
)


Scraping Madhapur urls
Reached end of page.
Found 42 filtered PG URLs for Madhapur


In [69]:
# Collect listing URLs for Gachibowli and file them under the locality name.
all_pg_links["Gachibowli"] = scrape_pg_links(
    location_name="Gachibowli",
    url=locations["Gachibowli"],
)


Scraping Gachibowli urls
Reached end of page.
Found 36 filtered PG URLs for Gachibowli


In [70]:
# Collect listing URLs for Kondapur and file them under the locality name.
all_pg_links["Kondapur"] = scrape_pg_links(
    location_name="Kondapur",
    url=locations["Kondapur"],
)


Scraping Kondapur urls
Reached end of page.
Found 24 filtered PG URLs for Kondapur


In [71]:
# Collect listing URLs for Manikonda and file them under the locality name.
all_pg_links["Manikonda"] = scrape_pg_links(
    location_name="Manikonda",
    url=locations["Manikonda"],
)


Scraping Manikonda urls
Reached end of page.
Found 11 filtered PG URLs for Manikonda


In [72]:
# Collect listing URLs for Miyapur and file them under the locality name.
all_pg_links["Miyapur"] = scrape_pg_links(
    location_name="Miyapur",
    url=locations["Miyapur"],
)


Scraping Miyapur urls
Reached end of page.
Found 19 filtered PG URLs for Miyapur


In [73]:
# Collect listing URLs for Begumpet and file them under the locality name.
all_pg_links["Begumpet"] = scrape_pg_links(
    location_name="Begumpet",
    url=locations["Begumpet"],
)


Scraping Begumpet urls
Reached end of page.
Found 10 filtered PG URLs for Begumpet


In [74]:
# Collect listing URLs for Ameerpet and file them under the locality name.
all_pg_links["Ameerpet"] = scrape_pg_links(
    location_name="Ameerpet",
    url=locations["Ameerpet"],
)


Scraping Ameerpet urls
Reached end of page.
Found 44 filtered PG URLs for Ameerpet


In [75]:
# Collect listing URLs for Uppal and file them under the locality name.
all_pg_links["Uppal"] = scrape_pg_links(
    location_name="Uppal",
    url=locations["Uppal"],
)


Scraping Uppal urls
Reached end of page.
Found 35 filtered PG URLs for Uppal


In [76]:
# Collect listing URLs for Somajiguda and file them under the locality name.
all_pg_links["Somajiguda"] = scrape_pg_links(
    location_name="Somajiguda",
    url=locations["Somajiguda"],
)


Scraping Somajiguda urls
Reached end of page.
Found 16 filtered PG URLs for Somajiguda


In [77]:
# Collect listing URLs for Banjara Hills and file them under the locality name.
all_pg_links["Banjara Hills"] = scrape_pg_links(
    location_name="Banjara Hills",
    url=locations["Banjara Hills"],
)


Scraping Banjara Hills urls
Reached end of page.
Found 11 filtered PG URLs for Banjara Hills


In [78]:
# Persist the locality -> URL-list mapping so the detail scraper can reload it.
with open("pg_urls_combined.json", mode="w") as f:
    json.dump(obj=all_pg_links, fp=f, indent=4)

# =============================================

In [6]:
def _text_or_nan(root, css_selector):
    """Return the stripped text of the first ``css_selector`` match under ``root``, or ``np.nan`` if the lookup fails."""
    try:
        return root.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except Exception:
        # Best-effort field: a missing element becomes NaN. KeyboardInterrupt
        # and SystemExit are deliberately NOT caught here.
        return np.nan


def _joined_texts_or_nan(root, css_selector):
    """Return the non-blank stripped texts of all ``css_selector`` matches under ``root``, joined by ``', '``.

    An empty result list gives ``''``; ``np.nan`` is returned only when the
    lookup itself raises.
    """
    try:
        texts = (el.text.strip() for el in root.find_elements(By.CSS_SELECTOR, css_selector))
        return ', '.join(t for t in texts if t)
    except Exception:
        return np.nan


def scrape_pg_link_to_df(link, location="Unknown"):
    """Scrape one PG detail page into a DataFrame with one row per room block.

    Page-level fields (name, food, availability, posting date, gate closing
    time, parking, common amenities) are read once and repeated on every row;
    sharing details and room amenities are read per room block.

    Args:
        link: URL of the PG detail page.
        location: Label stored in the ``location`` column of every row.

    Returns:
        pandas.DataFrame: Columns ``pg_name``, ``location``, ``source_url``,
        ``sharing_details``, ``room_amenities``, ``common_amenities``,
        ``food_facility``, ``available_from``, ``posted_on``,
        ``gate_closing_time``, ``parking``. Optional fields that cannot be
        read are ``np.nan``. The frame is empty when the page fails to load,
        the ``h1`` name is missing, or no room blocks are found.
    """
    data = []
    print(f"\nScraping PG: {link}")

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run Chrome without opening a window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(link)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.ID, "categoryRoomDetails")))

        # The name is mandatory: if it is missing the whole page is skipped
        # via the outer handler below.
        pg_name = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()

        food_facility = _text_or_nan(driver, "#details-summary-foodIncluded")
        available_from = _text_or_nan(driver, "#details-summary-availableFrom")
        posted_on = _text_or_nan(driver, "#details-summary-lastUpdateDate")
        gate_closing_time = _text_or_nan(driver, "#details-summary-gateClosingTime")
        parking = _text_or_nan(driver, "#details-summary-parkingDesc")
        common_amenities = _joined_texts_or_nan(driver, ".nb__1ZTzO")

        room_blocks = driver.find_elements(By.CSS_SELECTOR, "#categoryRoomDetails > div.nb__3MF7Q")

        for block in room_blocks:
            data.append({
                "pg_name": pg_name,
                "location": location,
                "source_url": link,
                "sharing_details": _text_or_nan(block, ".nb__3Qdtl"),
                "room_amenities": _joined_texts_or_nan(block, ".roomAmenities [class*='nb__']"),
                "common_amenities": common_amenities,
                "food_facility": food_facility,
                "available_from": available_from,
                "posted_on": posted_on,
                "gate_closing_time": gate_closing_time,
                "parking": parking
            })

    except Exception as e:
        print(f"could not load the page: {e}")

    finally:
        driver.quit()

    # Return as DataFrame
    return pd.DataFrame(data)


In [7]:
# Reload the locality -> listing-URL mapping written by the link scraper.
with open("pg_urls_combined.json", mode="r") as f:
    pg_data = json.load(fp=f)

In [8]:
def scrape_location_links_to_df(location, pg_data):
    """Scrape every listing URL stored for ``location`` and stack the results.

    Args:
        location: Key into ``pg_data``; also passed through as the row label.
        pg_data: Mapping of location name to a list of listing URLs.

    Returns:
        pandas.DataFrame: All non-empty per-listing frames concatenated with a
        fresh index, or an empty ``DataFrame`` when nothing was scraped.
        A listing that raises is reported on stdout and skipped.
    """
    links = pg_data[location]
    print(f"Scraping PGs for location: {location} ({len(links)} links)")

    frames = []
    for link in links:
        try:
            listing_df = scrape_pg_link_to_df(link, location=location)
            if not listing_df.empty:
                frames.append(listing_df)
            # Pause between listings (skipped when the scrape above raised).
            time.sleep(2)
        except Exception as e:
            print(f"Error scraping link:\n{link}\n{e}")

    # Guard clause: nothing usable came back for this location.
    if not frames:
        print(f"No data scraped for {location}")
        return pd.DataFrame()

    final_df = pd.concat(frames, ignore_index=True)
    print(f"Scraped {len(final_df)} rows for {location}")
    return final_df


In [9]:
# Scrape every Madhapur listing and save the rows as a per-location CSV.
madhapur_df = scrape_location_links_to_df(location="Madhapur", pg_data=pg_data)
madhapur_df.to_csv(path_or_buf="pg_data_madhapur.csv", index=False)

Scraping PGs for location: Madhapur (42 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-madhapur-hyderabad-for-rs-7500/8a9fa98494d4867c0194d4b1e76f10d6/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-madhapur-hyderabad-for-rs-9000/8a9fae82793253b60179327acb7c0f7f/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-madhapur-hyderabad-for-rs-8500/8a9faf839420400f019420b169f72006/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-madhapur-hyderabad-for-rs-8000/8a9f82de765132e40176513ad4360243/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-anyone-in-madhapur-hyderabad-for-rs-9000/8a9fbc839588bd210195892559822c07/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-madhapur-hyderabad-for-rs-6500/8a9ff88285de152f0185de32613410e8/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-madhapur-hyderabad

In [10]:
# Scrape every Gachibowli listing and save the rows as a per-location CSV.
Gachibowli_df = scrape_location_links_to_df(location="Gachibowli", pg_data=pg_data)
Gachibowli_df.to_csv(path_or_buf="pg_data_Gachibowli.csv", index=False)

Scraping PGs for location: Gachibowli (36 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-gachibowli-hyderabad-for-rs-7500/8a9f8e439053cb3f01905418051027df/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in--gachibowli-hyderabad-for-rs-25500/8a9fb6827fa6c911017fa70be8e82c6e/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-gachibowli-hyderabad-for-rs-12000/8a9f0f8287ea151a0187ea47711b1a0b/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-gachibowli-hyderabad-for-rs-10000/8a9f80836fb20059016fb2c7faea22ca/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-gachibowli-hyderabad-for-rs-7000/8a9f8f8391eeea9b0191eef7eed1036c/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-gachibowli-hyderabad-for-rs-22000/8a9fba83959e322e01959e52eab70d81/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-ga

In [11]:
# Scrape every Kondapur listing and save the rows as a per-location CSV.
Kondapur_df = scrape_location_links_to_df(location="Kondapur", pg_data=pg_data)
Kondapur_df.to_csv(path_or_buf="pg_data_Kondapur.csv", index=False)

Scraping PGs for location: Kondapur (24 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-kondapur-hyderabad-for-rs-6500/8a9fd68283d11f3b0183d1670a1d2d85/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-kondapur-hyderabad-for-rs-9500/8a9f88c38c8f4d0e018c9042955b7178/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-kondapur-hyderabad-for-rs-7500/ff8081816bc6167d016bc67107ac4d3d/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-kondapur-hyderabad-for-rs-8000/8a9fa0838c33d862018c33f80b061346/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-anyone-in-2nd-a-street-opp-lane-to-s-mart-super-market-raja-rajeshwara-nagar-kondapur-telangana-500084-india-hyderabad-for-rs-12000/8a9fae8395568715019556b5a284142a/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-anyone-in-kondapur-hyderabad-for-rs-null/8a9f878497ba11a50197ba4f6cb61bfc/detail

In [12]:
# Scrape every Manikonda listing and save the rows as a per-location CSV.
Manikonda_df = scrape_location_links_to_df(location="Manikonda", pg_data=pg_data)
Manikonda_df.to_csv(path_or_buf="pg_data_Manikonda.csv", index=False)

Scraping PGs for location: Manikonda (11 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-manikonda-hyderabad-for-rs-8000/8a9f8ec38e36a0ac018e36e007bb284d/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-manikonda-jagir-hyderabad-for-rs-9000/8a9fed828533c2bf018533d62f200957/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-manikonda-hyderabad-for-rs-6500/8a9f8e8497afc5030197afd940a10840/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-manikonda-hyderabad-for-rs-7000/ff8081816c296778016c2cfce8207097/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-manikonda-hyderabad-for-rs-7000/8a9f8ec3936c012d01936c6823fd2f82/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-manikonda-jagir-hyderabad-for-rs-7000/8a9fa5827cc1d964017cc587ce3004db/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-

In [13]:
# Scrape every Miyapur listing and save the rows as a per-location CSV.
Miyapur_df = scrape_location_links_to_df(location="Miyapur", pg_data=pg_data)
Miyapur_df.to_csv(path_or_buf="pg_data_Miyapur.csv", index=False)

Scraping PGs for location: Miyapur (19 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-miyapur-hyderabad-for-rs-6000/8a9f8a439209f40c01920a0cfee80643/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-miyapur-hyderabad-for-rs-5500/8a9f84438dd8e431018dd9c58acc1149/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-miyapur-hyderabad-for-rs-null/8a9fbb8395b7a40e0195b7e592101789/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-miyapur-hyderabad-for-rs-7000/ff8081816daf0b42016daf307ca81938/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-miyapur-hyderabad-for-rs-7500/8a9f8243939f12e501939f58f8081174/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-miyapur-hyderabad-for-rs-6500/8a9f8cc493ce47c70193ce9895e11e98/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-miyapur-hyderabad-for-rs-510

In [14]:
# Scrape every Begumpet listing and save the rows as a per-location CSV.
Begumpet_df = scrape_location_links_to_df(location="Begumpet", pg_data=pg_data)
Begumpet_df.to_csv(path_or_buf="pg_data_Begumpet.csv", index=False)

Scraping PGs for location: Begumpet (10 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-begumpet-hyderabad-for-rs-5500/8a9f924390a4e5da0190a5576be52260/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-begumpet-hyderabad-for-rs-7000/8a9fc782827d0f4101827d8ee8e057e0/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-begumpet-hyderabad-for-rs-6500/8a9fac8597d0038c0197d04fad231d07/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-begumpet-hyderabad-for-rs-7000/8a9f99838eff795d018eff9f4019156f/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-begumpet-hyderabad-for-rs-8000/8a9fb38d7ddb9b67017ddbb4d8b60fee/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in--begumpet-hyderabad-for-rs-5300/8a9f868492655cff0192658302110b4a/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-begumpet-hyderabad

In [15]:
# Scrape every Ameerpet listing and save the rows as a per-location CSV.
Ameerpet_df = scrape_location_links_to_df(location="Ameerpet", pg_data=pg_data)
Ameerpet_df.to_csv(path_or_buf="pg_data_Ameerpet.csv", index=False)

Scraping PGs for location: Ameerpet (44 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-ameerpet-hyderabad-for-rs-5500/8a9f87038dbb47b2018dbc2644682c83/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-ameerpet-hyderabad-for-rs-8800/8a9f8bc492d25e5b0192d2bb00a01a7b/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-ameerpet--hyderabad-for-rs-8000/8a9f9484974d10cd01974d84d2f00ab6/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-ameerpet-hyderabad-for-rs-5500/ff8081816b646993016b64e5c70c68b8/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-ameerpet-hyderabad-for-rs-7500/8a9f822c760496c5017604c351930bd2/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-ameerpet-hyderabad-for-rs-5200/8a9fa982787e17e201787e32e8940921/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-ameerpet-hyderabad-

In [16]:
# Scrape every Uppal listing and save the rows as a per-location CSV.
Uppal_df = scrape_location_links_to_df(location="Uppal", pg_data=pg_data)
Uppal_df.to_csv(path_or_buf="pg_data_Uppal.csv", index=False)

Scraping PGs for location: Uppal (35 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-uppal-hyderabad-for-rs-4500/8a9fae83941b877601941be3ae1a220d/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-uppal-hyderabad-for-rs-6000/8a9f8483939184660193919b0a7a086d/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-uppal-hyderabad-for-rs-8000/8a9f82bb712a381d01712ba7eac04789/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-uppal-hyderabad-for-rs-null/8a9fa78b9777277c01977758f32e1351/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-uppal-hyderabad-for-rs-4600/ff8081816db8bf01016db9180f091b56/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-uppal-hyderabad-for-rs-6500/8a9f81ba6f6bb506016f6bb6c2540011/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-uppal-hyderabad-for-rs-5000/8a9f8d8897d453

In [17]:
# Scrape every Somajiguda listing and save the rows as a per-location CSV.
Somajiguda_df = scrape_location_links_to_df(location="Somajiguda", pg_data=pg_data)
Somajiguda_df.to_csv(path_or_buf="pg_data_Somajiguda.csv", index=False)

Scraping PGs for location: Somajiguda (16 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-somajiguda-hyderabad-for-rs-5500/8a9f8c84976cd2bc01976d239f2627ac/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-somajiguda-hyderabad-for-rs-5800/8a9f92c390cfd0260190d03171e332ce/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-somajiguda-hyderabad-for-rs-6000/ff8081816ef48c30016ef531b1460373/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-somajiguda-hyderabad-for-rs-8000/8a9f8a8393c31fea0193c3537abd0988/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-somajiguda-hyderabad-for-rs-6500/8a9f9b8278ed72e30178ee183d434ee4/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-somajiguda--hyderabad-for-rs-6500/8a9f028286ea85510186eab871fa1a9c/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-soma

In [18]:
# Scrape every Banjara Hills listing and save the rows as a per-location CSV.
Banjara_Hills_df = scrape_location_links_to_df(location="Banjara Hills", pg_data=pg_data)
Banjara_Hills_df.to_csv(path_or_buf="pg_data_Banjara_Hills.csv", index=False)

Scraping PGs for location: Banjara Hills (11 links)

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-banjara-hills-hyderabad-for-rs-8000/8a9fb7827a4c4c71017a4c711e81190c/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-anyone-in-banjara-hills-hyderabad-for-rs-8000/8a9f8b838f3e8e93018f3efc9b5a35c3/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-banjara-hills-hyderabad-for-rs-8500/8a9f917975876a60017587a3188721f3/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-boys-in-banjara-hills--hyderabad-for-rs-6500/8a9f96828542f1c401854380da2d739e/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in-banjara-hills-hyderabad-for-rs-8000/8a9f9e038ebc6a63018ebc73ce4302d6/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-hostel-for-girls-in--banjara-hills-hyderabad-for-rs-6500/8a9fb38497b5c7640197b636255b4b98/detail

Scraping PG: https://www.nobroker.in/property/pg/pg-h

In [19]:
# Per-location frames, in the order they were scraped above.
all_dfs = [
    madhapur_df,
    Gachibowli_df,
    Kondapur_df,
    Manikonda_df,
    Miyapur_df,
    Begumpet_df,
    Ameerpet_df,
    Uppal_df,
    Somajiguda_df,
    Banjara_Hills_df,
]

In [20]:
# Stack all locations into one frame with a fresh 0..n-1 index.
merged_df = pd.concat(objs=all_dfs, ignore_index=True)

In [21]:
# Persist the merged frame without the index column.
merged_df.to_csv(path_or_buf="all_pg_locations_data.csv", index=False)

In [22]:
# Sanity check: (rows, columns) of the merged frame.
merged_df.shape

(333, 11)

In [1]:
# NOTE(review): disabled alternative that rebuilds the merged frame by reading
# the per-location "pg_data_<name>.csv" files back from disk instead of using
# the in-memory DataFrames. Kept for reference only; nothing below executes.
# import pandas as pd

# # List of location names in order
# location_names = [
#     "madhapur", "Gachibowli", "Kondapur", "Manikonda", "Miyapur",
#     "Begumpet", "Ameerpet", "Uppal", "Somajiguda", "Banjara_Hills"
# ]

# # Merge them
# location_dfs = []
# for name in location_names:
#     path = f"pg_data_{name}.csv"
#     df = pd.read_csv(path)
#     location_dfs.append(df)

# # Concatenate all into one final DataFrame
# merged_location_df = pd.concat(location_dfs, ignore_index=True)

# # Save it if needed
# merged_location_df.to_csv("all_pg_locations_data_remerged.csv", index=False)


In [None]:
# Write the merged frame under the final dataset name, without the index column.
merged_df.to_csv(path_or_buf="pg_data.csv", index=False)