In [1]:
import pandas as pd
import numpy as np

import warnings
# Install an "ignore" filter so warnings raised from here on are not shown.
# NOTE(review): this is a blanket filter (no category or module restriction), so it
# also hides deprecation warnings from the libraries used below — consider narrowing it.
warnings.simplefilter("ignore") 

### Description:
This script extracts area names from property addresses in a rental dataset for Bangalore.  

### Key Steps:
- Reads rental property data from an Excel file.
- Reads a list of areas in Bangalore, which were web-scraped from a real estate website.
- Cleans and formats addresses by removing trailing city suffixes such as ", Bangalore" or ", Bangalore East", then lower-casing them and removing spaces.
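- Uses fuzzy matching to map each address to the closest known area from the web-scraped list, trying the comma-separated parts of the address from last to first.
- Assigns the best-matched area (match score of at least 80), or marks the row as "Area Not Found" if no part of the address matches.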
- Saves the cleaned dataset with the extracted locality names into a new Excel file.

In [2]:
# Rental listings; the 'Address' column of this frame is cleaned and matched below.
df = pd.read_excel(io='Bangalore_House_Rent_Data.xlsx')

In [3]:
# Known area names: the first column of the areas workbook, as a NumPy array.
Areas_in_Bangalore = np.array(
    pd.read_excel(io='Areas_in_Bangalore_City.xlsx').iloc[:, 0]
)

In [4]:
# Preview the loaded area names (NumPy elides the middle of long arrays with "...").
Areas_in_Bangalore

array(['A Dasarahalli', 'A Narayanapura', 'ACL Layout', ..., 'Yettakodi',
       'kaikondrahalli', 'lal bagh'], dtype=object)

In [5]:
# Matching keys: each area name lower-cased with spaces removed, kept in the same
# order as Areas_in_Bangalore so positions line up between the two sequences.
areas = list(map(lambda name: name.lower().replace(' ', ''), Areas_in_Bangalore))

In [6]:
import re

def remove_bangalore(address):
    """Strip trailing city suffixes such as ", Bangalore" or ", Bangalore East".

    Stacked suffixes (", Bangalore North, Bangalore") are removed together, so no
    "Bangalore North" part is left at the end of the address.

    Parameters
    ----------
    address : str
        Raw address text. Non-string values (e.g. NaN for a missing address)
        are returned unchanged instead of raising ``TypeError``.

    Returns
    -------
    str
        ``address`` without its trailing "Bangalore [word]" parts. Matching is
        case-insensitive and anchored at the end of the string.
    """
    if not isinstance(address, str):
        return address
    # First group: any number of *comma-led* "Bangalore [word]" parts that precede
    # the final suffix. Requiring the comma keeps names such as
    # "Near Bangalore University, Bangalore" intact.
    # Remainder: the final suffix, where the comma stays optional as before.
    return re.sub(
        r"(?:,\s*Bangalore(?:\s+\w+)?\s*)*,?\s*Bangalore(?:\s+\w+)?$",
        "",
        address,
        flags=re.IGNORECASE,
    )

In [7]:
# Drop a leading lower-case "in" plus the whitespace after it from each address.
df['Address'] = df['Address'].str.replace(pat=r"^in\s+", repl="", regex=True)
# Trim surrounding whitespace, then remove the trailing city suffix from each address.
df['Cleaned_Location'] = np.array(list(map(remove_bangalore, df['Address'].str.strip())))
# Same normalisation as the area look-up keys: lower case, no spaces.
df['Cleaned_Location'] = df['Cleaned_Location'].str.lower().str.replace(' ','')

In [8]:
from fuzzywuzzy import process

def find_best_match(area, area_list, threshold=80):
    """Return the entry of ``area_list`` that best fuzzy-matches ``area``.

    Parameters
    ----------
    area : str
        Text to look up. A non-string value (e.g. NaN) or a blank string
        yields 'Area Not Found' without calling the matcher.
    area_list : sequence of str
        Candidate names passed to ``process.extractOne``.
    threshold : int, optional
        Minimum score a candidate needs to be accepted (default 80).

    Returns
    -------
    str
        The best-scoring candidate if its score is >= ``threshold``,
        otherwise the literal string 'Area Not Found'.
    """
    # Nothing sensible to look up: missing value or empty address part.
    if not isinstance(area, str) or not area.strip():
        return 'Area Not Found'

    best = process.extractOne(area, area_list)
    # extractOne yields None when there is no candidate to return (e.g. an empty
    # list); unpacking that directly would raise TypeError.
    if best is None:
        return 'Area Not Found'

    # Index instead of unpacking so a result carrying extra fields is still accepted.
    match, score = best[0], best[1]
    return match if score >= threshold else 'Area Not Found'

# Map each normalised key back to the original spelling in Areas_in_Bangalore.
# setdefault keeps the first occurrence, which is what areas.index(...) returned
# for duplicate keys, while avoiding a linear scan for every row.
area_display_names = {}
for original_name, normalised_name in zip(Areas_in_Bangalore, areas):
    area_display_names.setdefault(normalised_name, original_name)

locations = []
# Walk the two columns row by row. Splitting the whole 'Cleaned_Location' column
# inside the loop repeated that work for every row and every address part.
for i, (address, cleaned) in enumerate(zip(df['Address'], df['Cleaned_Location'])):
    # Reset per row so a row with no usable parts cannot inherit the previous result.
    matched_area = 'Area Not Found'
    # A missing address is NaN (not a str) and therefore has no parts to try.
    parts = cleaned.split(',') if isinstance(cleaned, str) else []

    # Try the comma-separated parts from the end of the address towards the start
    # and keep the first part that matches a known area.
    for part in reversed(parts):
        if not part:
            # Empty part (e.g. from ",,"): nothing to match.
            continue
        matched_area = find_best_match(part, areas)
        if matched_area != 'Area Not Found':
            break

    if matched_area != 'Area Not Found':
        locality = area_display_names[matched_area]
    else:
        locality = matched_area

    print(i, address, '->', locality)
    locations.append(locality)

0 Prestige Silver Oak, Pattandur Agrahara, Whitefield -> Whitefield
1 Sobha Amethyst, Kannamangala, Near Seegehalli, Bangalore -> Seegehalli
2 Banjara Layout, Kalkere, Bangalore -> Kalkere
3 Parvath Nivas, Basavanagara, Bangalore -> Basavanagar
4 RJ Brooke Square, Kundalahalli, Brookefield -> Brookefield
5 Adarsh Palm Retreat, Bellandur, Bangalore -> Bellandur
6 Nambiar Ellegenza, Sarjapur Road, Bangalore -> Sarjapur Road
7 Ferns Paradise, Dodda Nekkundi, Outer Ring Road East -> Outer Ring Road
8 Ferns Paradise, Dodda Nekkundi, Outer Ring Road East -> Outer Ring Road
9 KHAN FLAT, Amarjyoti Layout, Domlur, Bangalore -> Domlur
10 Adarsh Palm Retreat, Bellandur, Bangalore -> Bellandur
11 PRAVEEN FLAT, Indira Nagar Stage 2, Bangalore -> Indira Nagar
12 MRKR Mera Homes, Whitefield, Bangalore -> Whitefield
13 Adarsh Palm Meadows, Ramagondanahalli, Whitefield -> Whitefield
14 Kodathi, Near Sarjapur Road, Bangalore -> Sarjapur
15 Ashraya Lake Veiw, Malleshpalya, Kaggadasapura -> Kaggadasapura




4337 ADITI APARTMENTS, Amar Jyoti Layout , Anand Nagar,, Bangalore North, Bangalore -> Anand Nagar
4338 Vinayak Nagar, Bagalur Main Road -> Bagalur Main Road
4339 Ozone Urbana, Devanahalli, Bangalore -> Devanahalli
4340 Maranayakanahalli, Bangalore -> Dasanayakanahalli
4341 Shabarinagar RWA, RK Hegde Nagar, Thanisandra Main Road -> Thanisandra Main Road
4342 Sir MV Layout, Virupakshapura, Bangalore -> Virupakshapura
4343 Nagasandra, Bangalore -> Nagasandra
4344 Divya MSR Gateway, Gokula Extension, Mathikere, Bangalore -> Mathikere
4345 Nitesh Central Park, Vinayak Nagar, Bagalur Main Road -> Bagalur Main Road
4346 Srirampura Jakkur, Bangalore -> Jakkur
4347 Andrahalli Main road , Thigalarpalya , 40 , 1st Main 1st cross, Bangalore North, Bangalore -> Tigalarpalya
4348 Mithuna White Waters, Jakkur, Bangalore -> Jakkur
4349 Skanda Nilaya 61/5, Bangalore North, Bangalore -> Area Not Found
4350 Godrej Royale Woods, Devanahalli, Bangalore -> Devanahalli
4351 Nagasandra, Bangalore -> Nagasand



4601 72/1 Sarvodaya, D.Rajagopal Road, N.S.Halli,, Bangalore North, Bangalore -> Area Not Found
4602 Prestige Finsbury Park, Bagalur, Bangalore -> Bagaluru
4603 Prestige Finsbury Park, Bagalur, Bangalore -> Bagaluru
4604 Vinayaka Layout, Madavara, Bangalore -> Madavara
4605 Jakkur, Bangalore -> Jakkur
4606 Prestige Finsbury Park, Bagalur, Bangalore -> Bagaluru
4607 Veracious Vani Vilas, Yelahanka, Bangalore -> Yelahanka
4608 NR Windgates, Chokkanahalli, Near Thanisandra, Bangalore -> Thanisandra
4609 Trendsquares Ortus 3, Amruthahalli, Near Hebbal, Bangalore -> Hebbal
4610 JK Suchiraa Villas, Jalahalli West, BEL Road, Bangalore -> New BEL Road
4611 Godrej Royale Woods, Devanahalli, Bangalore -> Devanahalli
4612 Assetz Here and Now, Rachenahalli, Bangalore -> Rachenahalli
4613 venkataramanappa Nilya, K.Narayanapura, Thanisandra Main Road -> Thanisandra Main Road
4614 Bharatiya City Nikoo Homes, Thanisandra, Bangalore -> Thanisandra
4615 Prestige Misty Waters, Cholanayakanahalli, Hebbal 

In [9]:
# Add the matched localities as a new 'Locality' column at column position 11.
df.insert(loc=11, column='Locality', value=locations)

In [10]:
# The normalised helper column was only needed for matching; remove it in place.
del df['Cleaned_Location']

In [11]:
# Write the frame, now including 'Locality', to a new workbook without the index column.
df.to_excel(
    excel_writer='Bangalore_House_Rent_Data_Address_Cleaned.xlsx',
    index=False,
)