In [1]:
!pip install pandas fuzzywuzzy python-Levenshtein rapidfuzz geopandas geopy




# **Preprocess Addresses**
To improve matching accuracy, standardize addresses by:


Converting to lowercase.

Removing special characters.

Standardizing common terms (e.g., "Street" → "St").

In [3]:
import re

def preprocess_address(address):
    """Normalize a raw address string for fuzzy comparison.

    Steps: lowercase and trim, strip punctuation, collapse whitespace, then
    abbreviate common street-type words (e.g. "road" -> "rd").

    Parameters:
    - address: any value; it is converted with ``str()`` first.

    Returns:
    - The cleaned, lowercase address string.
    """
    text = str(address).lower().strip()
    # Periods and apostrophes sit inside a token ("s.v.", "st."), so they are
    # simply dropped ...
    text = re.sub(r"[.']", '', text)
    # ... while every other symbol (comma, slash, hyphen, brackets) separates
    # tokens. Deleting those outright would fuse neighbours ("road,juhu" ->
    # "roadjuhu") and hide them from the whole-word rules below.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    replacements = {
        "street": "st", "road": "rd", "avenue": "ave",
        "boulevard": "blvd", "drive": "dr", "lane": "ln",
        "court": "ct", "place": "pl", "square": "sq"
    }
    for word, abbr in replacements.items():
        # \b...\b restricts each rule to whole words
        text = re.sub(rf'\b{word}\b', abbr, text)
    return text

# Derive the normalized "clean_address" column on both frames
master_df["clean_address"] = master_df["Address"].map(preprocess_address)
new_df["clean_address"] = new_df["Address"].map(preprocess_address)


In [4]:
master_df.sample(5)

Unnamed: 0,Address,Pincode,D_date,clean_address
23,A4 TO 9 GR FLR ROYAL SAND NEW LINK RD ANDHERI ...,400053,12-02-2025,a4 to 9 gr flr royal sand new link rd andheri ...
310,LAHEJJAT GLOBAL BEUFRAYES PVT LTD A 54 AGRAWAL...,400057,19-02-2025,lahejjat global beufrayes pvt ltd a 54 agrawal...
193,NEHRU ROAD VILE PARLE EAST,400057,25-02-2025,nehru rd vile parle east
229,SHOP NO 4 JEEVAN DHARA CHS LTD SHAHAJI RAJE MA...,400057,20-02-2025,shop no 4 jeevan dhara chs ltd shahaji raje ma...
3,7 MITHILA SHOPPING CENTRE V MEHTA ROADJVPD SCH...,400049,06-02-2025,7 mithila shopping centre v mehta roadjvpd sch...


In [5]:
master_df.sample(5)

Unnamed: 0,Address,Pincode,D_date,clean_address
298,SHOP NO 6 A KASHI BHUVAN SOC ANANT WAMAN VARTA...,400057,20-02-2025,shop no 6 a kashi bhuvan soc anant waman varta...
80,SHOP NO1 PERSIAN CHSLVP ROAD OPP FIDAIBAUG AND...,400058,04-02-2025,shop no1 persian chslvp rd opp fidaibaug andhe...
179,GROUND FLOOR SHOP NO 1 AND 2 PARUL CO OPHOUSIN...,400053,10-02-2025,ground floor shop no 1 and 2 parul co ophousin...
308,105 SHOP NO 1 AMRUT BHAVAN OPP AIRTEL GALLERY ...,400057,19-02-2025,105 shop no 1 amrut bhavan opp airtel gallery ...
63,14 BADRUNISA ABDUL MAJIDIRIA SOC RD VILE PARLE W,400056,06-02-2025,14 badrunisa abdul majidiria soc rd vile parle w


# **Address Matching Using Fuzzy Matching**

Fuzzy matching calculates a similarity score between two strings using the Levenshtein (edit) distance.

The `fuzz` module contains functions for calculating string similarity using various methods, including the Levenshtein distance, which is the core of fuzzy matching.

In [6]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def match_address_fuzzy(address, master_addresses, threshold=80):
    """Return the master address most similar to ``address``, or None.

    Parameters:
    - address: the (cleaned) address to look up.
    - master_addresses: candidate addresses to compare against.
    - threshold: minimum ``fuzz.token_sort_ratio`` score (0-100) to accept.

    Returns:
    - The best-scoring candidate when its score is >= ``threshold``,
      otherwise None (also when there are no candidates at all).
    """
    result = process.extractOne(address, master_addresses, scorer=fuzz.token_sort_ratio)
    if result is None:
        # extractOne yields None when it has nothing to return (e.g. empty choices)
        return None
    # Index instead of tuple-unpacking: the result may carry a third element
    # (rapidfuzz's `process`, imported later in this notebook under the same
    # name, returns (match, score, index)).
    best_match, score = result[0], result[1]
    return best_match if score >= threshold else None

# Match new addresses to master dataset.
# Build the candidate list once; the original rebuilt it inside the lambda for every row.
master_clean_addresses = master_df["clean_address"].tolist()
new_df["matched_address"] = new_df["clean_address"].apply(
    lambda x: match_address_fuzzy(x, master_clean_addresses)
)

# Display results
print(new_df[["Address", "matched_address"]])


                                              Address  \
0        001 prime plaza sv roadsantacruz west mumbai   
1   MANISH NAGAR SOC ASSOCIATIONGROUND FLOOR B15 M...   
2   PLOT NO 6 KUMAR HARSHAVVARDHAN CHS JUHU VERSOV...   
3   S NO 1 BLDG NO 2 YASHODHAN APT 144 JP ROADCITY...   
4   SHOP NO 13 MANISH SHOPPING CENTRE JP RD 4 BUNG...   
..                                                ...   
81  4TH AND 5TH FLOOR  ROHAN PLAZA 5TH ROAD KHAR W...   
82  SHOP 5 6 7 8 9 EK DANT CHSL OSHIWARA LINK ROAD...   
83  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...   
84  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...   
85  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...   

                                      matched_address  
0        001 prime plaza sv roadsantacruz west mumbai  
1   manish nagar soc associationground floor b15 m...  
2   plot no 6 kumar harshavvardhan chs juhu versov...  
3   s no 1 bldg no 2 yashodhan apt 144 jp roadcity...  
4   shop no 13 manish shopping cent

# **Address Matching Using Geocoding API**

This approach uses geocoding, which converts addresses into latitude/longitude, and then compares the coordinates.

In [7]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

geolocator = Nominatim(user_agent="address-matcher")

# Function to get lat/lon
def get_lat_lon(address):
    """Geocode ``address`` with the module-level ``geolocator``.

    Returns:
    - ``(latitude, longitude)`` for the located address, or ``(None, None)``
      when nothing was found or the lookup raised an error.
    """
    try:
        location = geolocator.geocode(address, timeout=10)
        return (location.latitude, location.longitude) if location else (None, None)
    except Exception:
        # Best-effort lookup: any geocoding failure maps to "no coordinates".
        # `except Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt/SystemExit propagate, so a long run over a whole
        # column can still be interrupted.
        return (None, None)

# Get coordinates for addresses
master_df["coordinates"] = master_df["Address"].map(get_lat_lon)
new_df["coordinates"] = new_df["Address"].map(get_lat_lon)

# Function to match based on geolocation
def match_address_geo(new_coord, master_coords, max_distance_km=0.5, master_addresses=None):
    """Return the first master address lying within ``max_distance_km`` of ``new_coord``.

    Parameters:
    - new_coord: ``(lat, lon)`` of the address to match; either part may be None.
    - master_coords: iterable of ``(lat, lon)`` pairs, parallel to ``master_addresses``.
    - max_distance_km: exclusive upper bound on the geodesic distance, in km.
    - master_addresses: labels returned on a match, parallel to ``master_coords``.
      Defaults to the notebook-global ``master_df["Address"]`` (the original behavior).

    Returns:
    - The first matching master address, or None when there is none.
    """
    if master_addresses is None:
        master_addresses = master_df["Address"]

    # A query without coordinates can never match; skip the scan entirely.
    # Compare with `is None`: a plain truthiness test would also reject a
    # legitimate 0.0 latitude and would not look at the longitude at all.
    if new_coord is None or new_coord[0] is None or new_coord[1] is None:
        return None

    for master_addr, master_coord in zip(master_addresses, master_coords):
        if master_coord is None or master_coord[0] is None or master_coord[1] is None:
            continue
        if geodesic(new_coord, master_coord).km < max_distance_km:
            return master_addr
    return None

# Match using geolocation.
# Build the coordinate list once; the original rebuilt it inside the lambda for every row.
master_coordinates = master_df["coordinates"].tolist()
new_df["geo_matched_address"] = new_df["coordinates"].apply(
    lambda x: match_address_geo(x, master_coordinates)
)

# Display results
print(new_df[["Address", "geo_matched_address"]])


                                              Address geo_matched_address
0        001 prime plaza sv roadsantacruz west mumbai                None
1   MANISH NAGAR SOC ASSOCIATIONGROUND FLOOR B15 M...                None
2   PLOT NO 6 KUMAR HARSHAVVARDHAN CHS JUHU VERSOV...                None
3   S NO 1 BLDG NO 2 YASHODHAN APT 144 JP ROADCITY...                None
4   SHOP NO 13 MANISH SHOPPING CENTRE JP RD 4 BUNG...                None
..                                                ...                 ...
81  4TH AND 5TH FLOOR  ROHAN PLAZA 5TH ROAD KHAR W...                None
82  SHOP 5 6 7 8 9 EK DANT CHSL OSHIWARA LINK ROAD...                None
83  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...                None
84  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...                None
85  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...                None

[86 rows x 2 columns]


**Hybrid Approach**
For best accuracy, combine fuzzy matching and geocoding:

In [8]:
def hybrid_match(address, new_coord, master_addresses, master_coords):
    """Match by text similarity first and fall back to geographic proximity.

    Parameters:
    - address: address text to match.
    - new_coord: ``(lat, lon)`` of that address (may be ``(None, None)``).
    - master_addresses: candidate address strings for the fuzzy step.
    - master_coords: candidate coordinates for the geo step.

    Returns:
    - The fuzzy match (score >= 85) when there is one; otherwise the result of
      ``match_address_geo`` within 0.5 km, which may be None.
    """
    fuzzy_match = match_address_fuzzy(address, master_addresses, threshold=85)
    if fuzzy_match:
        return fuzzy_match
    # Only run the geodesic scan over every master coordinate when the text
    # match failed; the original computed it unconditionally and then discarded it.
    return match_address_geo(new_coord, master_coords, max_distance_km=0.5)

# Apply hybrid matching.
# Build both candidate lists once; the original rebuilt them for every row.
master_clean_addresses = master_df["clean_address"].tolist()
master_coordinates = master_df["coordinates"].tolist()
new_df["final_matched_address"] = new_df.apply(
    lambda x: hybrid_match(
        x["clean_address"], x["coordinates"], master_clean_addresses, master_coordinates
    ),
    axis=1,
)

print(new_df[["Address", "final_matched_address"]])


                                              Address  \
0        001 prime plaza sv roadsantacruz west mumbai   
1   MANISH NAGAR SOC ASSOCIATIONGROUND FLOOR B15 M...   
2   PLOT NO 6 KUMAR HARSHAVVARDHAN CHS JUHU VERSOV...   
3   S NO 1 BLDG NO 2 YASHODHAN APT 144 JP ROADCITY...   
4   SHOP NO 13 MANISH SHOPPING CENTRE JP RD 4 BUNG...   
..                                                ...   
81  4TH AND 5TH FLOOR  ROHAN PLAZA 5TH ROAD KHAR W...   
82  SHOP 5 6 7 8 9 EK DANT CHSL OSHIWARA LINK ROAD...   
83  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...   
84  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...   
85  SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...   

                                final_matched_address  
0        001 prime plaza sv roadsantacruz west mumbai  
1   manish nagar soc associationground floor b15 m...  
2   plot no 6 kumar harshavvardhan chs juhu versov...  
3   s no 1 bldg no 2 yashodhan apt 144 jp roadcity...  
4   shop no 13 manish shopping cent

In [9]:
new_df.sample(5)

Unnamed: 0,Address,Pincode,D_date,clean_address,matched_address,coordinates,geo_matched_address,final_matched_address
85,SHOP NO 8 BLDG NO B26 WELLINGTON ROADSHASTRI N...,400058,12-02-2025,shop no 8 bldg no b26 wellington roadshastri n...,shop no 8 bldg no b26 wellington roadshastri n...,"(None, None)",,shop no 8 bldg no b26 wellington roadshastri n...
75,GROUND FLOOR SHOP NO 1 AND 2 PARUL CO OPHOUSIN...,400053,10-02-2025,ground floor shop no 1 and 2 parul co ophousin...,ground floor shop no 1 and 2 parul co ophousin...,"(None, None)",,ground floor shop no 1 and 2 parul co ophousin...
74,GROUND FLOOR SHOP NO 1 AND 2 PARUL CO OPHOUSIN...,400053,10-02-2025,ground floor shop no 1 and 2 parul co ophousin...,ground floor shop no 1 and 2 parul co ophousin...,"(None, None)",,ground floor shop no 1 and 2 parul co ophousin...
39,SHOP NO 01 A GEET GOVIND BUILDING OPP SAMRATTH...,400062,07-02-2025,shop no 01 a geet govind building opp samratth...,shop no 01 a geet govind building opp samratth...,"(None, None)",,shop no 01 a geet govind building opp samratth...
21,SHOP NO 2 DADI HOUSEIRLA SOC RD VILE PARLE W,400056,06-02-2025,shop no 2 dadi houseirla soc rd vile parle w,shop no 2 dadi houseirla soc rd vile parle w,"(None, None)",,shop no 2 dadi houseirla soc rd vile parle w


In [10]:
# Keep only rows whose final match is present (drops None/NaN matches)
matched_df = new_df[new_df["final_matched_address"].notna()]

# Persist the matched rows without the index column
matched_df.to_csv("matched_addresses.csv", index=False)

print("Matched addresses saved to 'matched_addresses.csv'")


Matched addresses saved to 'matched_addresses.csv'


In [15]:
import pandas as pd
from rapidfuzz import fuzz, process

# Load the CSV file
df = pd.read_csv("/content/address.csv")  # Ensure it has a column like 'Address'

def find_matching_addresses(search_terms, df, threshold=70):
    """
    Find addresses that fuzzily match at least one of the given search terms.

    Parameters:
    - search_terms: A tuple or list of search terms (e.g., ('Infinity Mall', 'Another Place')).
      A single string is also accepted and treated as one term.
    - df: The DataFrame containing an "Address" column
    - threshold: Minimum ``fuzz.partial_ratio`` score to accept (default: 70)

    Returns:
    - A DataFrame with one "Matched Address" column, in ``df`` row order.
      Each row of ``df`` appears at most once, however many terms it matches.
    """
    if isinstance(search_terms, str):
        # Iterating a bare string would compare character by character
        search_terms = [search_terms]
    # Lowercase each term once instead of once per (term, address) pair
    needles = [str(term).lower() for term in search_terms]

    matches = []
    for address in df["Address"].dropna():  # missing addresses cannot match
        haystack = str(address).lower()
        if any(fuzz.partial_ratio(needle, haystack) >= threshold for needle in needles):
            matches.append(address)
    return pd.DataFrame(matches, columns=["Matched Address"])

# Example usage: search the loaded addresses for several locality phrases at once
search_keyword = ("7 bunglow", "juhuversova", "link rd", "FOUR BUNGLOW")  # tuple of search terms
matched_df = find_matching_addresses(search_keyword, df)

# Save matched addresses to CSV (index column omitted)
matched_df.to_csv("matched_add.csv", index=False)

print(f"Matched addresses saved to 'matched_add.csv'")

Matched addresses saved to 'matched_add.csv'


KEYWORD EXTRACTION

In [25]:
import pandas as pd
import re
from collections import Counter

# Load the master address CSV into `df`
df = pd.read_csv("/content/master.csv")

# Fail fast if the "Address" column the rest of this cell reads is missing
if "Address" not in df.columns:
    raise ValueError("The CSV file must contain a column named 'Address'.")

# **Step 1: Address Preprocessing Function**
def preprocess_address(address):
    """Normalize a raw address string before keyword extraction.

    Steps: lowercase and trim, strip punctuation, collapse whitespace, then
    apply the abbreviation rules below.

    Parameters:
    - address: any value; it is converted with ``str()`` first.

    Returns:
    - The cleaned, lowercase address string.
    """
    text = str(address).lower().strip()
    # Periods/apostrophes belong to a token ("s.v.") and are dropped; every
    # other symbol separates tokens and becomes a space, so "road,juhu" does
    # not fuse into "roadjuhu" and escape the whole-word rules.
    text = re.sub(r"[.']", '', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Abbreviation rules. Keys must be lowercase because the text already is
    # (the original "S V ROAD" key could never match). Phrase rules come
    # first so they run before the single-word rules they contain.
    replacements = {
        "s v road": "sv rd",
        "s v": "sv",  # custom rule: "S V" -> "SV"
        "street": "st", "road": "rd",
        "boulevard": "blvd", "drive": "dr", "lane": "ln",
        "court": "ct", "place": "pl", "square": "sq",
    }

    for word, abbr in replacements.items():
        text = re.sub(rf'\b{word}\b', abbr, text)  # whole-word matching only

    return text

# Clean every non-missing address; rows with a missing Address get NaN on assignment
df["Cleaned_Address"] = df["Address"].dropna().map(preprocess_address)

# **Step 2: Extract Common Keywords Function**
def extract_common_keywords(df, min_frequency=5):
    """Count the words in ``df["Cleaned_Address"]`` and keep the frequent ones.

    Parameters:
    - df: DataFrame with a "Cleaned_Address" column of whitespace-separated text.
    - min_frequency: a word is kept only when it occurs strictly MORE than
      this many times.

    Returns:
    - dict mapping each kept word to its count, in first-seen order.
    """
    # dropna(): rows whose Address was missing carry NaN in Cleaned_Address,
    # and NaN has no .split(), so the original loop raised AttributeError.
    word_counts = Counter(
        word
        for address in df["Cleaned_Address"].dropna()
        for word in str(address).split()
    )
    return {word: count for word, count in word_counts.items() if count > min_frequency}

# Words occurring more than 5 times across the cleaned addresses
common_keywords = extract_common_keywords(df, min_frequency=5)

# Turn the {word: count} mapping into a two-column table
keywords_df = pd.DataFrame(list(common_keywords.items()), columns=["Keyword", "Frequency"])

# Save the table to CSV (index column omitted)
keywords_df.to_csv("common_keywords.csv", index=False)

# **Step 3: Print Results**
# NOTE: the "5" in the message is hard-coded and must be kept in sync with min_frequency above
print("Common Keywords (appearing more than 5 times):")
print(keywords_df)
print("Saved as 'common_keywords.csv'")

Common Keywords (appearing more than 5 times):
          Keyword  Frequency
0             1st         13
1           floor         70
2              rd        272
3            juhu         37
4    mumbaimumbai         12
..            ...        ...
123       subhash          8
124         shyam         14
125         kamal         15
126          shiv          6
127         sagar          8

[128 rows x 2 columns]
Saved as 'common_keywords.csv'


Actual code

In [4]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ------- -------------------------------- 2.1/11.5 MB 11.8 MB/s eta 0:00:01
   ---------------- ----------------------- 4.7/11.5 MB 11.8 MB/s eta 0:00:01
   ------------------------ --------------- 7.1/11.5 MB 11.8 MB/s eta 0:00:01
   -------------------------------- ------- 9.4/11.5 MB 11.8 MB/s eta 0:00:01
   ---------------------------------- ----- 10.0/11.5 MB 9.7 MB/s eta 0:00:01
   ---------------------------------------- 11.5/11.5 MB 9.5 MB/s eta 0:


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:

import pandas as pd

# Load the master address CSV (decoded as latin-1)
file_path = "D:/data for address/master.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Print the schema summary, then show the first rows as the cell's output.
# (Writing `df.info(), df.head()` built a tuple, which rendered as "(None, ...)".)
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Address  349 non-null    object
 1   Pincode  349 non-null    int64 
 2   D_date   349 non-null    object
dtypes: int64(1), object(2)
memory usage: 8.3+ KB


(None,
                                              Address  Pincode      D_date
 0   1ST FLOOR EDENSQUARE 10TH ROAD JUHU MUMBAIMUMBAI   400049  06-02-2025
 1   1ST FLOOR EDENSQUARE 10TH ROAD JUHU MUMBAIMUMBAI   400049  06-02-2025
 2  7 MITHILA SHOPPING CENTRE V MEHTA ROADJVPD SCH...   400049  06-02-2025
 3  7 MITHILA SHOPPING CENTRE V MEHTA ROADJVPD SCH...   400049  06-02-2025
 4  S 1 Ground Flr Maharan CHS DEV Maharan Junctio...   400049  06-02-2025)

In [7]:
# Keep only the first row for each distinct Address. This mutates `df` in place,
# and the original index labels are kept (the index is not reset).
df.drop_duplicates(subset="Address", inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, 0 to 348
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Address  256 non-null    object
 1   Pincode  256 non-null    int64 
 2   D_date   256 non-null    object
dtypes: int64(1), object(2)
memory usage: 8.0+ KB


In [10]:
pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.12.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Downloading rapidfuzz-3.12.1-cp313-cp313-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 10.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.12.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import re
from collections import Counter
from rapidfuzz import fuzz

# Dictionary for common street abbreviations
# Abbreviation rules applied by preprocess_address().
# Keys and values are lowercase with single spaces because the text they are
# matched against has already been lowercased and whitespace-collapsed; the
# original mixed-case / double-spaced keys ("S V", "LINK ROAD", "S  V road")
# could never match. The duplicate "road no" entry was removed.
replacements = {
    # --- phrase rules ---
    "s s road": "sv rd",  # NOTE(review): as authored; confirm "S S road" really means SV road
    "s v road": "sv rd", "m g road": "mg rd", "v p road": "vp rd",
    "jp road": "jp rd", "n s road": "ns rd", "g b road": "gb rd",
    "link road": "lnk rd", "road no": "rd no", "plot no": "plt no",
    "sh no": "shop no", "s no": "sh no",
    "s v": "sv", "m g": "mg",  # handles cases like "S V Road" -> "SV Road"
    # --- single-word rules ---
    "road": "rd", "lane": "ln", "building": "bldg", "society": "soc", "opposite": "opp",
    "station": "st", "number": "no", "railway": "rly",
    "west": "w", "east": "e", "north": "n", "south": "s",
}

def preprocess_address(address):
    """Cleans and normalizes an address.

    Steps: lowercase and trim, strip punctuation, collapse whitespace, then
    apply every rule in ``replacements`` as a whole-word substitution.

    Parameters:
    - address: any value; it is converted with ``str()`` first.

    Returns:
    - The cleaned, lowercase address string.
    """
    text = str(address).lower().strip()
    # Periods/apostrophes belong to a token ("s.v.") and are dropped; every
    # other symbol separates tokens and becomes a space so neighbours do not
    # fuse ("road,andheri" -> "roadandheri").
    text = re.sub(r"[.']", '', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Longest key first, so a phrase rule ("link road") runs before the
    # shorter rules contained in it ("road"). In insertion order, "road" was
    # rewritten to "rd" first and every "... road" phrase rule was dead.
    # sorted() is stable, so equally long keys keep their table order.
    rules = sorted(replacements.items(), key=lambda rule: len(rule[0]), reverse=True)
    for phrase, abbr in rules:
        # Normalize the key defensively (case, spacing) and escape it for regex use
        pattern = r'\b' + r'\s+'.join(re.escape(tok) for tok in phrase.lower().split()) + r'\b'
        text = re.sub(pattern, abbr.lower(), text)

    return text

# Derive the normalized "clean_address" column from the raw Address column
df["clean_address"] = df["Address"].map(preprocess_address)

def extract_common_keywords(df, min_frequency=5):
    """Return the words occurring more than ``min_frequency`` times in ``df["clean_address"]``.

    Missing values are skipped. The result is a list of words in first-seen order.
    """
    tally = Counter()
    for cleaned in df["clean_address"].dropna():
        tally.update(cleaned.split())

    return [token for token, seen in tally.items() if seen > min_frequency]

# Extract frequently used keywords in addresses
common_keywords = extract_common_keywords(df, min_frequency=5)

# Count every word once. The original rebuilt this Counter over the whole
# corpus for each keyword, i.e. (number of keywords) full passes over the data.
word_counts = Counter(
    word for address in df["clean_address"].dropna() for word in address.split()
)

# Convert to DataFrame and save
keywords_df = pd.DataFrame(common_keywords, columns=["Keyword"])
keywords_df["Frequency"] = [word_counts[word] for word in common_keywords]
# Write next to the notebook: the original "/common_keywords.csv" pointed at the
# filesystem root, while this cell reports the file as "common_keywords.csv".
keywords_df.to_csv("common_keywords.csv", index=False)

# Fuzzy keyword search over the cleaned addresses
def find_matching_addresses(search_term, df, threshold=70):
    """Return the cleaned addresses whose partial-ratio score against
    ``search_term`` (lowercased) is at least ``threshold``.

    The result is a DataFrame with a single "Matched Address" column.
    """
    hits = [
        cleaned
        for cleaned in df["clean_address"]
        if fuzz.partial_ratio(search_term.lower(), cleaned) >= threshold
    ]
    return pd.DataFrame(hits, columns=["Matched Address"])

# Example: search the cleaned addresses for the abbreviated form "sv rd"
search_keyword = "sv rd"  # Example keyword
matched_df = find_matching_addresses(search_keyword, df)

# Save the matches (index column omitted)
matched_df.to_csv("matched_addresses.csv", index=False)

# Last expression of the cell: a tuple of two file names, shown as the cell output
("common_keywords.csv", "matched_addresses.csv")


('common_keywords.csv', 'matched_addresses.csv')

In [None]:
import pandas as pd
from rapidfuzz import fuzz, process

# Load the CSV file
df = pd.read_csv("D:/data for address/master.csv")  # Ensure it has a column 'Address'

# **Step 1: Extract Unique Words from Addresses**
def extract_unique_keywords(df):
    """
    Extract unique words from the address column to create a keyword set.

    Parameters:
    - df: DataFrame with an "Address" column.

    Returns:
    - A set of the lowercase, whitespace-separated tokens found in the
      non-missing addresses.
    """
    unique_words = set()
    # dropna(): a missing address would otherwise be stringified by str()
    # and leak into the keyword set as a bogus word.
    for address in df["Address"].dropna():
        unique_words.update(str(address).lower().split())
    return unique_words

# **Step 2: Find Matching Addresses**
def find_matching_addresses(search_term, df, threshold=70):
    """
    Collect the addresses that fuzzily contain a search term.

    Parameters:
    - search_term: The word or phrase to look for (compared in lowercase)
    - df: The DataFrame holding an "Address" column
    - threshold: Minimum partial-ratio score to accept (default: 70)

    Returns:
    - A DataFrame with one "Matched Address" column holding the original
      address values that scored at or above the threshold
    """
    def scores_high_enough(candidate):
        # Both sides are lowercased; the candidate is stringified first
        return fuzz.partial_ratio(search_term.lower(), str(candidate).lower()) >= threshold

    hits = [candidate for candidate in df["Address"] if scores_high_enough(candidate)]
    return pd.DataFrame(hits, columns=["Matched Address"])

# **Step 3: Generate Keyword List Dynamically**
# Prints the full keyword set (can be long for a large address file)
keyword_list = extract_unique_keywords(df)
print("Extracted Keywords:", keyword_list)

# **Step 4: Search for Related Addresses Dynamically**
# input() blocks until the user types a keyword, so this cell is interactive
search_keyword = input("Enter search keyword: ")  # User-defined input
matched_df = find_matching_addresses(search_keyword, df)

# **Step 5: Save matched addresses to CSV**
matched_df.to_csv("matched_addresses.csv", index=False)
print(f"Matched addresses saved to 'matched_addresses.csv'")
# Read the file back and preview the first 10 rows as the cell output
ma = pd.read_csv("matched_addresses.csv")
ma.head(10)

Extracted Keywords: {'6', 'bhandar', '8882378', 'vinayak', 'dn', 'stn', 'royal', 'kamabali', 'plam', 'gate', 'parkopp', '28', 'bhagwan', 'wine', 'wellington', 'music', '105', 'city', 'petit', 'kunkuwadi', '39a', 'k', 'chslvp', 'subhash', 'gharunit', 'sharda', 'thackary', 'shyamkamal', 'sunswept', 'exchange', 'mumflat', 'prale', 'pvt', 'shyam', 'gaon', 'wingabhishek', 'mumbaicity', '101112', 'newlink', '60', 'csh', 'taximen', 'aram', 'gawde', '26', 'premises', 'bank', 'govind', 'office', 'sk', 'ai', 'saurashtra', 'complex', '52', 'majidiria', 'nr', 'alankar', 'hospitalhill', 'hru', 'ruia', 'heights', 'roadand', 'sulta', 'bridge', 'co', 'shon', 'cottage', 'officesantacruz', 'jitendra', '15', 'oshiwara', '09', 'desai', 'm', 'agarwal', 'rshop', 'shoping', 'flrbldg', 'cest', 'niw', 'thakarshi', '25c', 'unit', 'shri', 'indulkarchawl', '12', 'op', 'stall', 'jp', 'rly', '106', 'chslokhandwala', 'mehata', '15b', '4000522', 'wnr', 'poddar', 'station', 'bhawan', 'chowk', 'jogeshwari', 'varsova', 

In [None]:
import pandas as pd
import re
from rapidfuzz import fuzz

# Load the CSV file
df = pd.read_csv("/content/master.csv")
# **Step 1: Extract Unique Keywords (Ignoring Single Characters)**
def extract_unique_keywords(df):
    """
    Build the sorted vocabulary of alphabetic words (2+ letters) found in the
    non-missing values of ``df["Address"]``, compared in lowercase.
    """
    token_pattern = re.compile(r'\b[a-zA-Z]{2,}\b')
    vocabulary = {
        token
        for address in df["Address"].dropna()
        for token in token_pattern.findall(address.lower())
    }
    return sorted(vocabulary)

# **Step 2: Find Matching Addresses**
def find_matching_addresses(search_term, df, threshold=70):
    """
    Return the non-missing addresses whose lowercase partial-ratio score
    against ``search_term`` reaches ``threshold``, as a one-column DataFrame.
    """
    candidates = df["Address"].dropna()
    hits = [
        candidate
        for candidate in candidates
        if fuzz.partial_ratio(search_term.lower(), candidate.lower()) >= threshold
    ]
    return pd.DataFrame(hits, columns=["Matched Address"])

# **Step 3: Extract Keyword List**
# Prints the full sorted keyword list (can be long for a large address file)
keyword_list = extract_unique_keywords(df)
print("Extracted Keywords:", keyword_list)

# **Step 4: Search for Related Addresses Dynamically**
# input() blocks until the user types a keyword, so this cell is interactive
search_keyword = input("Enter search keyword: ")  # User-defined input
matched_df = find_matching_addresses(search_keyword, df)

# **Step 5: Save Matched Addresses to CSV**
matched_df.to_csv("matched_addresses.csv", index=False)
print(f"Matched addresses saved to 'matched_addresses.csv'")


# Read the file back and preview the first 10 rows as the cell output
ma = pd.read_csv("matched_addresses.csv")
ma.head(10)

Extracted Keywords: ['aadarsh', 'abdul', 'above', 'acre', 'aditya', 'agarwal', 'agarwarl', 'agrawal', 'ai', 'airtel', 'akshay', 'alankar', 'alpha', 'ambedkar', 'ameya', 'amrut', 'anand', 'anant', 'and', 'andhari', 'andheri', 'andrestaurant', 'apartment', 'apartmentnew', 'apna', 'apprtment', 'apt', 'apts', 'aracde', 'aram', 'arcade', 'area', 'arif', 'arun', 'aruneshwar', 'as', 'ashirwad', 'associationground', 'ast', 'avenud', 'azad', 'babasaheb', 'badrunisa', 'bahar', 'bahndar', 'balaji', 'bandra', 'bank', 'bar', 'baroda', 'baroj', 'basment', 'bata', 'bazar', 'behind', 'belle', 'below', 'benchmark', 'beside', 'beufrayes', 'bhagirathi', 'bhagwan', 'bhandar', 'bharat', 'bhavan', 'bhawan', 'bhuvan', 'bhuwan', 'bld', 'bldg', 'block', 'blue', 'bmc', 'boi', 'branch', 'bridge', 'bros', 'building', 'bungalow', 'bungalows', 'bunglow', 'bunglows', 'bus', 'business', 'campus', 'cci', 'center', 'centre', 'centreoff', 'centrepremise', 'centres', 'cest', 'chandrabai', 'chapel', 'chauk', 'chawl', 'cha

Unnamed: 0,Matched Address


In [18]:
import pandas as pd
import re
from collections import Counter
from rapidfuzz import fuzz

# Load the master address CSV into `df`
df = pd.read_csv("/content/master.csv")

# Fail fast if the "Address" column used by every step below is missing
if "Address" not in df.columns:
    raise ValueError("The CSV file must contain an 'Address' column.")

# **Step 1: Extract Unique Keywords (Avoiding Single Characters)**
def extract_unique_keywords(df):
    """
    Collect every distinct alphabetic word of two or more letters from the
    non-missing addresses (lowercased) and return them as a sorted list.
    """
    seen = set()
    for raw_address in df["Address"].dropna():
        lowered = raw_address.lower()
        seen |= set(re.findall(r'\b[a-zA-Z]{2,}\b', lowered))
    return sorted(seen)

# **Step 2: Extract Common Keywords**
def extract_common_keywords(df, min_frequency=2):
    """
    Tabulate the words of the Address column that occur more than
    ``min_frequency`` times.

    Everything except letters and whitespace is stripped from each address
    before it is lowercased and split. The result is a DataFrame with
    "Keyword" and "Frequency" columns, ordered by descending frequency.
    """
    tally = Counter()
    for raw_address in df["Address"].dropna():
        letters_only = re.sub(r'[^a-zA-Z\s]', '', raw_address)
        tally.update(letters_only.lower().split())

    frequent = {token: seen for token, seen in tally.items() if seen > min_frequency}

    table = pd.DataFrame(frequent.items(), columns=["Keyword", "Frequency"])
    return table.sort_values(by="Frequency", ascending=False)

# **Step 3: Find Matching Addresses**
def find_matching_addresses(search_term, df, threshold=70):
    """
    Scan the non-missing addresses and keep those whose lowercase
    partial-ratio score against ``search_term`` is at least ``threshold``.
    Returns a DataFrame with a single "Matched Address" column.
    """
    kept = []
    for candidate in df["Address"].dropna():
        if fuzz.partial_ratio(search_term.lower(), candidate.lower()) < threshold:
            continue
        kept.append(candidate)
    return pd.DataFrame(kept, columns=["Matched Address"])

# **Step 4: Process Keywords & Save**
# unique_keywords is computed here but not used again in this cell
unique_keywords = extract_unique_keywords(df)
common_keywords_df = extract_common_keywords(df, min_frequency=5)

# Save Common Keywords to CSV (index column omitted)
common_keywords_df.to_csv("common_keywords.csv", index=False)
print("Common Keywords saved as 'common_keywords.csv'.")

# **Step 5: User Input for Address Search**
# input() blocks until the user types a keyword, so this cell is interactive
search_keyword = input("Enter search keyword: ")  # User-defined input
matched_df = find_matching_addresses(search_keyword, df)

# **Step 6: Save Matched Addresses to CSV**
matched_df.to_csv("matched_addresses.csv", index=False)
print(f"Matched addresses saved to 'matched_addresses.csv'.")

# **Step 7: Preview the first 20 matches as the cell output**

matched_df.head(20)


Common Keywords saved as 'common_keywords.csv'.
Enter search keyword: ville parle market
Matched addresses saved to 'matched_addresses.csv'.


Unnamed: 0,Matched Address
0,FLAT NO 2 1ST AND 2ND FLOOR SAHAKAR 34 NUTAN L...
1,12 MITHILA NEAR SAHKARI BHANDAR V MROAD JUHU V...
2,14 BADRUNISA ABDUL MAJIDIRIA SOC RD VILE PARLE W
3,401 COSMOS BUILDING ABOVE MILLENNIUMTOYOTA SHO...
4,SHOP NO 2 DADI HOUSEIRLA SOC RD VILE PARLE W
5,SHOP NO 3 EMPIRE HOUSE SV ROADIRLA VILE PARLE ...
6,SHOP NO 9 ARIF MANSION IRLA SOCIETY ROAD OPP P...
7,B 103 And C 101 1ST FLOOR KAILASH CHS JUHU CHU...
8,S NO 12 JAIN SANTOSHI MATA NGR N S RD S JVPD S...
9,133 S V ROAD IRLA VILLE PARLE MUMBAI CITY


In [23]:
import re
import pandas as pd
from collections import Counter
from rapidfuzz import fuzz

# Load the master address CSV into `df`, decoded as latin-1
df = pd.read_csv("/content/master.csv", encoding='latin-1')  # Adjust path if needed

# Fail fast if the "Address" column used by every step below is missing
if "Address" not in df.columns:
    raise ValueError("The CSV file must contain an 'Address' column.")

# **Step 1: Define Common Street Abbreviations**
# Keys and values are lowercase with single spaces because the text they are
# matched against has already been lowercased and whitespace-collapsed; the
# original mixed-case / double-spaced keys ("S V", "LINK ROAD", "S  V road")
# could never match.
replacements = {
    # --- phrase rules ---
    "s s road": "sv rd",  # NOTE(review): as authored; confirm "S S road" really means SV road
    "s v road": "sv rd", "m g road": "mg rd", "v p road": "vp rd",
    "jp road": "jp rd", "n s road": "ns rd", "g b road": "gb rd",
    "link road": "lnk rd", "road no": "rd no", "plot no": "plt no",
    "sh no": "shop no", "s no": "sh no",
    "s v": "sv", "m g": "mg",
    # --- single-word rules ---
    "road": "rd", "lane": "ln", "building": "bldg", "society": "soc", "opposite": "opp",
    "station": "st", "number": "no", "railway": "rly",
    "west": "w", "east": "e", "north": "n", "south": "s",
}

# **Step 2: Preprocess Addresses**
def preprocess_address(address):
    """Cleans and normalizes an address by removing special characters and applying replacements.

    Steps: lowercase and trim, strip punctuation, collapse whitespace, then
    apply every rule in ``replacements`` as a whole-word substitution.

    Parameters:
    - address: any value; it is converted with ``str()`` first.

    Returns:
    - The cleaned, lowercase address string.
    """
    text = str(address).lower().strip()
    # Periods/apostrophes belong to a token ("s.v.") and are dropped; every
    # other symbol separates tokens and becomes a space so neighbours do not
    # fuse ("road,andheri" -> "roadandheri").
    text = re.sub(r"[.']", '', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Longest key first, so a phrase rule ("link road") runs before the
    # shorter rules contained in it ("road"). In insertion order, "road" was
    # rewritten to "rd" first and every "... road" phrase rule was dead.
    # sorted() is stable, so equally long keys keep their table order.
    rules = sorted(replacements.items(), key=lambda rule: len(rule[0]), reverse=True)
    for phrase, abbr in rules:
        # Normalize the key defensively (case, spacing) and escape it for regex use
        pattern = r'\b' + r'\s+'.join(re.escape(tok) for tok in phrase.lower().split()) + r'\b'
        text = re.sub(pattern, abbr.lower(), text)
    return text

# Derive the normalized "clean_address" column from the raw Address column
df["clean_address"] = df["Address"].map(preprocess_address)

# **Step 3: Extract Unique Keywords**
def extract_unique_keywords(df):
    """Return the sorted list of distinct whitespace-separated tokens found in
    the non-missing values of ``df["clean_address"]``."""
    vocabulary = {
        token
        for cleaned in df["clean_address"].dropna()
        for token in cleaned.split()
    }
    return sorted(vocabulary)

# **Step 4: Extract Common Keywords**
def extract_common_keywords(df, min_frequency=5):
    """Tabulate the tokens of ``df["clean_address"]`` that occur more than
    ``min_frequency`` times.

    Returns a DataFrame with "Keyword" and "Frequency" columns, ordered by
    descending frequency. Missing addresses are skipped.
    """
    tally = Counter()
    for cleaned in df["clean_address"].dropna():
        tally.update(cleaned.split())

    frequent = {token: seen for token, seen in tally.items() if seen > min_frequency}

    table = pd.DataFrame(frequent.items(), columns=["Keyword", "Frequency"])
    return table.sort_values(by="Frequency", ascending=False)

# **Step 5: Find Matching Addresses Using Fuzzy Matching**
def find_matching_addresses(search_term, df, threshold=70):
    """Return the cleaned addresses whose partial-ratio score against the
    lowercased ``search_term`` is at least ``threshold``.

    The result is a DataFrame with a single "Matched Address" column.
    """
    kept = []
    for cleaned in df["clean_address"]:
        if fuzz.partial_ratio(search_term.lower(), cleaned) < threshold:
            continue
        kept.append(cleaned)
    return pd.DataFrame(kept, columns=["Matched Address"])

# **Step 6: Process Keywords & Save**
# unique_keywords is computed here but not used again in this cell
unique_keywords = extract_unique_keywords(df)
common_keywords_df = extract_common_keywords(df, min_frequency=5)

# Save Common Keywords to CSV (index column omitted)
common_keywords_df.to_csv("common_keywords.csv", index=False)
print("Common Keywords saved as 'common_keywords.csv'.")

# **Step 7: User Input for Address Search**
# input() blocks until the user types a keyword, so this cell is interactive
search_keyword = input("Enter search keyword: ")  # User-defined input
matched_df = find_matching_addresses(search_keyword, df)

# **Step 8: Save Matched Addresses to CSV**
matched_df.to_csv("matched_addresses.csv", index=False)
print(f"Matched addresses saved to 'matched_addresses.csv'.")


# Preview the first 20 matches as the cell output
matched_df.head(20)



Common Keywords saved as 'common_keywords.csv'.
Enter search keyword: lokhandwala
Matched addresses saved to 'matched_addresses.csv'.


Unnamed: 0,Matched Address
0,159 reaissance bldg next to sanghavi hospitall...
1,159 reaissance bldg next to sanghavi hospitall...
2,159 reaissance bldg next to sanghavi hospitall...
3,159 reaissance bldg next to sanghavi hospitall...
4,159 reaissance bldg next to sanghavi hospitall...
5,link rdphase d oshiwara infiniti mall lokhandw...
6,lokhandwala complex shop no 7 prema bldg s s r...
7,shop no 5 plam spring lokhandwala market lokh...
8,shop no 6 ground floor sunswept chslokhandwala...
9,shop 6 sunny side 355 rd 4bunglow lokhandwala ...
