In [1]:
import pandas as pd
import numpy as np

In [15]:
import re
from collections import Counter

# Load the raw address records; later cells read the 'text' column of this frame.
data = pd.read_csv("onlytext.csv")

# Lower-case address abbreviations mapped to their expanded forms.
# preprocess_address matches each key as a whole word in the lower-cased address.
abbreviations = dict(
    # Street-type suffixes
    st="street",
    rd="road",
    ave="avenue",
    blvd="boulevard",
    dr="drive",
    ln="lane",
    hwy="highway",
    rt="route",
    pkwy="parkway",
    ct="court",
    # Compass directions
    s="south",
    e="east",
    w="west",
    n="north",
)

def preprocess_address(address, abbreviations):
    """Normalize an address string so that two spellings can be compared.

    The address is lower-cased, every whole-word occurrence of a key in
    ``abbreviations`` is replaced by its expansion, and runs of whitespace
    are collapsed to single spaces with leading/trailing whitespace removed.
    Punctuation is left untouched.

    Parameters
    ----------
    address : str or None or float
        The raw address. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as a missing address.
    abbreviations : dict[str, str]
        Lower-case abbreviation -> expansion. Keys are matched against the
        lower-cased address, so keys containing upper-case letters never match.

    Returns
    -------
    str
        The normalized address, or ``""`` for a missing address.
    """
    # NaN is the only float that is not equal to itself.
    if address is None or (isinstance(address, float) and address != address):
        return ""

    address = address.lower()

    if abbreviations:
        # One alternation regex expands every abbreviation in a single pass:
        # each word is rewritten at most once (an expansion can never be
        # re-expanded by another key) and the text is scanned once instead of
        # once per abbreviation. Longest keys go first so a short key cannot
        # shadow a longer one that starts with the same characters.
        keys = sorted(abbreviations, key=len, reverse=True)
        pattern = r'\b(?:' + '|'.join(re.escape(key) for key in keys) + r')\b'
        address = re.sub(pattern, lambda match: abbreviations[match.group(0)], address)

    address = re.sub(r'\s+', ' ', address).strip()
    return address
# Build the normalized 'text_processed' column from the raw 'text' column;
# `args` forwards the abbreviation table as the second positional argument.
data['text_processed'] = data['text'].apply(preprocess_address, args=(abbreviations,))

In [16]:
# Preview the first rows to compare 'text' with the new 'text_processed' column.
data.head()

Unnamed: 0,text,text_processed
0,777 Brockton Avenue Abington MA 2351,777 brockton avenue abington ma 2351
1,30 Memorial Drive Avon MA 2322,30 memorial drive avon ma 2322
2,250 Hartford Avenue Bellingham MA 2019,250 hartford avenue bellingham ma 2019
3,700 Oak Street Brockton MA 2301,700 oak street brockton ma 2301
4,66-4 Parkhurst Rd Chelmsford MA 1824,66-4 parkhurst road chelmsford ma 1824


In [17]:
# Export only the processed column; index=False leaves the row index out of the file.
data['text_processed'].to_csv('output.csv',index = False)

In [18]:
# Read the exported file back in.
outputdata = pd.read_csv("output.csv")

In [19]:
# Preview the reloaded data to check the CSV round trip.
outputdata.head()

Unnamed: 0,text_processed
0,777 brockton avenue abington ma 2351
1,30 memorial drive avon ma 2322
2,250 hartford avenue bellingham ma 2019
3,700 oak street brockton ma 2301
4,66-4 parkhurst road chelmsford ma 1824


In [21]:
pip install fuzzywuzzy

Collecting fuzzywuzzyNote: you may need to restart the kernel to use updated packages.

  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [24]:
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _address_tokens(address):
    """Split a normalized address on whitespace, trimming punctuation stuck to each word."""
    trimmed = (word.strip(".,;:") for word in address.split())
    return [word for word in trimmed if word]


def extract_features(addr1, addr2):
    """Compute similarity features for a pair of addresses.

    Both addresses are first normalized with ``preprocess_address`` using the
    notebook-level ``abbreviations`` table.

    Parameters
    ----------
    addr1, addr2 : str
        The raw addresses to compare.

    Returns
    -------
    list
        ``[fuzzy_score, cosine_score, jaccard_score, zip_match]`` where

        * ``fuzzy_score``   -- ``fuzz.ratio`` of the normalized strings, scaled to 0..1;
        * ``cosine_score``  -- cosine similarity of TF-IDF vectors fitted on the pair
          (0.0 when no vocabulary can be built);
        * ``jaccard_score`` -- |A & B| / |A | B| over the sets of word tokens
          (0.0 when both addresses have no tokens);
        * ``zip_match``     -- 1 if both addresses end in the same token and that
          token contains a digit, else 0.
    """
    addr1 = preprocess_address(addr1, abbreviations)
    addr2 = preprocess_address(addr2, abbreviations)

    fuzzy_score = fuzz.ratio(addr1, addr2) / 100

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([addr1, addr2])
    except ValueError:
        # Raised by TfidfVectorizer when neither string yields a token
        # ("empty vocabulary"), e.g. for empty addresses.
        cosine_score = 0.0
    else:
        cosine_score = cosine_similarity(tfidf_matrix)[0, 1]

    # Trim punctuation so "avenue," and "avenue" count as the same token when
    # one address is comma-separated and the other is not.
    tokens1 = _address_tokens(addr1)
    tokens2 = _address_tokens(addr2)

    set1, set2 = set(tokens1), set(tokens2)
    union = set1 | set2
    # Guard the empty/empty case, which would otherwise divide by zero.
    jaccard_score = len(set1 & set2) / len(union) if union else 0.0

    # Only call it a ZIP match when the shared last token looks like a postal
    # code (contains a digit); two addresses that merely end in the same word
    # such as "uk" carry no postal code. Length is not checked because ZIPs in
    # the source data can lose their leading zero (e.g. "2351").
    zip_match = 0
    if tokens1 and tokens2:
        last1, last2 = tokens1[-1], tokens2[-1]
        if last1 == last2 and any(ch.isdigit() for ch in last1):
            zip_match = 1

    return [fuzzy_score, cosine_score, jaccard_score, zip_match]


## Try a few example address pairs to test extract_features.

In [28]:
# Baseline pair: the addresses differ only in "Avenue" vs "Ave".
addr1 = "777 Brockton Avenue, Abington, MA 2351"
addr2 = "777 Brockton Ave, Abington, MA 2351"

# features is [fuzzy_score, cosine_score, jaccard_score, zip_match]
features = extract_features(addr1, addr2)

# Display the extracted features
print("Features:")
print(f"Fuzzy Score: {features[0]:.2f}")
print(f"Cosine Similarity: {features[1]:.2f}")
print(f"Jaccard Similarity: {features[2]:.2f}")
print(f"ZIP Code Match: {features[3]}")


Features:
Fuzzy Score: 1.00
Cosine Similarity: 1.00
Jaccard Similarity: 1.00
ZIP Code Match: 1


In [29]:
# Example 1 -- spelling variations: addr2 misspells "Elm" as "Elmm",
# abbreviates "Street" and spells out the state name.
addr1 = "123 Elm Street, Springfield, IL 62701"
addr2 = "123 Elmm St, Springfield, Illinois 62701"

# features is [fuzzy_score, cosine_score, jaccard_score, zip_match]
features = extract_features(addr1, addr2)
print("Example 1 Features:")
print(f"Fuzzy Score: {features[0]:.2f}")
print(f"Cosine Similarity: {features[1]:.2f}")
print(f"Jaccard Similarity: {features[2]:.2f}")
print(f"ZIP Code Match: {features[3]}")

Example 1 Features:
Fuzzy Score: 0.91
Cosine Similarity: 0.50
Jaccard Similarity: 0.50
ZIP Code Match: 1


In [30]:
# Example 2 -- missing components: addr2 omits the city ("Dayton")
# and abbreviates "Avenue".
addr1 = "456 Maple Avenue, Dayton, OH 45402"
addr2 = "456 Maple Ave, OH 45402"

# features is [fuzzy_score, cosine_score, jaccard_score, zip_match]
features = extract_features(addr1, addr2)
print("Example 2 Features:")
print(f"Fuzzy Score: {features[0]:.2f}")
print(f"Cosine Similarity: {features[1]:.2f}")
print(f"Jaccard Similarity: {features[2]:.2f}")
print(f"ZIP Code Match: {features[3]}")

Example 2 Features:
Fuzzy Score: 0.87
Cosine Similarity: 0.85
Jaccard Similarity: 0.83
ZIP Code Match: 1


In [31]:
# Example 3 -- different ZIP codes (32801 vs 32810); addr2 also abbreviates
# "Lane" and spells out the state name.
addr1 = "789 Pine Lane, Orlando, FL 32801"
addr2 = "789 Pine Ln, Orlando, Florida 32810"

# features is [fuzzy_score, cosine_score, jaccard_score, zip_match]
features = extract_features(addr1, addr2)
print("Example 3 Features:")
print(f"Fuzzy Score: {features[0]:.2f}")
print(f"Cosine Similarity: {features[1]:.2f}")
print(f"Jaccard Similarity: {features[2]:.2f}")
print(f"ZIP Code Match: {features[3]}")

Example 3 Features:
Fuzzy Score: 0.90
Cosine Similarity: 0.50
Jaccard Similarity: 0.50
ZIP Code Match: 0


In [32]:
# Example 4 -- abbreviations and extra noise: "St" vs "Street" and
# "UK" vs "United Kingdom"; neither address contains a postal code.
addr1 = "10 Downing St, London, UK"
addr2 = "10 Downing Street, London, United Kingdom"

# features is [fuzzy_score, cosine_score, jaccard_score, zip_match]
features = extract_features(addr1, addr2)
print("Example 4 Features:")
print(f"Fuzzy Score: {features[0]:.2f}")
print(f"Cosine Similarity: {features[1]:.2f}")
print(f"Jaccard Similarity: {features[2]:.2f}")
print(f"ZIP Code Match: {features[3]}")

Example 4 Features:
Fuzzy Score: 0.83
Cosine Similarity: 0.58
Jaccard Similarity: 0.57
ZIP Code Match: 0
