In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

# Sample records to deduplicate: the first three are spelling variants of
# one another; the fourth differs in both name and address.
data = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['John Smith', 'Jon Smith', 'Johnny Smith', 'Alice Johnson'],
    'address': ['123 Main St', '123 Main Street', '123 Main St.', '456 Elm St'],
})

# Combine the fields into a single string per record so that one text
# comparison covers both name and address.
data['combined'] = data['name'] + ' ' + data['address']

# Convert text to TF-IDF features.
# NOTE: despite its name, `vectorizer` holds the sparse TF-IDF matrix returned
# by fit_transform(), not the TfidfVectorizer object itself. The name is kept
# so that any later code referring to it continues to work.
vectorizer = TfidfVectorizer().fit_transform(data['combined'])
tfidf_matrix = vectorizer.toarray()

# Pairwise cosine similarity between every pair of records (n x n).
similarity_matrix = cosine_similarity(tfidf_matrix)

# DBSCAN with metric='precomputed' expects distances, so convert with
# 1 - similarity. Floating-point rounding can leave a similarity marginally
# above 1.0, which would make the distance slightly negative; scikit-learn
# rejects negative values in a precomputed distance matrix, so clamp at 0.
distance_matrix = (1 - similarity_matrix).clip(min=0)

# Use DBSCAN for clustering similar records: records within a cosine distance
# of eps are neighbours, and min_samples=2 lets a single matching pair form a
# cluster. Records that belong to no cluster are labelled -1.
clustering = DBSCAN(eps=0.5, min_samples=2, metric='precomputed')
labels = clustering.fit_predict(distance_matrix)

# Attach cluster labels so each record shows which duplicate group it is in.
data['cluster'] = labels

print(data)