In [3]:
import spacy
from transformers import BertTokenizer, BertForTokenClassification

# Load the BERT NER model
model_name = "bert-base-cased"  # Replace with the correct pre-trained BERT model name
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Custom Transformer model class for BERT NER
class CustomTransformerModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, doc):
        inputs = self.tokenizer(doc.text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        doc.tensor = outputs.last_hidden_state
        return doc

# Register the custom BERT NER model within spaCy using the @Language.component decorator
@spacy.Language.component("bert")
def create_bert_model(doc):
    return CustomTransformerModel(model, tokenizer)(doc)

nlp_bert = spacy.blank("en")

def extract_entities_bert(text):
    # Process the text with the BERT NER model
    doc = nlp_bert(text)

    # Extract entities related to locations, prices, BHK count, house/flat references, and sqft
    location_entities = [ent.text for ent in doc.ents if ent.label_ == "LOC"]
    price_entities = [ent.text for ent in doc.ents if ent.label_ == "MONEY"]
    bhk_entities = [ent.text for ent in doc.ents if ent.label_ == "CARDINAL" and "BHK" in ent.text]
    house_entities = [ent.text for ent in doc.ents if ent.text.lower() == "house"]
    flat_entities = [ent.text for ent in doc.ents if ent.text.lower() == "flat"]
    sqft_entities = [ent.text for ent in doc.ents if ent.label_ == "QUANTITY" and "sqft" in ent.text.lower()]

    return location_entities, price_entities, bhk_entities, house_entities, flat_entities, sqft_entities

# Example text containing information about locations, prices, BHK count, etc.
example_text = "Check out this 2 BHK Apartment for sale in Tambaram, Chennai. This property is posted by owner and thus there is no need to pay any broker amount. This 2 BHK Apartment is perfect for a modern-day lifestyle. Tambaram is a promising location in Chennai and this is one of the finest properties in the area. Buy this Apartment for sale now. It is located on floor 0. The total number of floors in this project is 2. The property's price is Rs 42.0 L. Residents in this property pay Rs 500 towards maintenance. This property is a modern-day abode, with 840 Square feet built-up area. The unit has 2 bedro"

# Extract entities using the BERT NER model
location_entities, price_entities, bhk_entities, house_entities, flat_entities, sqft_entities = extract_entities_bert(example_text)

print("Location Entities:", location_entities)
print("Price Entities:", price_entities)
print("BHK Entities:", bhk_entities)
print("House Entities:", house_entities)
print("Flat Entities:", flat_entities)
print("sqft Entities:", sqft_entities)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Location Entities: []
Price Entities: []
BHK Entities: []
House Entities: []
Flat Entities: []
sqft Entities: []


In [8]:
# !pip install scispacy

In [9]:
# ! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [4]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")


Downloading (…)okenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

[{'entity_group': 'Sign_symptom',
  'score': 0.9999311,
  'word': 'pal',
  'start': 38,
  'end': 41},
 {'entity_group': 'Sign_symptom',
  'score': 0.90633166,
  'word': '##pitations',
  'start': 41,
  'end': 50},
 {'entity_group': 'Clinical_event',
  'score': 0.99975544,
  'word': 'follow',
  'start': 54,
  'end': 60},
 {'entity_group': 'Date',
  'score': 0.999867,
  'word': '6 months after',
  'start': 64,
  'end': 78}]

In [7]:
from transformers import pipeline

# Load the biomedical NER model using the pipeline
pipe = pipeline("ner", model="d4data/biomedical-ner-all", tokenizer="d4data/biomedical-ner-all", aggregation_strategy="simple")

# Define the text you want to analyze
text = """Check out this 2 BHK Apartment for sale in Tambaram, Chennai. This property is posted by owner and thus there is no need to pay any broker amount. This 2 BHK Apartment is perfect for a modern-day lifestyle. Tambaram is a promising location in Chennai and this is one of the finest properties in the area. Buy this Apartment for sale now. It is located on floor 0. The total number of floors in this project is 2. The property's price is Rs 42.0 L. Residents in this property pay Rs 500 towards maintenance. This property is a modern-day abode, with 840 Square feet built-up area. The unit has 2 bedro"""

# Extract entities using the biomedical NER model
results = pipe(text)

# Initialize lists to store entity texts for each category
location_entities = []
price_entities = []
bhk_entities = []
house_entities = []
flat_entities = []
sqft_entities = []

# Iterate through the results and categorize entities based on their types
for ent in results:
    entity_type = ent.get("entity", None)
    if entity_type == "LOC":
        location_entities.append(ent["word"])
    elif entity_type == "MONEY":
        price_entities.append(ent["word"])
    elif "BHK" in ent["word"]:
        bhk_entities.append(ent["word"])
    elif "house" in ent["word"].lower():
        house_entities.append(ent["word"])
    elif "flat" in ent["word"].lower():
        flat_entities.append(ent["word"])
    elif "sqft" in ent["word"].lower():
        sqft_entities.append(ent["word"])

# Print the extracted entities
print("Location Entities:", location_entities)
print("Price Entities:", price_entities)
print("BHK Entities:", bhk_entities)
print("House Entities:", house_entities)
print("Flat Entities:", flat_entities)
print("sqft Entities:", sqft_entities)


Location Entities: []
Price Entities: []
BHK Entities: []
House Entities: []
Flat Entities: []
sqft Entities: []


In [13]:
# import torch
# from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments

# # Load the tokenizer and model
# model_name = "bert-base-uncased"  # You can choose a different model if needed
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# # Load and prepare the dataset (tokenize, convert to input tensors)
# def prepare_dataset(sentences, labels, tokenizer):
#     tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
#     labels = [[label2id[label] for label in sentence_labels] for sentence_labels in labels]
#     labels = torch.tensor(labels)
#     return tokenized_inputs, labels

# # Sample data (replace this with your own dataset)
# sentences = ["This is a nice house in a good location.",
#              "A 2 BHK flat is available for a reasonable price."]
# labels = [["O", "O", "O", "B-HOUSE", "O", "O", "O", "B-LOCATION", "O"],
#           ["O", "B-BHK", "I-BHK", "I-BHK", "O", "O", "O", "B-PRICE"]]

# # Define the label-to-id mapping for NER tags
# label_list = ["O", "B-HOUSE", "B-LOCATION", "B-BHK", "I-BHK", "B-PRICE"]
# label2id = {label: i for i, label in enumerate(label_list)}
# num_labels = len(label_list)

# # Prepare the dataset
# train_inputs, train_labels = prepare_dataset(sentences, labels, tokenizer)

# # Fine-tune the model
# training_args = TrainingArguments(
#     output_dir="./ner_model",
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     save_steps=1000,
#     save_total_limit=2,
# )
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
# )
# trainer.train()

# # Save the fine-tuned model for later use
# model.save_pretrained("./ner_model")
# tokenizer.save_pretrained("./ner_model")

In [16]:
# from transformers import BertTokenizer, BertForTokenClassification, pipeline

# # Load the tokenizer and model
# tokenizer = BertTokenizer.from_pretrained("./ner_model")
# model = BertForTokenClassification.from_pretrained("./ner_model")

# # Create a NER pipeline
# ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# # Sample input text
# text = "I'm looking for a 3 BHK flat in a central location with a budget of $200,000."

# # Perform NER on the input text
# entities = ner_pipeline(text)

# # Print the detected entities
# for entity in entities:
#     print(entity)
!python -m spacy download en_core_web_trf

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl#egg=en_core_web_trf==3.0.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting en-core-web-trf==3.0.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl (459.7 MB)
Collecting spacy-transformers<1.1.0,>=1.0.0rc4 (from en-core-web-trf==3.0.0)
  Using cached spacy_transformers-1.0.6-py2.py3-none-any.whl (42 kB)
INFO: pip is looking at multiple versions of spacy-transformers to determine which version is compatible with other requirements. This could take a while.
  Using cached spa

Collecting huggingface-hub==0.0.12 (from transformers<4.10.0,>=3.4.0->spacy-transformers<1.1.0,>=1.0.0rc4->en-core-web-trf==3.0.0)
  Using cached huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1 (from transformers<4.10.0,>=3.4.0->spacy-transformers<1.1.0,>=1.0.0rc4->en-core-web-trf==3.0.0)
  Using cached tokenizers-0.10.3.tar.gz (212 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tokenizers
  Building wheel for tokenizers (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tokenizers [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[51 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [

In [12]:
import pandas as pd
data = pd.read_csv("/home/codetrade/Downloads/CSV/Real Estate Data V21.csv")

In [13]:
data.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Unnamed: 9
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210,Looking for a 10 BHK Independent House for sal...,6,Yes,
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580,"Property for sale in Tambaram, Chennai. This 3...",3,No,
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840,Entire Building for sale with 7 units of singl...,5,Yes,
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,


In [14]:
df = data['Description']

0    Best 4 BHK Apartment for modern-day lifestyle ...
1    Looking for a 10 BHK Independent House for sal...
2    Property for sale in Tambaram, Chennai. This 3...
3    Entire Building for sale with 7 units of singl...
4    Property for sale in Avadi, Chennai. This 2 BH...
Name: Description, dtype: object

In [21]:
import spacy
from spacy.pipeline import EntityRuler

# Load the transformer-based language model
nlp = spacy.load("en_core_web_trf")

# Define the NER patterns using the EntityRuler
patterns = [
    {"label": "LOCATION", "pattern": [{"lower": {"in": ["city", "town", "village"]}}]},
    {"label": "PRICE", "pattern": [{"lower": {"in": ["price", "cost", "budget"]}}]},
    {"label": "BHK", "pattern": [{"lower": {"in": ["bhk", "bedroom", "hall", "kitchen"]}}]},
    {"label": "PROPERTY_TYPE", "pattern": [{"lower": {"in": ["house", "flat", "apartment"]}}]},
]

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe('entity_ruler', before="ner", config={"overwrite_ents": True})
nlp.get_pipe("entity_ruler").add_patterns(patterns)

# Text to be analyzed
text = "give me the flat"
# Process the text and extract entities
doc = nlp(text)

# Initialize a dictionary to hold entity-wise results
entities_dict = {}

# Collect entities and their labels in the dictionary
for ent in doc.ents:
    label = ent.label_
    if label not in entities_dict:
        entities_dict[label] = []
    entities_dict[label].append(ent.text)

# Print the entities
for label, entities in entities_dict.items():
    print(f"{label}: {', '.join(entities)}")


PROPERTY_TYPE: flat


In [19]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
nlp = spacy.load("en_core_web_trf")
patterns = [
    {"label": "LOCATION", "pattern": [{"lower": {"in": ["city", "town", "village"]}}]},
    {"label": "PRICE", "pattern": [{"lower": {"in": ["price", "cost", "budget"]}}]},
    {"label": "BHK", "pattern": [{"lower": {"in": ["bhk", "bedroom", "hall", "kitchen"]}}]},
    {"label": "PROPERTY_TYPE", "pattern": [{"lower": {"in": ["house", "flat", "apartment"]}}]},
]
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe('entity_ruler', before="ner", config={"overwrite_ents": True})
nlp.get_pipe("entity_ruler").add_patterns(patterns)
text = """2 BHK Apartment for sale in Chennai. This property is in Saidapet, which is a coveted investment location. This tastefully designed 2 BHK unit is among Chennai's best properties. No brokerage to be paid for this property. This 2 BHK property is posted directly by Owner. Contact now for more details. This property in Chennai is on floor 1. The total number of floors in this Apartment is 3. The price of the Apartment is Rs 58.0 L. Monthly maintenance charges come to Rs 1000. It is best suited for all kinds of families. Because this property is spacious, with a built-up area of 1090 Square feet"""
entities_dict = {}
for text in texts:
    doc = nlp(text)
    for ent in doc.ents:
        label = ent.label_
        if label not in entities_dict:
            entities_dict[label] = []
        entities_dict[label].append(ent.text)
for label, entities in entities_dict.items():
    print(f"{label}: {', '.join(entities)}")

KeyboardInterrupt: 

In [24]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from collections import Counter

# Load the transformer-based language model
nlp = spacy.load("en_core_web_trf")

# Define the NER patterns using the EntityRuler
patterns = [
    {"label": "LOCATION", "pattern": [{"lower": {"in": ["city", "town", "village"]}}]},
    {"label": "PRICE", "pattern": [{"lower": {"in": ["price", "cost", "budget"]}}]},
    {"label": "BHK", "pattern": [{"lower": {"in": ["bhk", "bedroom", "hall", "kitchen"]}}]},
    {"label": "PROPERTY_TYPE", "pattern": [{"lower": {"in": ["house", "flat", "apartment"]}}]},
]

# Create the EntityRuler
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)

# Add the EntityRuler to the pipeline with the string name "entity_ruler"
nlp.add_pipe("entity_ruler", name="entity_ruler", before="ner", config={"overwrite_ents": True})
nlp.get_pipe("entity_ruler").add_patterns(patterns)

# Assuming 'data' is a DataFrame with a 'Description' column and it is correctly loaded
texts = data['Description']

# Process texts in batches and extract entities
batch_size = 10
entities_counter = Counter()

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    docs = list(nlp.pipe(batch_texts))

    for doc in docs:
        for ent in doc.ents:
            entities_counter[ent.label_] += 1

# Print and save the extracted entities
for label, count in entities_counter.items():
    print(f"{label}: {count}")

entities_df = pd.DataFrame(entities_counter.items(), columns=["Entity", "Count"])
entities_df.to_csv("extracted_entities.csv", index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


CARDINAL: 64114
BHK: 28869
PROPERTY_TYPE: 34979
DATE: 6420
GPE: 46104
PRICE: 13432
MONEY: 22157
QUANTITY: 18575
ORG: 27185
FAC: 4165
LOCATION: 3926
TIME: 284
LOC: 1734
ORDINAL: 403
PERSON: 362
PERCENT: 103
NORP: 125
PRODUCT: 24
EVENT: 3
WORK_OF_ART: 6
LANGUAGE: 39
