In [5]:
import spacy
import json

# Load the "en_core_web_sm" spaCy pipeline once at module level;
# extract_details() below reuses this shared object on every call.
nlp = spacy.load("en_core_web_sm")

# Substrings (matched case-insensitively) that mark a noun chunk as a school.
# "cps" is included so the bare abbreviation is recognised as well.
_SCHOOL_KEYWORDS = (
    "school",
    "college",
    "university",
    "academy",
    "institute",
    "vidhyashram",
    "mandir",
    "cps",
)

# Values at or above this are not treated as a plausible age.
_MAX_AGE = 150


def _first_person_name(doc):
    """Return the first name candidate in *doc*, or "" when none is found."""
    # Prefer what the named-entity recogniser labelled as a person.
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text

    # Fallback heuristic: a title-cased token among the first three tokens,
    # or a title-cased token that directly precedes a verb.
    for i, token in enumerate(doc):
        if token.is_title and i < 3:
            return token.text
        if token.pos_ == "VERB" and i > 0 and doc[i - 1].is_title:
            return doc[i - 1].text
    return ""


def _first_age(doc):
    """Return the first all-digit token below ``_MAX_AGE``, or ""."""
    for token in doc:
        digits = token.text
        # str.isdecimal() is used instead of Token.is_digit / str.isdigit():
        # isdigit() also accepts characters such as "²" or "①" that int()
        # cannot parse, which would raise ValueError here.
        if digits.isdecimal() and int(digits) < _MAX_AGE:
            return digits
    return ""


def _first_school(doc):
    """Return the first noun chunk containing a school keyword, or ""."""
    for chunk in doc.noun_chunks:
        lowered = chunk.text.lower()
        if any(keyword in lowered for keyword in _SCHOOL_KEYWORDS):
            # The chunk is returned exactly as written (e.g. "CPS").  An
            # expanded "City Public School (or CPS)" entry used to be appended
            # after the matches, but only the first match was ever returned,
            # so that entry could never be selected and is omitted.
            return chunk.text
    return ""


def extract_details(text: str) -> str:
    """Extract a person's name, age and school from free-form English text.

    The text is parsed with the module-level ``nlp`` pipeline and each field
    is filled with the first candidate found:

    * ``name``   - first ``PERSON`` entity; otherwise the first title-cased
      token among the first three tokens or directly before a verb.
    * ``age``    - first all-digit token whose value is below 150.
    * ``school`` - first noun chunk containing a school keyword or "cps".

    Args:
        text: The sentence(s) to analyse.

    Returns:
        A JSON string (indented by two spaces) with the keys ``"name"``,
        ``"age"`` and ``"school"``; a field that could not be found is the
        empty string.
    """
    doc = nlp(text)
    extracted_data = {
        "name": _first_person_name(doc),
        "age": _first_age(doc),
        "school": _first_school(doc),
    }
    return json.dumps(extracted_data, indent=2)


# Sample sentences covering a full school name, an abbreviation, a
# keyword-only match, and a first-person description.
text1 = "John Doe, aged 30, attended City Public School."
text2 = "Jane Smith, 25 years old, went to CPS."
text3 = "Michael Brown, age 45, studied at Vidya Mandir"
text4 = "My name is Sakthi Velayutham , I have studied at CSVC school and i am 23 years old."

# Print each extraction under a numbered heading; every heading after the
# first is preceded by a blank line.
for sample_no, sample_text in enumerate((text1, text2, text3, text4), start=1):
    separator = "" if sample_no == 1 else "\n"
    print(f"{separator}Text {sample_no} Extraction:\n{extract_details(sample_text)}")

Text 1 Extraction:
{
  "name": "John Doe",
  "age": "30",
  "school": "City Public School"
}

Text 2 Extraction:
{
  "name": "Jane Smith",
  "age": "25",
  "school": "CPS"
}

Text 3 Extraction:
{
  "name": "Michael Brown",
  "age": "45",
  "school": "Vidya Mandir"
}

Text 4 Extraction:
{
  "name": "Sakthi Velayutham",
  "age": "23",
  "school": "CSVC school"
}
