In [109]:
#  env + model config
import os, json, re, getpass
from jsonschema import Draft7Validator
import openai
import pandas as pd
pd.set_option("display.max_colwidth", 200)

# Secure API key (DO NOT hardcode; paste at runtime)
if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"]:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste your GROQ API key (hidden): ").strip()

BASE_URL = os.environ.get("BASE_URL", "https://api.groq.com/openai/v1")
MODEL = "llama-3.1-8b-instant"   # change to "llama-3.1-70b" if available for final runs


In [110]:
# ---------------------------------------------------------------------------- CORE ENGINE --------------------------------------------------------------------------------------------------
import os, json, re
import openai

# Cell 2: Core extractor (replace previous versions)
schema = {
    "name": "extract_user_info",
    "description": "Extract name, email, phone, location and age from free text",
    "parameters": {
        "type": "object",
        "properties": {
            "name": {"oneOf": [{"type": "string"}, {"type": "null"}]},
            "email": {"oneOf": [{"type": "string"}, {"type": "null"}]},
            "phone": {"oneOf": [{"type": "string"}, {"type": "null"}]},
            "location": {"oneOf": [{"type": "string"}, {"type": "null"}]},
            "age": {"oneOf": [{"type": "integer"}, {"type": "null"}]}
        },
        "required": []
    }
}

# small regex helpers
_email_re = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
_phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\d{2,4}[-.\s]?){1,4}\d{3,4}")

def _normalize_value(val):
    if val is None: return None
    if isinstance(val, str):
        s = val.strip()
        if s == "" or s.lower() in ("none","null","n/a","na"): return None
        if re.fullmatch(r"\d+", s):
            try: return int(s)
            except: return s
        return s
    if isinstance(val, (int, float)):
        return int(val) if int(val) == val else val
    return val

def _parse_json_like(obj):
    if obj is None: return {}
    if isinstance(obj, dict): return obj
    txt = str(obj)
    try:
        return json.loads(txt)
    except Exception:
        m = re.search(r"\{[\s\S]*\}", txt)
        if m:
            try: return json.loads(m.group(0))
            except: return {}
    return {}

def _clean_and_coerce(parsed):
    keys = list(schema["parameters"]["properties"].keys())
    cleaned = {k: None for k in keys}
    for k,v in parsed.items():
        if k not in keys: continue
        norm = _normalize_value(v)
        if k == "age":
            if norm is None: cleaned[k] = None
            elif isinstance(norm,int): cleaned[k] = norm
            else:
                try: cleaned[k] = int(str(norm).strip())
                except: cleaned[k] = None
        else:
            cleaned[k] = norm
    return cleaned

def _validate(cleaned):
    validator = Draft7Validator(schema["parameters"])
    errs = [e.message for e in sorted(validator.iter_errors(cleaned), key=lambda x: x.path)]
    return (len(errs)==0, errs)

def _regex_fallback(text):
    """Return any simple email/phone found (fallback)."""
    email = _email_re.search(text)
    phone = _phone_re.search(text)
    return {"email": email.group(0) if email else None, "phone": phone.group(0) if phone else None}

def _fallback_extract_text_json(chat_text, model_name, base_url, verbose=False):
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url=base_url)
    prompt = (
        "Extract name, email, phone, location, and age from the text. "
        "Return EXACTLY a JSON object with keys: name, email, phone, location, age. Use null for missing fields.\n\n"
        f"Text: {chat_text}\n\nJSON:"
    )
    try:
        resp = client.chat.completions.create(
            model=model_name,
            messages=[{"role":"user","content":prompt}],
            temperature=0.0,
            max_tokens=300
        )
    except Exception as e:
        if verbose: print("Fallback request error:", e)
        return {"raw": {}, "valid": False, "errors": [f"Fallback request error: {e}"], "warnings": []}
    try:
        if isinstance(resp, dict):
            content = resp.get("choices",[{}])[0].get("message",{}).get("content","") or ""
        else:
            content = getattr(resp.choices[0].message,"content","") or ""
    except Exception:
        content = ""
    parsed = _parse_json_like(content)
    cleaned = _clean_and_coerce(parsed)
    valid, errors = _validate(cleaned)
    return {"raw": cleaned, "valid": valid, "errors": errors, "warnings": []}

def extract_info(chat_text, model_name=MODEL, base_url=BASE_URL, use_fallback=True, verbose=False):
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url=base_url)
    warnings = []
    try:
        resp = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role":"system","content":"You are a JSON extractor. Return a function call with arguments matching the provided schema. Use null for missing fields."},
                {"role":"user","content":f"Extract name, email, phone, location, age from: {chat_text}"}
            ],
            functions=[schema],
            function_call="auto",
            temperature=0.0,
            max_tokens=300
        )
    except Exception as e:
        if verbose: print("Primary call error:", e)
        if use_fallback:
            # regex quick-fallback then text-JSON fallback
            basic = _regex_fallback(chat_text)
            fb = _fallback_extract_text_json(chat_text, model_name, base_url, verbose)
            # merge regex results if fb missing
            for k in ("email","phone"):
                if fb["raw"].get(k) is None and basic.get(k): fb["raw"][k] = basic[k]
            return fb
        return {"raw": {}, "valid": False, "errors":[f"Request error: {e}"], "warnings": []}

    # parse function_call args if present
    func_args_raw = None
    try:
        if isinstance(resp, dict):
            choice = resp.get("choices",[{}])[0]
            msg = choice.get("message",{}) or {}
            fc = msg.get("function_call")
            func_args_raw = fc.get("arguments") if fc else None
        else:
            choice = resp.choices[0]
            fc = getattr(choice.message, "function_call", None)
            func_args_raw = getattr(fc, "arguments", None) if fc else None
    except Exception as e:
        if verbose: print("Response parse error:", e)
        func_args_raw = None

    parsed = {}
    if func_args_raw:
        parsed = _parse_json_like(func_args_raw)
    else:
        # try to parse message content
        try:
            if isinstance(resp, dict):
                content = resp.get("choices",[{}])[0].get("message",{}).get("content","") or ""
            else:
                content = getattr(resp.choices[0].message,"content","") or ""
            parsed = _parse_json_like(content)
        except Exception:
            parsed = {}

    # if still empty, fallback
    if not parsed and use_fallback:
        basic = _regex_fallback(chat_text)
        fb = _fallback_extract_text_json(chat_text, model_name, base_url, verbose)
        # merge regex
        for k in ("email","phone"):
            if fb["raw"].get(k) is None and basic.get(k): fb["raw"][k] = basic[k]
        return fb

    cleaned = _clean_and_coerce(parsed)
    valid, errors = _validate(cleaned)
    return {"raw": cleaned, "valid": valid, "errors": errors, "warnings": warnings}


In [111]:
!pip install openai




In [112]:
import openai
import os

# Set API key securely using getpass or Colab secrets
# The API key should not be hardcoded directly in the notebook for security.
# Assuming the API key is set using getpass in a previous cell (0YT6Gq5lohl3)
# If you are not using getpass, you should use Colab secrets instead.

# Use Groq API base
openai.api_base = "https://api.groq.com/openai/v1"

In [113]:
conversation = []

def add_message(role, content):
    conversation.append({"role": role, "content": content})


In [114]:
# --- Demonstration for Task 1: Conversation Manager ---

# Initialize with summarization every 3rd user input
# Corrected class name and parameters to match ConversationManager definition
engine = ConversationManager(summarize_every_k=3, base_url=BASE_URL, api_key=os.environ["OPENAI_API_KEY"])

# Simulate multiple exchanges
sample_chats = [
    ("user", "Hi, I want to know about AI."),
    ("assistant", "Sure! AI stands for Artificial Intelligence."),
    ("user", "Can you also tell me about Machine Learning?"),
    ("assistant", "Yes, ML is a subset of AI that focuses on pattern recognition."),
    ("user", "What about Deep Learning?"),
    ("assistant", "Deep Learning uses neural networks with many layers."),
    ("user", "Summarize everything so far."), # This last message won't trigger summarization based on k=3 user messages
]

# Run through conversation
for role, text in sample_chats:
    engine.add(role, text) # Use the 'add' method of ConversationManager
    print(f"\n[{role.upper()}]: {text}")
    print("---- Current History ----")
    # ConversationManager stores messages in the 'messages' attribute
    print(engine.messages)


[USER]: Hi, I want to know about AI.
---- Current History ----
[{'role': 'user', 'content': 'Hi, I want to know about AI.'}]

[ASSISTANT]: Sure! AI stands for Artificial Intelligence.
---- Current History ----
[{'role': 'user', 'content': 'Hi, I want to know about AI.'}, {'role': 'assistant', 'content': 'Sure! AI stands for Artificial Intelligence.'}]

[USER]: Can you also tell me about Machine Learning?
---- Current History ----
[{'role': 'user', 'content': 'Hi, I want to know about AI.'}, {'role': 'assistant', 'content': 'Sure! AI stands for Artificial Intelligence.'}, {'role': 'user', 'content': 'Can you also tell me about Machine Learning?'}]

[ASSISTANT]: Yes, ML is a subset of AI that focuses on pattern recognition.
---- Current History ----
[{'role': 'user', 'content': 'Hi, I want to know about AI.'}, {'role': 'assistant', 'content': 'Sure! AI stands for Artificial Intelligence.'}, {'role': 'user', 'content': 'Can you also tell me about Machine Learning?'}, {'role': 'assistant'

In [115]:
# Task 1 demo: show history, truncation, summarization after k
cm = ConversationManager(model_name=MODEL, summarize_every_k=3)

# feed exchanges (6-7)
pairs = [
    ("user","Hi, I need help with my resume."),
    ("assistant","Sure, what role?"),
    ("user","Backend developer role, 2 years experience."),
    ("assistant","What technologies?"),
    ("user","Python, Flask, PostgreSQL."),
    ("assistant","Noted."),
    ("user","Please summarize my profile.")
]

for role, text in pairs:
    cm.add(role, text)
    print(f"Added {role}: {text}")
    print("History (preview):")
    for m in cm.messages:
        print("-", m["role"], ":", m["content"][:120])
    print("----\n")

# Demonstrate truncation by turns
cm2 = ConversationManager(model_name=MODEL)
for role,text in pairs*2:
    cm2.add(role,text)
print("Before truncate_by_turns:", len(cm2.messages))
cm2.truncate_by_turns(4)
print("After truncate_by_turns(4):", len(cm2.messages))

# Demonstrate truncation by chars
cm3 = ConversationManager(model_name=MODEL)
for role,text in pairs*2:
    cm3.add(role,text)
print("Chars before:", sum(len(m["content"]) for m in cm3.messages))
cm3.truncate_by_chars(120)
print("Chars after:", sum(len(m["content"]) for m in cm3.messages))


Added user: Hi, I need help with my resume.
History (preview):
- user : Hi, I need help with my resume.
----

Added assistant: Sure, what role?
History (preview):
- user : Hi, I need help with my resume.
- assistant : Sure, what role?
----

Added user: Backend developer role, 2 years experience.
History (preview):
- user : Hi, I need help with my resume.
- assistant : Sure, what role?
- user : Backend developer role, 2 years experience.
----

Added assistant: What technologies?
History (preview):
- user : Hi, I need help with my resume.
- assistant : Sure, what role?
- user : Backend developer role, 2 years experience.
- assistant : What technologies?
----

Error: API key is not set in ConversationManager.
Added user: Python, Flask, PostgreSQL.
History (preview):
- system : Conversation summary
- assistant : Summary unavailable: API key missing.
----

Added assistant: Noted.
History (preview):
- system : Conversation summary
- assistant : Summary unavailable: API key missing.
- assista

# Task 1 Demonstration: Conversation Manager
Below we simulate 6–7 exchanges between a user and an assistant to show:
- Full history retention  
- Truncation by last N messages  
- Truncation by character length  
- Summarization every k-th run (k=3 in this example)


In [116]:
def truncate_by_turns(history, n=4):
    return history[-n:]

def truncate_by_length(history, max_chars=200):
    text = " ".join([m["content"] for m in history])
    return text[:max_chars]


In [117]:
def summarize_conversation(history):
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url="https://api.groq.com/openai/v1")
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",  # Groq LLM - Updated model
        messages=[
            {"role": "system", "content": "Summarize the conversation concisely."},
            {"role": "user", "content": str(history)}
        ]
    )
    return response.choices[0].message.content

In [118]:
run_count = 0

def add_and_maybe_summarize(role, content, k=3):
    global run_count, conversation
    add_message(role, content)
    run_count += 1
    if run_count % k == 0:
        summary = summarize_conversation(conversation)
        conversation = [{"role": "system", "content": "Conversation summary"},
                        {"role": "assistant", "content": summary}]


In [119]:
schema = {
    "name": "extract_user_info",
    "description": "Extracts user details",
    "parameters": {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "email": {"type": "string"},
            "phone": {"type": "string"},
            "location": {"type": "string"},
            "age": {"type": "integer"}
        },
        "required": ["name", "email", "phone", "location", "age"]
    }
}


#  Task 2 Demonstration: Schema-based Extraction

Below we test the `extract_info` function using the defined schema on multiple samples.  
This shows how user inputs are classified and mapped into structured JSON output.


In [120]:
# --- Demonstration for Task 2: Schema Extraction ---
test_samples = [
    "The conference will be held on 25th September 2025 in Hyderabad.",
    "Book a flight ticket for John Doe from New York to London on October 5, 2025.",
    "My phone number is 9876543210 and email is test@example.com."
]

for i, sample in enumerate(test_samples, 1):
    print(f"\n=== Sample {i} ===")
    print("Input:", sample)
    result = extract_info(sample)
    print("Extracted JSON:", result)



=== Sample 1 ===
Input: The conference will be held on 25th September 2025 in Hyderabad.

=== Sample 2 ===
Input: Book a flight ticket for John Doe from New York to London on October 5, 2025.

=== Sample 3 ===
Input: My phone number is 9876543210 and email is test@example.com.


In [121]:
# --- Evaluation for Task 2: Extraction Quality ---
expected = [
    {"location": "Hyderabad"},
    {"name": "John Doe", "location": "New York"},
    {"phone": "9876543210", "email": "test@example.com"}
]

print("=== Evaluation ===")
correct = 0

for i, (exp, sample) in enumerate(zip(expected, test_samples), 1):
    result = extract_info(sample)
    extracted = result["raw"]

    # check overlap between expected and extracted
    match = all(extracted.get(k) == v for k, v in exp.items())
    print(f"Sample {i}: Expected {exp}, Got {extracted}, Match={match}")
    if match:
        correct += 1

print(f"\nAccuracy: {correct}/{len(expected)} = {correct/len(expected):.2%}")


=== Evaluation ===
Sample 1: Expected {'location': 'Hyderabad'}, Got {'name': None, 'email': None, 'phone': None, 'location': 'Hyderabad', 'age': None}, Match=True
Sample 2: Expected {'name': 'John Doe', 'location': 'New York'}, Got {'name': 'John Doe', 'email': None, 'phone': None, 'location': 'New York', 'age': None}, Match=True
Sample 3: Expected {'phone': '9876543210', 'email': 'test@example.com'}, Got {'name': None, 'email': 'test@example.com', 'phone': 9876543210, 'location': None, 'age': None}, Match=False

Accuracy: 2/3 = 66.67%


In [122]:
# Evaluation for the Task 2 demo samples
test_samples = [
    "The conference will be held on 25th September 2025 in Hyderabad.",
    "Book a flight ticket for John Doe from New York to London on October 5, 2025.",
    "My phone number is 9876543210 and email is test@example.com."
]

expected = [
    {"location":"Hyderabad"},
    {"name":"John Doe", "location":"New York"},  # adjust expected fields per your chosen ground-truth
    {"phone":"9876543210","email":"test@example.com"}
]

correct = 0
for i, sample in enumerate(test_samples):
    out = extract_info(sample, model_name=MODEL, use_fallback=True, verbose=False)
    got = out["raw"]
    exp = expected[i]
    match = all((got.get(k) == v) for k,v in exp.items())
    print(f"Sample {i+1} - match={match}\n  expected={exp}\n  got={got}\n  valid={out['valid']}\n")
    if match: correct += 1

print(f"Accuracy: {correct}/{len(test_samples)} = {correct/len(test_samples):.2%}")


Sample 1 - match=True
  expected={'location': 'Hyderabad'}
  got={'name': None, 'email': None, 'phone': None, 'location': 'Hyderabad', 'age': None}
  valid=False

Sample 2 - match=True
  expected={'name': 'John Doe', 'location': 'New York'}
  got={'name': 'John Doe', 'email': None, 'phone': None, 'location': 'New York', 'age': None}
  valid=False

Sample 3 - match=False
  expected={'phone': '9876543210', 'email': 'test@example.com'}
  got={'name': None, 'email': 'test@example.com', 'phone': 9876543210, 'location': None, 'age': None}
  valid=False

Accuracy: 2/3 = 66.67%


In [123]:
import os, json, re
import openai

# Schema definition outside the function for better practice
schema = {
    "name": "extract_user_info",
    "description": "Extracts user details",
    "parameters": {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "email": {"type": "string"},
            "phone": {"type": "string"},
            "location": {"type": "string"},
            "age": {"type": "integer"}
        },
        "required": []  # Make all fields optional
    }
}

def extract_info(chat):
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url="https://api.groq.com/openai/v1")
    parsed = {} # Initialize parsed data

    try:
        warnings = []  # store non-fatal issues here
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",  # Updated model
            messages=[{"role": "user", "content": chat}],
            functions=[schema],
            function_call={"name": "extract_user_info"}
        )
        if response.choices and response.choices[0].message.function_call:
            func_args = response.choices[0].message.function_call.arguments
            if isinstance(func_args, str):
                try:
                    parsed = json.loads(func_args)
                except json.JSONDecodeError:
                    print(f"Warning: Could not parse function arguments string as JSON: {func_args}")
                    parsed = {}
            elif isinstance(func_args, dict):
                parsed = func_args
            else:
                print(f"Warning: Unexpected type for function arguments: {type(func_args)}. Raw value: {func_args}")
                parsed = {}

    except openai.BadRequestError as e:
        print(f"An API error occurred: {e}")
        error_message = str(e)
        failed_generation_match = re.search(r"'failed_generation': '(.*?)'", error_message)
        if failed_generation_match:
            failed_generation_str = failed_generation_match.group(1)
            cleaned_json_str = re.sub(r'<function=.*?>(.*?)</function>', r'\1', failed_generation_str)
            try:
                parsed = json.loads(cleaned_json_str)
                print(f"Successfully extracted and parsed data from failed_generation: {parsed}")
            except json.JSONDecodeError:
                print(f"Warning: Could not parse failed_generation string as JSON: {cleaned_json_str}")
                parsed = {} # Set to empty dict on failure
        else:
            print("Could not find failed_generation in error details. Returning empty data.")
            parsed = {} # Set to empty dict if failed_generation is not found

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        parsed = {} # Set to empty dict for unexpected errors


    # Clean up and cast fields to correct types based on schema (applied to both success and error parsing)
    cleaned_parsed = {}
    for key, value in parsed.items():
        if key in schema["parameters"]["properties"]:
            expected_schema_type = schema["parameters"]["properties"][key]
            if key == "age" and "oneOf" in expected_schema_type and {"type": "null"} in expected_schema_type["oneOf"]:
                 # If age can be null, accept None or null from model
                 if value is None or value == "null":
                      cleaned_parsed[key] = None
                 elif isinstance(value, (int, float)):
                      cleaned_parsed[key] = int(value)
                 elif isinstance(value, str) and value.isdigit():
                      cleaned_parsed[key] = int(value)
                 elif value is not None and value != "":
                      print(f"Warning: Could not cast value '{value}' for key '{key}' to integer or null. Skipping.")
            elif expected_schema_type.get("type") == "integer":
                # Attempt to cast to integer, handle errors and empty strings for standard integers
                if isinstance(value, (int, float)):
                     cleaned_parsed[key] = int(value)
                elif isinstance(value, str) and value.isdigit():
                     cleaned_parsed[key] = int(value)
                elif value is not None and value != "":
                     print(f"Warning: Could not cast value '{value}' for key '{key}' to integer. Skipping.")
            elif "oneOf" in expected_schema_type and {"type": "string"} in expected_schema_type["oneOf"] and {"type": "null"} in expected_schema_type["oneOf"]:
                # Handle fields that can be string or null
                if value is None or value == "null":
                     cleaned_parsed[key] = None
                elif value is not None:
                     cleaned_parsed[key] = str(value)
            elif expected_schema_type.get("type") == "string":
                 # Ensure string type, convert non-string values if possible
                 if value is not None:
                      cleaned_parsed[key] = str(value)
            else:
                # Keep other types as they are or handle specifically if needed
                cleaned_parsed[key] = value
        else:
             # Include keys not in schema if desired, or skip
             cleaned_parsed[key] = value


    # Validate schema against the cleaned data
    # Note: Schema is defined inside the function, which is not ideal practice.
    # It's better to define constants like schemas outside functions.
    # For this fix, we'll keep it here to avoid modifying other cells.
    validator = Draft7Validator(schema["parameters"])
    errors = [e.message for e in sorted(validator.iter_errors(cleaned_parsed), key=lambda x: x.path)]

    return {
        "raw": cleaned_parsed,
        "valid": len(errors) == 0,
        "errors": errors,
        "warnings": warnings
    }

In [124]:
from jsonschema import Draft7Validator

# Define the MODEL variable
MODEL = "llama-3.1-8b-instant"

sample_chats = [
    "Hi, I'm Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23",
    "Hello — Priya here, priya.jobs@mail.com. I live in Bengaluru, 29 years old.",
    "Contact: Amit, phone 070-555-1234. Based in Mumbai. No email."
]

print("=== Extraction + Validation Demo ===")
for chat in sample_chats:
    # Remove use_fallback and verbose if they are not parameters of the current extract_info
    # Assuming extract_info from 3nTy9rU-pA8m is the intended one, it takes chat_text, model_name, base_url
    # Corrected function call arguments to match extract_info(chat) signature
    out = extract_info(chat)
    print("\nChat:", chat)
    print("Extracted:", out["raw"])
    print("Valid?:", out["valid"])
    if out["errors"]:
        print("Errors:", out["errors"])
    if out.get("warnings"):
        print("Warnings:", out["warnings"])

    # explicit jsonschema validation (optional double-check)
    # Ensure schema is accessible - assuming schema is defined globally or in a reachable cell
    # If schema is defined inside extract_info, this outside validation won't work.
    # For demonstration, let's assume schema is globally available or defined in a cell above this one.
    # If not, this block might cause a NameError for 'schema'.
    try:
        validator = Draft7Validator(schema["parameters"])
        val_errors = [e.message for e in sorted(validator.iter_errors(out["raw"]), key=lambda x: x.path)]
        if val_errors:
            print("Schema validator errors:", val_errors)
    except NameError:
        print("Skipping explicit jsonschema validation: 'schema' variable is not defined in this scope.")

=== Extraction + Validation Demo ===

Chat: Hi, I'm Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23
Extracted: {'age': 23, 'email': 'ravi123@gmail.com', 'location': 'Hyderabad', 'name': 'Ravi', 'phone': '9876543210'}
Valid?: True
An API error occurred: Error code: 400 - {'error': {'message': "Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=extract_user_info>{"name": "Priya", "email": "priya.jobs@mail.com", "location": "Bengaluru", "age": 29, "phone": "Not Specified", "name": "Priya"}"</function>'}}

Chat: Hello — Priya here, priya.jobs@mail.com. I live in Bengaluru, 29 years old.
Extracted: {}
Valid?: True

Chat: Contact: Amit, phone 070-555-1234. Based in Mumbai. No email.
Extracted: {'email': '', 'location': 'Mumbai', 'name': 'Amit', 'phone': '070-555-1234'}
Valid?: True


In [125]:
# Test the extraction function with a sample chat
result = extract_info(
    "Hi, my name is Ravi. My email is ravi123@gmail.com, phone is 9876543210. "
    "I live in Hyderabad and I am 23 years old."
)

print(result)

# Add more diverse and complex test cases
diverse_samples = [
    "My name is Jane Doe, email: jane.doe@example.com", # Missing phone, location, age
    "Lives in London, phone: +44 20 1234 5678", # Missing name, email, age
    "Age is 45", # Missing name, email, phone, location
    "No info here.", # No information to extract
    "Name: John Smith, Email: john.smith@web.net, Age: fifty-two", # Age as text
    "Contact: 555-1234, Location: Paris, France", # Missing name, email, age, different phone format
    "Email only: test@example.com", # Only email provided
    "Age: 30 years old, Location: Berlin", # Age with extra text
    "Name: Alice, Phone: 111-222-3333, Email: alice@mail.org, Location: Sydney, Age: 25" # All fields
]

print("\nTesting with diverse inputs:")
for s in diverse_samples:
    out = extract_info(s)
    print("\nChat:", s)
    print("Extracted:", out) # Print the raw extracted dictionary


Testing with diverse inputs:

Chat: My name is Jane Doe, email: jane.doe@example.com

Chat: Lives in London, phone: +44 20 1234 5678

Chat: Age is 45
An API error occurred: Error code: 400 - {'error': {'message': "Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=extract_user_info>{}<function>'}}

Chat: No info here.

Chat: Name: John Smith, Email: john.smith@web.net, Age: fifty-two

Chat: Contact: 555-1234, Location: Paris, France
An API error occurred: Error code: 400 - {'error': {'message': "Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=extract_user_info>{"email": "test@example.com"}<function>'}}

Chat: Email only: test@example.com
An API error occurred: Error code: 400 - {'error': {'message': "Failed t

In [126]:
# Run this first
!pip install --quiet openai jsonschema

import os, json, re, getpass
from jsonschema import Draft7Validator
import openai

# Securely set your Groq API key (in Colab, use getpass)
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste your GROQ API key (hidden): ").strip()

# NOTE: we will create a *client instance* per-call below (same pattern you used)
BASE_URL = "https://api.groq.com/openai/v1"


In [127]:
# conversation storage
conversation = []

def add_message(role, content):
    conversation.append({"role": role, "content": content})

# truncation helpers
def truncate_by_turns(history_list, n):
    return history_list[-n:]

def truncate_by_chars(history_list, max_chars):
    h = history_list.copy()
    while sum(len(m['content']) for m in h) > max_chars and len(h) > 1:
        h.pop(0)
    return h

def truncate_by_words(history_list, max_words):
    h = history_list.copy()
    def total_words(msgs): return sum(len(m['content'].split()) for m in msgs)
    while total_words(h) > max_words and len(h) > 1:
        h.pop(0)
    return h


In [128]:
# quick test
conversation = []
add_message("user", "Hello")
add_message("assistant", "Hi — how can I help?")
add_message("user", "I need resume help.")
add_message("assistant", "Sure, share details.")
print("All:", conversation)
print("Last 2 turns:", truncate_by_turns(conversation, 2))
print("Chars truncated to 30:", truncate_by_chars(conversation, 30))


All: [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi — how can I help?'}, {'role': 'user', 'content': 'I need resume help.'}, {'role': 'assistant', 'content': 'Sure, share details.'}]
Last 2 turns: [{'role': 'user', 'content': 'I need resume help.'}, {'role': 'assistant', 'content': 'Sure, share details.'}]
Chars truncated to 30: [{'role': 'assistant', 'content': 'Sure, share details.'}]


In [129]:
#------------------------------------------------ ConversationManager lightweight version  ------------------------------------------------------------------------------------------
class ConversationManager:
    def __init__(self, model_name="llama-3.1-8b-instant", summarize_every_k=None, base_url=None, api_key=None):
        self.messages = []
        self.model_name = model_name
        self.summarize_every_k = summarize_every_k
        self.user_msg_count = 0
        self.base_url = base_url # Store BASE_URL as instance variable
        self.api_key = api_key # Store API key as instance variable

    def add(self, role, content):
        self.messages.append({"role": role, "content": content})
        if role == "user":
            self.user_msg_count += 1
            if self.summarize_every_k and (self.user_msg_count % self.summarize_every_k == 0):
                summary = self.summarize_history()
                # replace history with a summary message
                self.messages = [{"role":"system", "content":"Conversation summary"},
                                 {"role":"assistant","content": summary}]
                # reset user_msg_count (optional) or keep counting; here we'll keep counting

    def summarize_history(self, max_tokens=250):
        if not self.api_key:
            print("Error: API key is not set in ConversationManager.")
            return "Summary unavailable: API key missing."

        # Build a compact history text
        history_text = "\n".join(f"{m['role']}: {m['content']}" for m in self.messages)
        prompt = ("Summarize the conversation below in 3-6 short bullet points or 1 short paragraph. "
                  "Keep it concise and list any important facts (names, numbers, requests):\n\n" + history_text)
        # create client and call Groq endpoint using instance variable
        # Use the stored API key
        client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
        resp = client.chat.completions.create(
            model=self.model_name,
            messages=[{"role":"system","content":"You are a summarization assistant."},
                      {"role":"user","content":prompt}],
            max_tokens=max_tokens,
            temperature=0.1
        )
        # Extract text reliably
        try:
            text = resp.choices[0].message.content
        except Exception:
            text = getattr(resp, "output_text", str(resp))
        return text.strip()

    # Truncation methods moved inside the class
    def truncate_by_turns(self, n):
        self.messages = self.messages[-n:]

    def truncate_by_chars(self, max_chars):
        h = self.messages.copy()
        while sum(len(m['content']) for m in h) > max_chars and len(h) > 1:
            h.pop(0)
        self.messages = h

    def truncate_by_words(self, max_words):
        h = self.messages.copy()
        def total_words(msgs): return sum(len(m['content'].split()) for m in msgs)
        while total_words(h) > max_words and len(h) > 1:
            h.pop(0)
        self.messages = h

In [130]:
cm = ConversationManager(summarize_every_k=3, base_url=BASE_URL)  # summarize after every 3 user messages
cm.add("user","Hi I need resume help")
cm.add("assistant","Sure, tell me your experience")
cm.add("user","I worked on ML image classifier")
# After the 2nd user addition (3rd user? here user_count increments when role=='user') this should trigger summary
cm.add("assistant","Thanks, noted")
cm.add("user","Also need LinkedIn summary")
print("History after operations:", cm.messages)

Error: API key is not set in ConversationManager.
History after operations: [{'role': 'system', 'content': 'Conversation summary'}, {'role': 'assistant', 'content': 'Summary unavailable: API key missing.'}]


In [131]:
import os, json, re, getpass
from jsonschema import Draft7Validator
import openai  # Ensure it's imported

# Securely set your Groq API key
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste your GROQ API key (hidden): ").strip()

# Define BASE_URL at the top-level so it's accessible
BASE_URL = "https://api.groq.com/openai/v1"

# Schema for function calling - Modified to allow null for all optional fields
schema = {
    "name": "extract_user_info",
    "description": "Extract name, email, phone, location and age from free text",
    "parameters": {
        "type": "object",
        "properties": {
            "name": {"oneOf": [{"type": "string"}, {"type": "null"}]}, # Allow null for name
            "email": {"oneOf": [{"type": "string"}, {"type": "null"}]}, # Allow null for email
            "phone": {"oneOf": [{"type": "string"}, {"type": "null"}]}, # Allow null for phone
            "location": {"oneOf": [{"type": "string"}, {"type": "null"}]}, # Allow null for location
            # Allow age to be either integer or null
            "age": {"oneOf": [{"type": "integer"}, {"type": "null"}]}
        },
        "required": []  # All fields are optional
    }
}

def extract_info(chat_text, model_name="llama-3.1-8b-instant", base_url=BASE_URL):
    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url=base_url)
    parsed = {} # Initialize parsed data
    warnings = [] # Initialize warnings
    api_error = None # Variable to store API error if one occurs

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a JSON extractor that uses function-calling. Extract the user's name, email, phone, location, and age from the provided text. If a piece of information is not present in the text, omit that field from the JSON or set its value to null according to the function schema. Prioritize extracting the information accurately."}, # Refined prompt
                {"role": "user", "content": f"Text to extract from: {chat_text}"}
            ],
            functions=[schema],
            function_call={"name": "extract_user_info"},
            temperature=0.0,
            max_tokens=200
        )

        # Extract function arguments from successful response
        if response.choices and response.choices[0].message.function_call:
            func_args = response.choices[0].message.function_call.arguments
            if isinstance(func_args, str):
                try:
                    parsed = json.loads(func_args)
                except json.JSONDecodeError:
                    warnings.append(f"Could not parse function arguments string as JSON: {func_args}")
                    parsed = {} # Set to empty dict on failure
            elif isinstance(func_args, dict):
                parsed = func_args
            else:
                warnings.append(f"Unexpected type for function arguments: {type(func_args)}. Raw value: {func_args}")
                parsed = {} # Set to empty dict for unexpected types

    except openai.BadRequestError as e:
        api_error = e # Store the API error
        print(f"Caught BadRequestError: {e}")
        error_details = {"message": str(e)}
        try:
            error_response = json.loads(str(e).split(" - ")[1].strip())
            if "error" in error_response:
                error_details = error_response["error"]
        except (IndexError, json.JSONDecodeError):
            pass # Keep default message if parsing fails

        # Check specifically for the "Failed to call a function" error with empty failed_generation
        if "Failed to call a function" in str(e) and "failed_generation" in str(error_details) and not error_details.get("failed_generation", "").strip():
             print("Note: Caught 'Failed to call a function' error with empty failed_generation. Extraction failed.")
             # Return a failure result immediately for this specific error
             return {"raw": {}, "valid": False, "errors": [f"API Error: Failed to extract information. Model did not return a valid function call ({error_details.get('message', 'Unknown error')})."], "warnings": warnings}


        # If failed_generation was not empty, try to parse it from error details
        failed_generation_match = re.search(r"'failed_generation': '(.*?)'", str(error_details))
        failed_generation_str = failed_generation_match.group(1) if failed_generation_match else None

        if failed_generation_str:
             json_like_match = re.search(r"\{[\s\S]*\}", failed_generation_str)
             json_like_str = json_like_match.group(0) if json_like_match else "{}"

             try:
                 parsed = json.loads(json_like_str)
                 print(f"Successfully extracted and parsed data from failed_generation: {parsed}")
             except json.JSONDecodeError:
                  warnings.append(f"Could not parse extracted JSON-like string from failed_generation error: {json_like_str}")
                  parsed = {} # Set to empty dict on failure
        else:
             # If it was a BadRequestError but not the specific empty failed_generation,
             # and no failed_generation string was found, initialize parsed as empty.
             parsed = {}
             warnings.append(f"BadRequestError occurred without a parsable failed_generation field: {e}")


    except Exception as e:
        api_error = e # Store the unexpected error
        print(f"An unexpected error occurred: {e}")
        parsed = {} # Set to empty dict for unexpected errors
        warnings.append(f"An unexpected error occurred: {e}")


    # Clean up and cast fields to correct types based on schema (applied to both success and error parsing)
    cleaned_parsed = {}
    for key, value in parsed.items():
        if key in schema["parameters"]["properties"]:
            expected_schema_type = schema["parameters"]["properties"][key]
            if key == "age" and "oneOf" in expected_schema_type and {"type": "null"} in expected_schema_type["oneOf"]:
                 # If age can be null, accept None or null from model
                 if value is None or value == "null":
                      cleaned_parsed[key] = None
                 elif isinstance(value, (int, float)):
                      cleaned_parsed[key] = int(value)
                 elif isinstance(value, str) and value.isdigit():
                      cleaned_parsed[key] = int(value)
                 elif value is not None and value != "":
                      warnings.append(f"Could not cast value '{value}' for key '{key}' to integer or null. Skipping.")
            elif expected_schema_type.get("type") == "integer":
                # Attempt to cast to integer, handle errors and empty strings for standard integers
                if isinstance(value, (int, float)):
                     cleaned_parsed[key] = int(value)
                elif isinstance(value, str) and value.isdigit():
                     cleaned_parsed[key] = int(value)
                elif value is not None and value != "":
                     warnings.append(f"Could not cast value '{value}' for key '{key}' to integer. Skipping.")
            elif "oneOf" in expected_schema_type and {"type": "string"} in expected_schema_type["oneOf"] and {"type": "null"} in expected_schema_type["oneOf"]:
                # Handle fields that can be string or null
                if value is None or value == "null":
                     cleaned_parsed[key] = None
                elif value is not None:
                     cleaned_parsed[key] = str(value)
            elif expected_schema_type.get("type") == "string":
                 # Ensure string type, convert non-string values if possible
                 if value is not None:
                      cleaned_parsed[key] = str(value)
            else:
                # Keep other types as they are or handle specifically if needed
                cleaned_parsed[key] = value
        else:
             # Include keys not in schema if desired, or skip
             cleaned_parsed[key] = value


    # Validate schema against the cleaned data
    validator = Draft7Validator(schema["parameters"])
    errors = [e.message for e in sorted(validator.iter_errors(cleaned_parsed), key=lambda x: x.path)]

    # If there were API errors but the data is otherwise valid based on schema,
    # include the API error in the errors list for clarity.
    if api_error and not errors: # Check if an API error occurred AND there are no schema validation errors
         errors.append(f"API Error during extraction: {str(api_error)}. Extracted data is valid based on schema.")
    # Also add warnings as errors if there are no schema validation errors and no API errors
    elif not errors and not api_error and warnings:
        errors.extend(warnings)
        warnings = [] # Clear warnings if they are moved to errors


    return {
        "raw": cleaned_parsed,
        "valid": len(errors) == 0,
        "errors": errors,
        "warnings": warnings # Keep separate warnings for non-critical issues
    }


# === Test on sample inputs ===
samples = [
    "Hi, my name is Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23",
    "Hello, I'm Priya — priya.jobs@mail.com. I live in Bengaluru. Age: 29.",
    "Contact Amit — phone 070-555-1234. No email given. Based in Mumbai."
]

print("Testing with sample inputs:")
for s in samples:
    out = extract_info(s)
    print("\nChat:", s)
    print("Extracted:", out["raw"])
    print("Valid?:", out["valid"])
    if out["errors"]:
        print("Errors:", out["errors"])
    if out["warnings"]:
        print("Warnings:", out["warnings"])

Testing with sample inputs:

Chat: Hi, my name is Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23
Extracted: {'text': 'Hi, my name is Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23'}
Valid?: True

Chat: Hello, I'm Priya — priya.jobs@mail.com. I live in Bengaluru. Age: 29.
Extracted: {'age': 29, 'email': 'priya.jobs@mail.com', 'location': 'Bengaluru', 'name': 'Priya', 'phone': None}
Valid?: True

Chat: Contact Amit — phone 070-555-1234. No email given. Based in Mumbai.
Extracted: {'age': None, 'email': None, 'location': 'Mumbai', 'name': 'Amit', 'phone': '070-555-1234'}
Valid?: True


In [132]:
samples = [
    "Hi, my name is Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23",
    "Hello, I'm Priya — priya.jobs@mail.com. I live in Bengaluru. Age: 29.",
    "Contact Amit — phone 070-555-1234. No email given. Based in Mumbai."
]

for s in samples:
    out = extract_info(s)
    print("\nChat:", s)
    print("Extracted:", out["raw"])
    print("Valid?:", out["valid"])
    if out["errors"]:
        print("Validation errors:", out["errors"])



Chat: Hi, my name is Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23
Extracted: {'text': 'Hi, my name is Ravi. email: ravi123@gmail.com phone: 9876543210 location: Hyderabad age 23'}
Valid?: True

Chat: Hello, I'm Priya — priya.jobs@mail.com. I live in Bengaluru. Age: 29.
Extracted: {'age': 29, 'email': 'priya.jobs@mail.com', 'location': 'Bengaluru', 'name': 'Priya', 'phone': None}
Valid?: True

Chat: Contact Amit — phone 070-555-1234. No email given. Based in Mumbai.
Extracted: {'age': None, 'email': None, 'location': 'Mumbai', 'name': 'Amit', 'phone': '070-555-1234'}
Valid?: True


In [133]:
# Additional test cases for the extract_info function

diverse_samples = [
    "My name is Jane Doe, email: jane.doe@example.com", # Missing phone, location, age
    "Lives in London, phone: +44 20 1234 5678", # Missing name, email, age
    "Age is 45", # Missing name, email, phone, location
    "No info here.", # No information to extract
    "Name: John Smith, Email: john.smith@web.net, Age: fifty-two", # Age as text
    "Contact: 555-1234, Location: Paris, France", # Missing name, email, age, different phone format
    "Email only: test@example.com", # Only email provided
    "Age: 30 years old, Location: Berlin", # Age with extra text
    "Name: Alice, Phone: 111-222-3333, Email: alice@mail.org, Location: Sydney, Age: 25" # All fields
]

print("Testing with diverse inputs:")
for s in diverse_samples:
    out = extract_info(s) # Use the extract_info from 3nTy9rU-pA8m
    print("\nChat:", s)
    print("Extracted:", out["raw"])
    print("Valid?:", out["valid"])
    if out["errors"]:
        print("Validation errors:", out["errors"])
    if out["warnings"]:   # NEW
        print("Warnings:", out["warnings"])

Testing with diverse inputs:

Chat: My name is Jane Doe, email: jane.doe@example.com
Extracted: {'age': None, 'email': 'jane.doe@example.com', 'location': None, 'name': 'Jane Doe', 'phone': None}
Valid?: True

Chat: Lives in London, phone: +44 20 1234 5678
Extracted: {'age': None, 'email': None, 'location': 'London', 'name': None, 'phone': '+44 20 1234 5678'}
Valid?: True

Chat: Age is 45
Extracted: {'age': 45, 'email': None, 'location': None, 'name': None, 'phone': None}
Valid?: True

Chat: No info here.
Extracted: {'age': None, 'email': None, 'location': None, 'name': None, 'phone': None}
Valid?: True

Chat: Name: John Smith, Email: john.smith@web.net, Age: fifty-two
Extracted: {'age': 52, 'email': 'john.smith@web.net', 'location': None, 'name': 'John Smith', 'phone': None}
Valid?: True

Chat: Contact: 555-1234, Location: Paris, France
Extracted: {'age': None, 'email': None, 'location': 'Paris, France', 'name': None, 'phone': '555-1234'}
Valid?: True

Chat: Email only: test@example.c

In [134]:
# DEMO: Task 1 — conversation samples & k-th summarization
# Pass the API key to the ConversationManager constructor
cm = ConversationManager(model_name=MODEL, summarize_every_k=3, base_url=BASE_URL, api_key=os.environ["OPENAI_API_KEY"])

# feed sample conversation lines (user + simulated assistant replies)
samples = [
    ("user", "Hi — I want help with my resume."),
    ("assistant", "Sure — what is your experience?"),
    ("user", "I have 2 years in Python and ML internship."),
    ("assistant", "Great, any projects?"),
    ("user", "Built an image classifier project with 85% accuracy."),
    ("assistant", "Nice — metrics matter."),
]

for role, text in samples:
    cm.add(role, text)
    print("Added:", role, text)
    print("History length:", len(cm.messages))
    # Corrected: access messages attribute directly instead of calling non-existent method
    print("History preview:", cm.messages[:300]) # Displaying first 300 characters of the message list representation
    print("---")

print("\nNow demonstrate truncation:")
# rebuild a bigger history
# Pass the API key to the ConversationManager constructor
cm2 = ConversationManager(model_name=MODEL, base_url=BASE_URL, api_key=os.environ["OPENAI_API_KEY"])
for r,t in samples*3:
    cm2.add(r, t)
print("Original turns:", len(cm2.messages))
# Assuming truncate_by_turns is defined in ConversationManager and modifies in place
cm2.truncate_by_turns(4)
print("After truncate_by_turns(4): turns =", len(cm2.messages))
# char truncation
# Pass the API key to the ConversationManager constructor
cm3 = ConversationManager(model_name=MODEL, base_url=BASE_URL, api_key=os.environ["OPENAI_API_KEY"])
for r,t in samples*3:
    cm3.add(r,t)
print("Original char count:", sum(len(m['content']) for m in cm3.messages))
# Assuming truncate_by_chars is defined in ConversationManager and modifies in place
cm3.truncate_by_chars(120)
print("After truncate_by_chars(120) char count:", sum(len(m['content']) for m in cm3.messages))

Added: user Hi — I want help with my resume.
History length: 1
History preview: [{'role': 'user', 'content': 'Hi — I want help with my resume.'}]
---
Added: assistant Sure — what is your experience?
History length: 2
History preview: [{'role': 'user', 'content': 'Hi — I want help with my resume.'}, {'role': 'assistant', 'content': 'Sure — what is your experience?'}]
---
Added: user I have 2 years in Python and ML internship.
History length: 3
History preview: [{'role': 'user', 'content': 'Hi — I want help with my resume.'}, {'role': 'assistant', 'content': 'Sure — what is your experience?'}, {'role': 'user', 'content': 'I have 2 years in Python and ML internship.'}]
---
Added: assistant Great, any projects?
History length: 4
History preview: [{'role': 'user', 'content': 'Hi — I want help with my resume.'}, {'role': 'assistant', 'content': 'Sure — what is your experience?'}, {'role': 'user', 'content': 'I have 2 years in Python and ML internship.'}, {'role': 'assistant', 'content': 'Gre

# Wrap-up

## Task 1 — Conversation Management & Summarization
- Implemented ConversationManager to keep running history, truncate by turns/characters/words, and summarize every k-th user message (demoed with k=3).
- Verified with multiple simulated exchanges.

## Task 2 — JSON Schema Extraction
- Created `extract_user_info` schema and implemented `extract_info()` using Groq OpenAI-compatible client with function calling.
- Implemented robust parsing, normalization, regex fallback for phone/email, and a text-JSON fallback when function-calling fails.
- Demonstrated parsing 3+ samples and validated outputs with a small evaluation routine.

## Limitations
- Small/sparse inputs may still lead to partial extraction — fallback mitigates most cases.
- For final submission, use a larger Groq model for higher reliability.

