In [None]:
# Install required libraries
!pip install --quiet openai==1.0.0 jsonschema pandas
!pip install groq

Collecting groq
  Downloading groq-0.31.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.31.1-py3-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.31.1


In [None]:
# Groq API key: taken from the GROQ_API_KEY environment variable when it is
# already set, otherwise requested with a hidden prompt. The key is never
# written into the notebook source.
import os
from getpass import getpass

from groq import Groq

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# Initialize Groq client (module-level `client` is used by the cells below)
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Smoke test: one tiny completion proves the key and the model name work
resp = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "Say hi in 3 words"}]
)
print(resp.choices[0].message.content)
print("Groq client initialized.")


Hello, dear friend.
Groq client initialized.


In [None]:
import json
import re
import time
from typing import Any, Dict, List, Optional

import pandas as pd
from jsonschema import FormatChecker, ValidationError, validate

# **TASK-1**

In [None]:
# ConversationManager
class ConversationManager:
    """Chat history store with truncation helpers and periodic summarization.

    Messages are kept in ``self.history`` as ``{"role", "content", "ts"}``
    dicts. After every ``summarization_every_k`` user messages the history is
    summarized and compacted to a single ``[ARCHIVED SUMMARY]`` system message
    followed by the most recent messages. Every summary block ever produced is
    also kept in ``self.summaries``.

    Args:
        model: Model name passed to the chat-completions call.
        summarization_every_k: Summarize after every k-th user message;
            ``0`` (or a negative value) disables periodic summarization.
        summary_max_tokens: ``max_tokens`` for the summarization request.
        temperature: Sampling temperature for the summarization request.
        client: Optional object exposing ``chat.completions.create(...)``.
            When omitted, the module-level ``client`` is used.
    """

    # How many trailing (non-summary) messages survive a compaction verbatim.
    _RECENT_KEEP = 4

    def __init__(self,
                 model: str = "openai/gpt-oss-20b",
                 summarization_every_k: int = 3,
                 summary_max_tokens: int = 200,
                 temperature: float = 0.0,
                 client: Optional[object] = None):
        self.history: List[Dict] = []   # list of {"role","content","ts"}
        self.summaries: List[Dict] = [] # archived summary blocks
        self.run_count = 0              # number of user messages seen so far
        self.k = summarization_every_k
        self.model = model
        self.summary_max_tokens = summary_max_tokens
        self.temperature = temperature
        self._client = client
        # Exception from the most recent failed summarization (None on success).
        self.last_summary_error: Optional[Exception] = None

    def add_message(self, role: str, content: str):
        """Append a message; every k-th user message triggers summarization."""
        self.history.append({"role": role, "content": content, "ts": time.time()})
        if role == "user":
            self.run_count += 1
            if self.k > 0 and (self.run_count % self.k) == 0:
                # periodic summarization
                summary = self.summarize_history()
                self._archive_summary(summary)

    def get_history(self, trunc_by: str = "turns", limit: int = 20):
        """Return the most recent messages, oldest first, within a budget.

        Args:
            trunc_by: ``"turns"`` (message count), ``"chars"`` (total
                characters of content) or ``"words"`` (total whitespace-split
                words of content).
            limit: Size of the budget. A limit of zero or less yields ``[]``.

        Returns:
            A new list holding the longest run of trailing messages that fits
            the budget. For ``chars``/``words`` the walk stops at the first
            message that would exceed the budget.

        Raises:
            ValueError: If ``trunc_by`` is not one of the supported modes.
        """
        if trunc_by not in ("turns", "chars", "words"):
            raise ValueError("trunc_by must be 'turns'|'chars'|'words'")
        if limit <= 0:
            # history[-0:] would be the whole list, so handle this explicitly.
            return []
        if trunc_by == "turns":
            return self.history[-limit:]

        if trunc_by == "chars":
            cost = len
        else:
            cost = lambda text: len(text.split())

        out, total = [], 0
        for msg in reversed(self.history):
            size = cost(msg["content"])
            if total + size > limit:
                break
            out.append(msg)
            total += size
        out.reverse()
        return out

    def summarize_history(self) -> str:
        """Summarize the full current history via the chat-completions API.

        Returns:
            The stripped summary text. If the request fails or yields no text,
            a heuristic digest is returned instead: the first 140 characters
            of each of the last six messages joined by ``" | "``. The
            triggering exception is stored in ``self.last_summary_error``.
        """
        blob = "\n".join(f"{m['role'].upper()}: {m['content']}" for m in self.history)
        prompt = (
            "Summarize the following conversation into concise bullet points. "
            "Preserve action items, names, requests, and decisions. Keep it short.\n\n"
            + blob
        )

        try:
            # Resolved inside the try on purpose: a missing module-level
            # `client` degrades to the heuristic digest instead of raising.
            api = self._client if self._client is not None else client
            resp = api.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a concise summarizer."},
                    {"role": "user", "content": prompt}
                ],
                temperature=self.temperature,
                max_tokens=self.summary_max_tokens
            )
            # Accept both attribute-style and dict-like responses.
            if isinstance(resp, dict):
                summary_text = resp["choices"][0]["message"]["content"]
            else:
                summary_text = resp.choices[0].message.content
            if not isinstance(summary_text, str) or not summary_text.strip():
                raise ValueError("summarizer returned no text")
            self.last_summary_error = None
            return summary_text.strip()
        except Exception as exc:
            # Deliberate best-effort: keep the conversation usable, but keep
            # the error around so a failure is not invisible.
            self.last_summary_error = exc
            return " | ".join(m["content"][:140].replace("\n", " ") for m in self.history[-6:])

    def _archive_summary(self, summary_text: str):
        """Replace the history with a new summary block plus recent messages.

        Previously archived summary blocks are excluded from the retained
        messages, so the compacted history always starts with exactly one
        summary block.
        """
        archived_ids = {id(block) for block in self.summaries}
        live = [m for m in self.history if id(m) not in archived_ids]
        recent = live[-self._RECENT_KEEP:]
        archived_block = {"role": "system", "content": f"[ARCHIVED SUMMARY]\n{summary_text}", "ts": time.time()}
        self.summaries.append(archived_block)
        self.history = [archived_block] + recent

    def show_history(self):
        """Print the history oldest first, truncating content to 200 chars."""
        print("=== Conversation History (oldest first) ===")
        for i, m in enumerate(self.history):
            print(f"{i+1:02d} | {m['role']:8} | {m['content'][:200]}{'...' if len(m['content'])>200 else ''}")

In [None]:
# Demo: push a short scripted dialogue through the manager, then inspect it
cm = ConversationManager(
    model="openai/gpt-oss-20b",
    summarization_every_k=3,
    summary_max_tokens=150,
)

# Scripted (role, text) turns; the 3rd and 6th user messages trigger summarization
samples = [
    ("user", "Hi, I'm Alice. I want to book a meeting next week."),
    ("assistant", "Sure Alice — what days work for you?"),
    ("user", "I prefer Mon or Wed at 10 AM."),
    ("assistant", "Noted; sending invite to alice@example.com. Anything else?"),
    ("user", "Yes, please CC Bob (bob@work.com)."),
    ("assistant", "Added Bob. Meeting scheduled; agenda: project sync."),
    ("user", "Also what's the budget for Q4?"),
]
for role, text in samples:
    cm.add_message(role, text)

# Compressed history as left behind by the periodic summarization
cm.show_history()

# Each truncation mode is shown with the same print loop
print("\n--- Truncation examples ---")
for heading, mode, cap in (
    ("\nLast 3 turns:", "turns", 3),
    ("\nLimit by 120 chars:", "chars", 120),
):
    print(heading)
    for m in cm.get_history(trunc_by=mode, limit=cap):
        print(m)

=== Conversation History (most recent first) ===
01 | system   | [ARCHIVED SUMMARY]
- Alice wants to book a meeting next week, preferring Monday or Wednesday at 10 AM.  
- Assistant will send an invite to alice@example.com.  
- Alice requests Bob (bob@work.com) to ...
02 | assistant | Sure Alice — what days work for you?
03 | user     | I prefer Mon or Wed at 10 AM.
04 | assistant | Noted; sending invite to alice@example.com. Anything else?
05 | user     | Yes, please CC Bob (bob@work.com).
06 | assistant | Added Bob. Meeting scheduled; agenda: project sync.
07 | user     | Also what's the budget for Q4?

--- Truncation examples ---

Last 3 turns:
{'role': 'user', 'content': 'Yes, please CC Bob (bob@work.com).', 'ts': 1757838648.7545955}
{'role': 'assistant', 'content': 'Added Bob. Meeting scheduled; agenda: project sync.', 'ts': 1757838649.1862924}
{'role': 'user', 'content': "Also what's the budget for Q4?", 'ts': 1757838649.1862936}

Limit by 120 chars:
{'role': 'user', 'content': '

# **TASK-2**

In [None]:
# Function schema handed to the model; its "parameters" part is a JSON Schema
# that is reused below to validate whatever the model returns.
extract_function = dict(
    name="extract_user_info",
    description="Extract user details (name, email, phone, location, age) from a chat message",
    parameters=dict(
        type="object",
        properties=dict(
            name=dict(type="string", description="Full name of the person"),
            email=dict(type="string", format="email"),
            phone=dict(type="string", description="Phone number, international format preferred"),
            location=dict(type="string", description="City / Country / Address"),
            # age is the only field that may be an explicit null
            age=dict(type=["integer", "null"], minimum=0, maximum=120,
                     description="Age in years if available"),
        ),
        required=[],                  # every field is optional
        additionalProperties=False,   # unknown keys fail validation
    ),
)

# Same object as the "parameters" entry above (an alias, not a copy)
validation_schema = extract_function["parameters"]

In [None]:
# JSON extraction
def _resp_field(obj: Any, key: str) -> Any:
    """Read ``key`` from a dict or as an attribute; ``None`` when absent."""
    if isinstance(obj, dict):
        return obj.get(key)
    return getattr(obj, key, None)


def _function_call_from_tool_calls(tool_calls: Any) -> Optional[Dict]:
    """Turn the first ``tool_calls`` entry into ``{"name", "arguments"}``, or None."""
    if not tool_calls:
        return None
    try:
        function = _resp_field(tool_calls[0], "function")
    except (IndexError, KeyError, TypeError):
        return None
    if not function:
        return None
    return {"name": _resp_field(function, "name"), "arguments": _resp_field(function, "arguments")}


def _get_message_from_response(resp: Any) -> Dict:
    """Extract the first choice's message from a chat-completions response.

    Handles attribute-style responses (``resp.choices[0].message``) and
    dict-like ones (``resp["choices"][0]["message"]``).

    Args:
        resp: The response object or dict returned by the API call.

    Returns:
        For attribute-style responses, a dict with ``"content"`` and, when the
        model requested a call, ``"function_call"`` as
        ``{"name", "arguments"}``. For dict responses, the message entry
        itself. In both cases, when there is no ``function_call`` but
        ``tool_calls`` is present, the first tool call is exposed under
        ``"function_call"`` (on a shallow copy for dict messages).
        Anything unrecognized yields ``{"content": str(resp)}``.
    """
    # dict-like responses
    if isinstance(resp, dict):
        try:
            message = resp["choices"][0]["message"]
        except (IndexError, KeyError, TypeError):
            return {"content": str(resp)}
        if isinstance(message, dict) and not message.get("function_call"):
            from_tools = _function_call_from_tool_calls(message.get("tool_calls"))
            if from_tools:
                return {**message, "function_call": from_tools}
        return message

    # attribute-style responses
    try:
        choices = getattr(resp, "choices", None)
        message = getattr(choices[0], "message", None) if choices else None
    except (AttributeError, IndexError, KeyError, TypeError):
        message = None

    known = ("content", "function_call", "tool_calls")
    if message is None or not any(hasattr(message, attr) for attr in known):
        return {"content": str(resp)}

    out = {"content": getattr(message, "content", None)}
    fc = getattr(message, "function_call", None)
    if fc:
        out["function_call"] = {"name": getattr(fc, "name", None), "arguments": getattr(fc, "arguments", None)}
    else:
        # Fall back to the tool-calling shape of the same information.
        from_tools = _function_call_from_tool_calls(getattr(message, "tool_calls", None))
        if from_tools:
            out["function_call"] = from_tools
    return out


def _parse_json_object(text: str) -> Optional[Dict]:
    """Best-effort decode of a JSON object from model output; None on failure.

    Tries, in order: the text as-is, the outermost ``{...}`` span, and that
    span with single quotes swapped for double quotes and trailing commas
    removed. The first candidate that decodes to a dict wins.
    """
    candidates = [text]
    m = re.search(r"\{[\s\S]*\}", text)
    if m:
        snippet = m.group(0)
        candidates.append(snippet)
        # minor fixes: replace single quotes and remove trailing commas
        candidates.append(re.sub(r",(\s*[\}\]])", r"\1", snippet.replace("'", "\"")))
    for candidate in candidates:
        try:
            value = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(value, dict):
            return value
    return None


def extract_user_info_with_functions(chat_text: str,
                                     model: str = "openai/gpt-oss-20b",
                                     temperature: float = 0.0,
                                     max_tokens: int = 300) -> Dict:
    """Ask the model to extract contact details and validate the result.

    Args:
        chat_text: The chat message(s) to extract from.
        model: Model name for the chat-completions call.
        temperature: Sampling temperature.
        max_tokens: Completion token limit.

    Returns:
        A dict with keys:
          * ``parsed``: the extracted object (``{}`` when nothing usable came back),
          * ``valid``: True only if an object was obtained AND it passes
            ``validation_schema``,
          * ``errors``: None when valid, otherwise a description of the failure,
          * ``raw_choice``: the normalized message (None if the API call failed).
    """
    messages = [
        {"role": "system", "content": "You are an assistant that extracts contact info and returns only a JSON object via function call."},
        {"role": "user", "content": chat_text}
    ]
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=messages,
            functions=[extract_function],
            function_call="auto",
            temperature=temperature,
            max_tokens=max_tokens
        )
    except Exception as e:
        return {"parsed": {}, "valid": False, "errors": f"API call failed: {e}", "raw_choice": None}

    message = _get_message_from_response(resp)
    # Guard every lookup: the normalizer may hand back a non-dict message.
    fields = message if isinstance(message, dict) else {}

    # Prefer function_call arguments; fall back to the plain content.
    fc = fields.get("function_call")
    payload = fc.get("arguments") if isinstance(fc, dict) else None
    if not payload:
        payload = fields.get("content")

    if isinstance(payload, dict):
        # Already decoded: use it directly instead of round-tripping via str().
        parsed = payload
    elif payload:
        parsed = _parse_json_object(payload if isinstance(payload, str) else str(payload))
    else:
        parsed = None

    if parsed is None:
        # An empty schema-valid {} here would hide the fact that nothing was extracted.
        return {
            "parsed": {},
            "valid": False,
            "errors": "No JSON object could be parsed from the model response.",
            "raw_choice": message,
        }

    # Validate against schema
    try:
        validate(instance=parsed, schema=validation_schema, format_checker=FormatChecker())
        valid = True
        errors = None
    except ValidationError as e:
        valid = False
        errors = str(e)

    return {"parsed": parsed, "valid": valid, "errors": errors, "raw_choice": message}


In [None]:
# Three probes: a complete profile, a partial one, and a message with two speakers
sample_chats = [
    "Hello, I'm Rahul Verma. You can reach me at rahul.verma@example.com or +91 98765 43210. I'm based in Bengaluru and I'm 26 years old.",
    "Hey — name's Tara. Email is tara85@domain.co. No phone here. I'm in Pune.",
    "User1: Hi, I'm John Doe (john.d@example.com). Call me at 555-2368. User2: I'm Elle (elle@somewhere.com)."
]

# One extraction call per chat; each result keeps its source text under "chat"
results = [
    {"chat": chat, **extract_user_info_with_functions(chat)}
    for chat in sample_chats
]

# Wide columns so the chat text and parsed dict are readable in the table
pd.set_option('display.max_colwidth', 200)
df = pd.DataFrame(
    [{key: row[key] for key in ("chat", "parsed", "valid", "errors")} for row in results]
)
df

Unnamed: 0,chat,parsed,valid,errors
0,"Hello, I'm Rahul Verma. You can reach me at rahul.verma@example.com or +91 98765 43210. I'm based in Bengaluru and I'm 26 years old.","{'age': 26, 'email': 'rahul.verma@example.com', 'location': 'Bengaluru', 'name': 'Rahul Verma', 'phone': '+91 98765 43210'}",True,
1,Hey — name's Tara. Email is tara85@domain.co. No phone here. I'm in Pune.,"{'email': 'tara85@domain.co', 'location': 'Pune', 'name': 'Tara'}",True,
2,"User1: Hi, I'm John Doe (john.d@example.com). Call me at 555-2368. User2: I'm Elle (elle@somewhere.com).",{},True,


In [None]:
import pandas as pd
# Expected extraction per sample chat, in sample order; None marks a field
# that the chat does not provide.
ground_truth = [
    dict(zip(("name", "email", "phone", "location", "age"), row))
    for row in (
        ("Rahul Verma", "rahul.verma@example.com", "+919876543210", "Bengaluru", 26),
        ("Tara", "tara85@domain.co", None, "Pune", None),
        ("John Doe", "john.d@example.com", "555-2368", None, None),
    )
]

def normalize_phone(p):
    """Strip a phone number down to its digits and ``+`` signs.

    Args:
        p: Phone number; usually a string, but any value is accepted and
            converted with ``str()`` (a model may return the number as an int).

    Returns:
        The value with every character other than digits and ``+`` removed,
        or None when ``p`` is falsy (None, empty string, 0).
    """
    if not p:
        return None
    return re.sub(r"[^\d+]", "", str(p))

def simple_eval(parsed, truth):
    """Score an extraction field by field: 1 for a match, 0 otherwise.

    ``phone`` is compared loosely: both sides go through ``normalize_phone``
    and the prediction must end with the last six characters of the truth.
    Every other field needs a case-insensitive exact match of the string
    forms. A field whose prediction or truth is falsy scores 0.
    """
    def _phone_hit(pred, tr):
        pred_norm = normalize_phone(pred)
        tr_norm = normalize_phone(tr)
        # a loose check on the trailing digits
        return int(bool(pred_norm and tr_norm and pred_norm.endswith(tr_norm[-6:])))

    def _exact_hit(pred, tr):
        return int(bool(pred and tr and str(pred).lower() == str(tr).lower()))

    return {
        k: (_phone_hit if k == "phone" else _exact_hit)(parsed.get(k), truth.get(k))
        for k in ["name", "email", "phone", "location", "age"]
    }

# One score row per (result, ground truth) pair; "chat" goes last so the
# per-field columns come first in the table.
scores = [
    {**simple_eval(result["parsed"], truth), "chat": result["chat"]}
    for result, truth in zip(results, ground_truth)
]

display(pd.DataFrame(scores))


Unnamed: 0,name,email,phone,location,age,chat
0,1,1,1,1,1,"Hello, I'm Rahul Verma. You can reach me at rahul.verma@example.com or +91 98765 43210. I'm based in Bengaluru and I'm 26 years old."
1,1,1,0,1,0,Hey — name's Tara. Email is tara85@domain.co. No phone here. I'm in Pune.
2,0,0,0,0,0,"User1: Hi, I'm John Doe (john.d@example.com). Call me at 555-2368. User2: I'm Elle (elle@somewhere.com)."


In [None]:
# Last Check
# Expected values for the final check, in sample order. Phones are stored
# already stripped of separators; None marks a field with no expected value.
ground_truth = [
    dict(name="Rahul Verma", email="rahul.verma@example.com", phone="+919876543210",
         location="Bengaluru", age=26),
    dict(name="Tara", email="tara85@domain.co", phone=None,
         location="Pune", age=None),
    dict(name="John Doe", email="john.d@example.com", phone="5552368",
         location=None, age=None),
]

def simple_eval(parsed, truth):
    """Score an extraction field by field against the expected values.

    Args:
        parsed: Extracted fields (dict); missing keys count as no prediction.
        truth: Expected fields (dict); None or a missing key means
            "no expected value".

    Returns:
        Dict mapping each of name/email/phone/location/age to:
          * 2 - N/A: the truth has no value for this field (never penalized),
          * 1 - match,
          * 0 - mismatch or missing prediction.
        ``phone`` matches when the normalized prediction ends with the last
        six characters of the normalized truth (exact equality when the truth
        is shorter than six). Other fields match on their stripped,
        case-insensitive string forms.
    """
    row = {}
    for k in ["name", "email", "phone", "location", "age"]:
        pred = parsed.get(k)
        tr = truth.get(k)
        if tr is None:
            # if truth missing, we don't penalize; mark as N/A (2).
            # Checked first so that phone gets the same treatment as the rest.
            row[k] = 2
        elif k == "phone":
            p = normalize_phone(pred)
            t = normalize_phone(tr)
            if not p or not t:
                row[k] = 0
            elif len(t) >= 6:
                row[k] = int(p.endswith(t[-6:]))
            else:
                row[k] = int(p == t)
        else:
            # `is not None` rather than truthiness so that e.g. age 0 can match.
            row[k] = int(pred is not None and str(pred).strip().lower() == str(tr).strip().lower())
    return row

# Final scoring table: the chat text first, then one column per field
scores = [
    {"chat": result["chat"], **simple_eval(result["parsed"], truth)}
    for result, truth in zip(results, ground_truth)
]

pd.DataFrame(scores)

Unnamed: 0,chat,name,email,phone,location,age
0,"Hello, I'm Rahul Verma. You can reach me at rahul.verma@example.com or +91 98765 43210. I'm based in Bengaluru and I'm 26 years old.",1,1,1,1,1
1,Hey — name's Tara. Email is tara85@domain.co. No phone here. I'm in Pune.,1,1,0,1,2
2,"User1: Hi, I'm John Doe (john.d@example.com). Call me at 555-2368. User2: I'm Elle (elle@somewhere.com).",0,0,0,2,2
