# Libraries for LLM evaluation

## 0. Setup

In [1]:
%pip install -q huggingface-hub==0.23.2
%pip install -q transformers==4.47.0
%pip install -q datasets==2.19.1
%pip install -q deepeval==2.1.1
%pip install -q lm-format-enforcer==0.10.9
%pip install -q pydantic==2.10.4
%pip install -q bitsandbytes==0.45.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/401.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/401.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m399.4/401.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.14.0 requires huggingface-hub>=0.25.0, but you have huggingface-hub 0.23.2 which is incompatible.
transformers 4.47.1 requires huggingface-hub<1.0,>=0.24.0, but you have huggingface-hub 0.23.2 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m 

In [2]:
import os
import yaml
from huggingface_hub import login
from google.colab import drive
from getpass import getpass
from IPython.display import clear_output

drive.mount('/content/drive')

Mounted at /content/drive


# Loading models

In [None]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

In [None]:
class HuggingFaceModel(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "Mistral 7B"
'''

class HuggingFaceModel(DeepEvalBaseLLM):
    ...

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        # Same as the previous example above
        model = self.load_model()
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            use_cache=True,
            device_map="auto",
            max_length=2500,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(schema.schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # Output and load valid JSON
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)
'''

In [None]:
#model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "unsloth/Llama-3.2-3B-Instruct"
#model_name = "microsoft/Phi-3.5-mini-instruct"
#model_name = "JunxiongWang/Llama3.2-Mamba-3B-distill"
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
#model_name = "HuggingFaceTB/SmolLM-360M"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

hf_model = HuggingFaceModel(model=model, tokenizer=tokenizer)
print(hf_model.generate("Write me a joke"))

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Write me a joke or a funny experience that involves a classic, real-world restaurant mystery, mystery car, or mystery drink that fans can laugh about in the break room!

If you have a specific restaurant or drink in mind, feel free to share it and I'll be happy to help you come up with a hilarious story or joke!<|im_end|>


In [None]:
from deepeval.metrics import AnswerRelevancyMetric

metric = AnswerRelevancyMetric(model=hf_model, include_reason=True)

# Replace this with the actual output from your LLM application
actual_output = hf_model.generate("Write me a joke")
test_case = LLMTestCase(
    input="Why did the crab cross the road? It didn’t—it used the sidewalk.",
    actual_output=actual_output
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Output()

ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.

In [None]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase

metric = ToxicityMetric(model=hf_model, threshold=0.5)

input_prompt = "Write me a joke"
output_prompt = hf_model.generate(input_prompt)

test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = output_prompt
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

Output()

ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.

# Loading models (II)

In [16]:
import json
import torch
import transformers
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

In [15]:
class HuggingFaceModel(DeepEvalBaseLLM):
    def __init__(self, model_name: str):
        self.model_name = model_name
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        model_4bit = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            #quantization_config=quantization_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_name
        )

        self.model = model_4bit
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        model = self.load_model()

        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            use_cache=True,
            device_map="auto",
            max_length=2500,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Create parser required for JSON confinement using lmformatenforcer
        #parser = JsonSchemaParser(schema.schema())
        parser = JsonSchemaParser(schema.model_json_schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # Output and load valid JSON
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return self.model_name

In [17]:
#model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "unsloth/Llama-3.2-3B-Instruct"
#model_name = "microsoft/Phi-3.5-mini-instruct"
#model_name = "JunxiongWang/Llama3.2-Mamba-3B-distill"
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
#model_name = "HuggingFaceTB/SmolLM-360M"

hf_model = HuggingFaceModel(model_name = model_name)

In [18]:
# Define a schema for the expected JSON output
class RefSchema(BaseModel):
    joke: str

print(hf_model.generate("Write me a joke", schema=RefSchema))

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


joke='Why did the chicken cross the road? '


In [19]:
RefSchema.model_json_schema()

{'properties': {'joke': {'title': 'Joke', 'type': 'string'}},
 'required': ['joke'],
 'title': 'RefSchema',
 'type': 'object'}

In [None]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase

metric = ToxicityMetric(model=hf_model, threshold=0.5)

input_prompt = "Write me a joke"
output_prompt = hf_model.generate(input_prompt, schema=RefSchema)

test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = output_prompt
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

Device set to use cpu
<ipython-input-21-3030db436ea6>:43: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  parser = JsonSchemaParser(schema.schema())


Output()

Device set to use cpu
Device set to use cpu
Device set to use cpu


1.0
The user used negative words such as 'garbage,' 'tasteless,' and 'weeds,' which suggests they didn't have any knowledge about the topic. They didn't use positive words like 


In [25]:
from deepeval.metrics import AnswerRelevancyMetric

metric = AnswerRelevancyMetric(model=hf_model, include_reason=True)

# Replace this with the actual output from your LLM application
actual_output = hf_model.generate("Write me a joke", schema=RefSchema) # NOTE: THE OUTPUT COULD'VE BEEN GENERATED BY ANOTHER LLM
actual_output = actual_output.joke
test_case = LLMTestCase(
    input="Why did the crab cross the road? It didn’t—it used the sidewalk.",
    actual_output=actual_output
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Device set to use cpu


Output()

Device set to use cpu
Device set to use cpu
Device set to use cpu


0.3333333333333333
It did not use the sidewalk. The crab crossed the street because it was going to the beach.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]Device set to use cpu
Device set to use cpu
Device set to use cpu
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:48, 48.71s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: HuggingFaceTB/SmolLM2-360M-Instruct, reason: The score is 1.0 because <your_reason>., error: None)

For test case:

  - input: Why did the crab cross the road? It didn’t—it used the sidewalk.
  - actual output: Why did the tomato turn red? Because it saw the salad dressing. 
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate







EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.0 because <your_reason>.', strict_mode=False, evaluation_model='HuggingFaceTB/SmolLM2-360M-Instruct', error=None, evaluation_cost=None, verbose_logs='Statements:\n[\n    "because it saw the salad dressing"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The \'because it saw the salad dressing\' statement made in the actual output is irrelevant to the input, which asks about why the crab crossed the road."\n    }\n]')], conversational=False, multimodal=False, input='Why did the crab cross the road? It didn’t—it used the sidewalk.', actual_output='Why did the tomato turn red? Because it saw the salad dressing. ', expected_output=None, context=None, retrieval_context=None)], confident_link=None)

"I'm a person who likes a lot of people. I like to be around them, but I like to be with them in the privacy of my own home. What do you call a person who likes to be around people but not with them?  A...?  A..."

# Inference API

## 1. Common functions

In [None]:
import os
import re
from huggingface_hub import InferenceClient
from deepeval.models.base_model import DeepEvalBaseLLM

In [None]:
def llm_call(
    prompt: str,
    system_prompt: str = "",
    # model="Qwen/Qwen2.5-72B-Instruct",
    model="microsoft/Phi-3.5-mini-instruct",
    # model = "Qwen/QwQ-32B-Preview",
    # model = "Qwen/Qwen2.5-Coder-32B-Instruct",
    # model = "mistralai/Mistral-Nemo-Instruct-2407"
    ) -> str:
    """
    Calls the model with the given prompt and returns the response.

    NOTE: Uses HF Inference API

    Args:
        prompt (str): The user prompt to send to the model.
        system_prompt (str, optional): The system prompt to send to the model. Defaults to "".
        model (str, optional): The model to use for the call. Defaults to "claude-3-5-sonnet-20241022".

    Returns:
        str: The response from the language model.
    """
    dct_params={'max_new_tokens': 1000, 'temperature': 0.1, 'return_full_text': False}
    client = InferenceClient()
    input_prompt = system_prompt + '\n\n' + prompt
    response = client.text_generation(
        input_prompt,
        model=model,
        **dct_params
        )
    return response

In [None]:
import asyncio
from huggingface_hub import AsyncInferenceClient

async def llm_call_async(
    prompt: str,
    system_prompt: str = "",
    model: str = "microsoft/Phi-3.5-mini-instruct"
) -> str:
    """
    Asynchronously calls the model with the given prompt and returns the response.

    NOTE: Uses HF Inference API or equivalent asynchronous API.

    Args:
        prompt (str): The user prompt to send to the model.
        system_prompt (str, optional): The system prompt to send to the model. Defaults to "".
        model (str, optional): The model to use for the call. Defaults to "microsoft/Phi-3.5-mini-instruct".

    Returns:
        str: The response from the language model.
    """
    dct_params = {'max_new_tokens': 1000, 'temperature': 0.1, 'return_full_text': False}
    client = AsyncInferenceClient()  # Initialize the async client
    input_prompt = f"{system_prompt}\n\n{prompt}"  # Combine system and user prompts

    # Call the async text generation function
    response = await client.text_generation(
        input_prompt,
        model=model,
        **dct_params
    )
    return response

In [None]:
class HuggingFaceLLM(DeepEvalBaseLLM):
    def __init__(
        self,
        model
    ):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        return llm_call(prompt, model=self.model)

    async def a_generate(self, prompt: str) -> str:
        res = await llm_call_async(prompt, model=self.model)
        return res

    def get_model_name(self):
        return "Custom HF Model"

# Replace these with real values
#model_name = "microsoft/Phi-3.5-mini-instruct"
# model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
model_name = "Qwen/Qwen2.5-72B-Instruct"
hf_model = HuggingFaceLLM(model=model_name)
print(hf_model.generate("Write me a joke"))

 about a programmer and a cat.
Sure, here's a joke for you:

Why did the programmer name his cat "else"?

Because every time he said "if," the cat meowed "else"! 😄

Hope you enjoyed it! 😊

If you have any more requests or need another joke, feel free to ask! 🐱‍💻

---

If you'd like a different type of joke or another one, just let me know! 😄✨


In [None]:
from deepeval.metrics import AnswerRelevancyMetric

metric = AnswerRelevancyMetric(model=hf_model, include_reason=True, async_mode=False)

# Replace this with the actual output from your LLM application
actual_output = hf_model.generate("Write me a joke")
test_case = LLMTestCase(
    input="Why did the crab cross the road? It didn’t—it used the sidewalk.",
    actual_output=actual_output
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Output()

ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.

In [None]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase

metric = ToxicityMetric(model=hf_model, threshold=0.5, async_mode=False)

input_prompt = "Write me a joke"
output_prompt = hf_model.generate(input_prompt)

test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = output_prompt
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

Output()

ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.

# Inference API (II)

In [26]:
import os
import re
from huggingface_hub import InferenceClient
from deepeval.models.base_model import DeepEvalBaseLLM

In [27]:
def llm_call(
    prompt: str,
    system_prompt: str = "",
    # model="Qwen/Qwen2.5-72B-Instruct",
    model="microsoft/Phi-3.5-mini-instruct",
    # model = "Qwen/QwQ-32B-Preview",
    # model = "Qwen/Qwen2.5-Coder-32B-Instruct",
    # model = "mistralai/Mistral-Nemo-Instruct-2407"
    ) -> str:
    """
    Calls the model with the given prompt and returns the response.

    NOTE: Uses HF Inference API

    Args:
        prompt (str): The user prompt to send to the model.
        system_prompt (str, optional): The system prompt to send to the model. Defaults to "".
        model (str, optional): The model to use for the call. Defaults to "claude-3-5-sonnet-20241022".

    Returns:
        str: The response from the language model.
    """
    dct_params = {'max_new_tokens': 1000, 'temperature': 0.1, 'return_full_text': False}
    client = InferenceClient()
    input_prompt = system_prompt + '\n\n' + prompt
    response = client.text_generation(
        input_prompt,
        model=model,
        **dct_params
        )
    return response

In [28]:
import json
import torch
import transformers
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

In [59]:
import json
from pydantic import BaseModel

class HuggingFaceModel(DeepEvalBaseLLM):
    def __init__(self, model_name: str):
        """
        Initializes the HuggingFaceModel with the given model name.
        """
        self.model_name = model_name

    def load_model(self):
        """
        Returns the model name being used.
        """
        return self.model_name

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """
        Generates a response from the model using llm_call and validates it against the provided schema.
        """
        # Call the model using llm_call
        output = llm_call(prompt=prompt, system_prompt="", model=self.model_name)
        json_result = {'output': output}
        print("Raw output:", output)

        '''
        # Parse the output into JSON
        try:
            json_result = json.loads(output)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON output from model: {e}")
        '''

        # Validate and return the JSON result as a schema instance
        #return schema(**json_result)
        return json_result

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """
        Asynchronous version of the generate method.
        """
        return self.generate(prompt, schema)

    def get_model_name(self):
        """
        Returns the name of the model being used.
        """
        return self.model_name


In [61]:
#model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "unsloth/Llama-3.2-3B-Instruct"
model_name = "microsoft/Phi-3.5-mini-instruct"
#model_name = "JunxiongWang/Llama3.2-Mamba-3B-distill"
#model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
#model_name = "HuggingFaceTB/SmolLM-360M"

hf_model = HuggingFaceModel(model_name = model_name)

In [62]:
# Define a schema for the expected JSON output
class RefSchema(BaseModel):
    joke: str

print(hf_model.generate("Write me a joke", schema=RefSchema))

Raw output:  that involves a cat, a computer, and a misunderstanding.

Certainly! Here's a light-hearted joke involving a cat, a computer, and a misunderstanding:

Why did the cat sit next to the computer during the meeting?

Because it heard the mouse was going to be the "keyboard" to the company's success, but it wanted to make sure it was "purr-fectly" in line with the "cat-astrophic" plan!

(Note: The joke plays on the words "keyboard" and "catastrophic," as well as the idea that a cat might be concerned about its role in a plan, humorously suggesting it's worried about being part of a disastrous scheme.)
{'output': ' that involves a cat, a computer, and a misunderstanding.\n\nCertainly! Here\'s a light-hearted joke involving a cat, a computer, and a misunderstanding:\n\nWhy did the cat sit next to the computer during the meeting?\n\nBecause it heard the mouse was going to be the "keyboard" to the company\'s success, but it wanted to make sure it was "purr-fectly" in line with the 

In [50]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase

metric = ToxicityMetric(model=hf_model, threshold=0.5)

input_prompt = "Write me a joke"
output_prompt = hf_model.generate(input_prompt, schema=RefSchema)

test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = output_prompt
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

Output()

Raw output (I):  that involves a cat, a computer, and a misunderstanding.

Certainly! Here's a light-hearted joke involving a cat, a computer, and a misunderstanding:

Why did the cat sit next to the computer during the meeting?

Because it heard the mouse was going to be the "keyboard" to the company's success, but it wanted to make sure it was "purr-fectly" in line with the "cat-astrophic" plan!

(Note: The joke plays on the words "keyboard" and "catastrophic," as well as the idea that a cat might be concerned about its role in a plan, humorously suggesting it's worried about being part of a disastrous scheme.)


RecursionError: maximum recursion depth exceeded while calling a Python object