In [2]:
import torch
from transformers import AutoTokenizer, Gemma3ForCausalLM

# Checkpoint identifier shared by the tokenizer and the model.
ckpt = "google/gemma-3-1b-it"

# Load the tokenizer and the causal-LM weights from the same checkpoint;
# the weights are requested in bfloat16.
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = Gemma3ForCausalLM.from_pretrained(ckpt, torch_dtype=torch.bfloat16)

In [81]:
# Sample banner text that is interpolated into the prompts of the later cells.
ocr_text = (
    "강남전당 클래시아 1차생활투자골돈크로스 special "
    "당신의 첫번째 성공투자 단350명 선/착/순 2018년 OP타N여정 "
    "분양문의 02-1234-5678 010-1234-5678 드미리건설"
)

# Fallback answer the extraction prompt asks the model to give when nothing is found.
no_info = "정보를 찾을 수 없습니다."

In [82]:
# Classification request: a list holding one conversation made of a system
# turn and a user turn. The user turn embeds `ocr_text` and asks for a
# category plus a legal/illegal judgment.
# NOTE(review): the final "Example (Illegal)" code fence inside the prompt is
# never closed; the prompt text is intentionally left unchanged here.
messages = [
    [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an expert in analyzing the purpose of banners. Your task is to classify the given text into the most relevant category and determine its legality based on strict criteria."
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"""
                                Analyze the meaning of the following banner text and classify it into the most relevant category from the list below.

                                **Banner Text:** {ocr_text}

                                ### Categories:
                                1. Politics
                                2. Public interest
                                3. Weddings and funerals
                                4. School
                                5. Religious ceremonies
                                6. Assembly
                                7. Guidance
                                8. Disaster
                                9. Commercial purposes
                                10. Other

                                ### Judgment Criteria:
                                - If the category is **Politics, Public interest, Weddings and funerals, School, Religious ceremonies, Assembly, Guidance, or Disaster**, then:
                                  **Judgment: "legal"**
                                
                                - If the category is **Commercial purposes or Other**, then:
                                  **Judgment: "illegal"**
                                
                                ### Strict Classification Rules:
                                - If the text includes **business, real estate, or advertisement-related words** (e.g., "rent", "sale", "discount", "contract", "move-in", "real estate", "shopping mall", "office space", "inquiry"), classify it as **Commercial purposes**.
                                - If the text contains a **phone number**, it **must** be classified as **Commercial purposes** and marked as **illegal**.
                                - If the text is unclear, classify it as **Other** and mark it as **illegal**.

                                ### Output Format (Strictly Follow This):
                                ```
                                Category: "category1"
                                Judgment: "legal" or "illegal"
                                ```

                                - **Example (Legal):**
                                ```
                                Category: "Politics"
                                Judgment: "legal"
                                ```

                                - **Example (Illegal):**
                                ```
                                Category: "Commercial purposes"
                                Judgment: "illegal"
                            """
                }
            ]
        }
    ],
]

# Render the conversation with the tokenizer's chat template, tokenize it,
# and move the resulting tensors to the device the model lives on.
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
).to(model.device)

# Prompt length in tokens; used below to cut the prompt off the output.
input_len = inputs["input_ids"].shape[-1]

# Sampling is disabled; at most 200 new tokens are generated.
generation = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# Keep only the newly generated tokens of the first (and only) sequence.
generation = generation[0][input_len:]

# `skip_special_tokens` must be spelled exactly: a misspelled keyword is not
# rejected by `decode`, and control tokens such as <end_of_turn> then remain
# in the decoded text.
decoded = tokenizer.decode(generation, skip_special_tokens=True)
print(decoded)

Okay, let's analyze the banner text and classify it.

**Category: Commercial purposes**

**Judgment: illegal**

**Reasoning:**

The text explicitly mentions “분양문의” (ban-yang mo-un), which translates to “sales inquiry” or “pre-sale inquiries.”  It’s a direct advertisement for a property being sold.  The inclusion of “02-1234-5678” and “010-1234-5678” are phone numbers, clearly intended to generate leads and facilitate sales.  The entire banner is designed to promote a real estate transaction, making it undeniably a commercial purpose.
<end_of_turn>


In [83]:
# Extraction request: a list holding one conversation made of a system turn
# and a user turn. The user turn embeds `ocr_text`, asks for phone numbers
# and a company name, and names `no_info` as the answer when none are found.
messages = [
    [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a helpful assistant who extract phone numbers and company name from text extracted from banners."
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"""
                                Extract all phone numbers and a company name from {ocr_text}. If there are none, answer {no_info}.
                                The output is printed as an example.
                                example)
                                Phone Number: 02-1234-2456, 010-4356-4562
                                Company: 가나다 회사
                                
                            """
                }
            ]
        },
    ],
]

# Render the conversation with the tokenizer's chat template, tokenize it,
# and move the resulting tensors to the device the model lives on.
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
).to(model.device)

# Prompt length in tokens; used below to cut the prompt off the output.
input_len = inputs["input_ids"].shape[-1]

# Sampling is disabled; at most 100 new tokens are generated.
generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
# Keep only the newly generated tokens of the first (and only) sequence.
generation = generation[0][input_len:]

# `skip_special_tokens` must be spelled exactly: a misspelled keyword is not
# rejected by `decode`, and control tokens such as <end_of_turn> then remain
# in the decoded text.
decoded = tokenizer.decode(generation, skip_special_tokens=True)
print(decoded)

Phone Number: 02-1234-5678
Company: 드미리건설<end_of_turn>
