## Qwen2.5 Email Prioritization - Data Labeling

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell happens to find.
%pip install -q tqdm pandas requests



In [None]:
# Install Ollama by piping the install script served from ollama.com into sh.
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
# Start the Ollama server in the background. A foreground `ollama serve` never
# returns, so the cell would block and no later cell could run until it is
# interrupted (which also stops the server). Server logs are written to ollama.log.
!nohup ollama serve > ollama.log 2>&1 &
# Give the server a moment to start listening on 127.0.0.1:11434 before it is used.
!sleep 5

2025/03/28 09:16:52 routes.go:1230: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:2048 OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy:

In [None]:
# Download the qwen2.5 model so the local Ollama server can serve it.
!ollama pull qwen2.5

In [None]:
# Smoke-test / warm up the model with a one-shot prompt. A bare `ollama run qwen2.5`
# opens an interactive chat session, which is not usable from a notebook cell and
# only fills the output with terminal spinner escape codes.
!ollama run qwen2.5 "Reply with the single word: ready"

[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l

In [None]:
import re
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from tqdm import tqdm

from google.colab import drive
# Make Google Drive visible under /content/drive so the dataset can be read.
drive.mount('/content/drive')

# Location of the input CSV on Google Drive; edit this to point at your own batch file.
file_path = "/content/drive/MyDrive/FYPDataset/email_batch_2.csv"
email_data = pd.read_csv(file_path)

# NOTE: this preview is not the final expression of the cell, so nothing is rendered.
email_data.head(10)

# Both text columns are read when labeling each row, so fail fast if either is absent.
required_columns = ("Subject_LLM", "Message_LLM")
if not all(column in email_data.columns for column in required_columns):
    raise ValueError("🚨 ERROR: 'Subject_LLM' or 'Message_LLM' column not found in the dataset. Check the CSV file.")
# Keyword vocabularies for urgency cues and business topics.
# NOTE: nothing in the visible notebook code reads this dict; the labeling prompt
# spells out its own (shorter) keyword lists inline.
work_keywords = dict(
    high_priority=[
        "urgent", "asap", "deadline", "important",
        "critical", "emergency", "immediate attention", "priority",
        "action required", "due today", "overdue", "urgent action",
        "immediate response needed", "time sensitive", "urgent meeting",
        "board meeting", "regulatory", "compliance deadline",
        "immediate action", "pressing", "crucial", "vital",
    ],
    business_terms=[
        "meeting", "project", "client", "report", "presentation",
        "budget", "deadline", "deliverable", "stakeholder", "contract",
        "proposal", "review", "approval", "compliance", "strategy",
        "trading", "energy", "power", "gas", "deal",
        "transaction", "market", "regulatory", "filing", "audit",
        "financial", "board", "executive", "partnership", "agreement",
        "merger", "revenue", "profit", "loss", "investment",
        "stock", "shares",
    ],
)

# Matches the first standalone priority word in the model's reply, ignoring case,
# so decorated answers such as "**High**" or "Priority: low." are still recognised.
_PRIORITY_PATTERN = re.compile(r"\b(high|medium|low)\b", re.IGNORECASE)


def _clean_email_field(value):
    """Return `value` as prompt text, mapping None/NaN (empty CSV cells) to ''."""
    if value is None or (isinstance(value, float) and value != value):  # NaN != NaN
        return ""
    return str(value)


def _parse_priority(text):
    """Return 'High', 'Medium' or 'Low' for the first such word in `text`, else None."""
    match = _PRIORITY_PATTERN.search(text)
    return match.group(1).capitalize() if match else None


def label_email_priority_llama3_3(subject, message, progress_bar):
    """Ask the local Ollama `qwen2.5` model for the priority of one email.

    Despite the historical `llama3_3` suffix in the name, the request is sent to
    the `qwen2.5` model.

    Args:
        subject: Email subject; None/NaN is treated as an empty string.
        message: Email body; None/NaN is treated as an empty string.
        progress_bar: Object with an `update(n)` method (e.g. a tqdm bar); it is
            advanced by exactly one step per call, whatever the outcome.

    Returns:
        "High", "Medium" or "Low" when the model's reply contains one of those
        words (case-insensitive, surrounding punctuation/markdown ignored);
        "Medium" when the reply contains none of them; "Error" when the HTTP
        request fails, returns a non-success status, is not valid JSON, or has
        no text in its `response` field.
    """
    subject = _clean_email_field(subject)
    message = _clean_email_field(message)

    prompt = f"""You are analyzing work-related emails to assign a priority level based on the subject, message content, tone, urgency, and business relevance.
    Your task is to evaluate the email's level of urgency and importance using the following factors:

    - **Subject Line**: Does it indicate a time-sensitive matter or an important business issue (e.g., meeting, deadline, urgent request)?
    - **Message Content**: Does the message contain critical work-related information, such as a request for immediate action, a deadline, an important meeting, or a regulatory issue?
    - **Tone**: Is the tone demanding or expressing urgency (e.g., using terms like 'ASAP', 'immediate attention', 'deadline', or 'urgent action')?
    - **Business Relevance**: Does the email pertain to business operations, projects, meetings, client deadlines, compliance, or other critical work activities?

    **Priority Levels (Choose exactly one based on your analysis):**
    - **High**: The email requires immediate attention, involves urgent or critical business issues, or has a time-sensitive deadline.
    - **Medium**: The email contains important information, but it is not immediately time-sensitive. It should be addressed soon but not immediately.
    - **Low**: The email is non-urgent, contains general information, or can be dealt with later.

    **Keywords to consider for urgency and importance**:
    - Urgent terms: 'urgent', 'asap', 'deadline', 'important', 'critical', 'emergency', 'immediate attention', 'priority', 'action required', 'due today', 'overdue', 'urgent action'.
    - Business terms: 'meeting', 'project', 'client', 'report', 'presentation', 'budget', 'contract', 'proposal', 'stakeholder', 'regulatory', 'audit', 'financial', 'transaction', 'market', 'compliance'.

    **Subject**:
    "{subject}"

    **Message Content**:
    "{message}"

    **Priority (return only one from the list above):**
    """

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                'model': 'qwen2.5',
                'prompt': prompt,
                'stream': False,
                # Ollama reads sampling parameters from `options`; top-level
                # `temperature`/`max_tokens` keys are not honoured, and the
                # token cap is called `num_predict`.
                'options': {
                    'temperature': 0.1,
                    'num_predict': 10,
                },
            },
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        return "Error"
    finally:
        # Advance the bar exactly once per email, on success and failure alike.
        progress_bar.update(1)

    generated = payload.get('response') if isinstance(payload, dict) else None
    if not isinstance(generated, str):
        # e.g. an {"error": ...} body: report it instead of labelling it "Medium".
        return "Error"

    # Unparseable replies keep the original conservative default of "Medium".
    return _parse_priority(generated) or "Medium"

def process_batch_priority(batch, progress_bar):
    """Label every row of `batch` and return the row-wise `apply` result.

    Each row's 'Subject_LLM' and 'Message_LLM' values are passed, together with
    `progress_bar`, to `label_email_priority_llama3_3`.
    """

    def _label_row(row):
        return label_email_priority_llama3_3(row['Subject_LLM'], row['Message_LLM'], progress_bar)

    return batch.apply(_label_row, axis=1)

# Tunables: rows handled per submitted task, and number of concurrent worker threads.
BATCH_SIZE = 100
NUM_WORKERS = 8

# Slice the frame into consecutive chunks of at most BATCH_SIZE rows.
batch_starts = range(0, len(email_data), BATCH_SIZE)
batches = [email_data.iloc[start:start + BATCH_SIZE] for start in batch_starts]

# Label the chunks on a thread pool; every worker advances the same progress bar.
# `executor.map` yields results in submission order, so labels stay aligned with rows.
with tqdm(total=len(email_data), desc="🚀 Processing Emails") as progress_bar, \
        ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    results = list(executor.map(process_batch_priority, batches, [progress_bar] * len(batches)))

# Flatten the per-batch results into one label per row, in original row order.
priorities = []
for batch_labels in results:
    priorities.extend(batch_labels)
email_data["Email_Priority"] = priorities

# Persist the labeled dataset back to Google Drive.
output_file = "/content/drive/MyDrive/FYPDataset/email_batch_2_priority_qwen2.5.csv"
email_data.to_csv(output_file, index=False)

# Summarise the label distribution and confirm where the file was written.
print(email_data["Email_Priority"].value_counts())

print(f"✅ Processed {len(email_data)} emails.")
print(f"✅ Processed file saved at: {output_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


🚀 Processing Emails: 100%|██████████| 20000/20000 [3:11:38<00:00,  1.74it/s]


Email_Priority
Medium    19802
Low         131
High         67
Name: count, dtype: int64
✅ Processed 20000 emails.
✅ Processed file saved at: /content/drive/MyDrive/FYPDataset/email_batch_2_priority_qwen2.5.csv
