In [None]:
# Cell 1: Setup Kaggle API, Mount Drive, and Install Libraries

# Install libraries
!pip install -q kaggle transformers[torch] pandas scikit-learn gradio torch sentencepiece # sentencepiece for tokenizer

import os
from google.colab import files
from google.colab import drive
import shutil
import subprocess

# 1. Mount Google Drive
# force_remount=True is passed so re-running this cell remounts rather than reusing an existing mount.
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"ERROR: Could not mount Google Drive: {e}")
    raise SystemExit("Drive mount failed.")

# 2. Configure Kaggle API
# The Kaggle CLI reads its credentials from ~/.kaggle/kaggle.json.
KAGGLE_CONFIG_DIR = os.path.expanduser("~/.kaggle")
KAGGLE_JSON_TARGET_PATH = os.path.join(KAGGLE_CONFIG_DIR, "kaggle.json")
print(f"Target Kaggle config path: {KAGGLE_JSON_TARGET_PATH}")

# Clean up credentials left by a previous run of this cell (session root and
# config directory) so the fresh upload is the only copy in use.
for stale_path in ("kaggle.json", KAGGLE_JSON_TARGET_PATH):
    if os.path.exists(stale_path):
        os.remove(stale_path)

print("\nACTION REQUIRED: Please upload your 'kaggle.json' file now.")
uploaded_files = files.upload()
if 'kaggle.json' not in uploaded_files:
    print("ERROR: 'kaggle.json' not in uploaded files.")
    raise SystemExit("Kaggle.json not uploaded.")

print("\n'kaggle.json' uploaded to Colab session root successfully!")
os.makedirs(KAGGLE_CONFIG_DIR, exist_ok=True)
try:
    shutil.move('kaggle.json', KAGGLE_JSON_TARGET_PATH)
    print(f"'kaggle.json' moved to {KAGGLE_JSON_TARGET_PATH}")
except Exception as e:
    print(f"ERROR moving 'kaggle.json': {e}")
    raise SystemExit("Kaggle.json move failed.")

if not os.path.exists(KAGGLE_JSON_TARGET_PATH):
    print(f"ERROR: {KAGGLE_JSON_TARGET_PATH} does not exist after move.")
    raise SystemExit("Kaggle.json placement failed.")

# Restrict the credentials file to the owner (read/write only).
os.chmod(KAGGLE_JSON_TARGET_PATH, 0o600)
print(f"Permissions set for {KAGGLE_JSON_TARGET_PATH}")

# Smoke-test the credentials with a cheap, read-only CLI call.
print("\nVerifying Kaggle API access...")
try:
    result = subprocess.run(['kaggle', 'competitions', 'list', '-p', '1', '--csv'],
                            capture_output=True, text=True, check=True)
except subprocess.CalledProcessError as e:
    # str(e) only carries the exit status; the CLI's own explanation is in the
    # captured streams, so print those too.
    print(f"ERROR: Kaggle API verification failed: {e}")
    if e.stdout:
        print(f"   Stdout: {e.stdout.strip()}")
    if e.stderr:
        print(f"   Stderr: {e.stderr.strip()}")
    raise SystemExit("Kaggle API verification failed.")
except Exception as e:
    # e.g. the `kaggle` executable is not on PATH.
    print(f"ERROR: Kaggle API verification failed: {e}")
    raise SystemExit("Kaggle API verification failed.")

print("Kaggle API Verified! Output (first 5 lines):")
print('\n'.join(result.stdout.splitlines()[:5]))
print("\nCell 1: Setup Complete. Proceed to Cell 2.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.2/54.2 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.1/323.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving kaggle.json to kaggle.json

'kaggle.json' uploaded to Colab session root successfully!
'kaggle.json' moved to /root/.kaggle/kaggle.json
Permissions set for /root/.kaggle/kaggle.json

Verifying Kaggle API access...
Kaggle API Verified! Output (first 5 lines):
ref,deadline,category,reward,teamCount,userHasEntered
https://www.kaggle.com/competitions/arc-prize-2025,2025-11-03 23:59:00,Featured,"1,000,000 Usd",407,False
https://www.kaggle.com/competitions/openai-to-z-challenge,2025-06-29 23:59:00,Featured,"400,000 Usd",0,False
https://www.kaggle.com/competitions/stanford-rna-3d-folding,2025-05-29 23:59:00,Featured,"75,000 Usd",1447,False
https://www.kaggle.com/competitions/byu-locating-bacterial-flagellar-motors-2025,2025-06-04 23:59:00,Research,"65,000 Usd",1070,False

Cell 1: Setup Complete. Proceed to Cell 2.


In [None]:
# ==============================================================================
# SentimentAnalysisAgent for Customer Reviews - Main Application Code (REVISED FOR CLARITY)
# ==============================================================================
import gradio as gr
import pandas as pd
import numpy as np
import os
import random
import re
from typing import List, Dict, Tuple
from enum import Enum
import zipfile # Though this dataset might not be zipped by default via API

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.cuda.amp import GradScaler, autocast # For Mixed Precision Training

# --- Configuration ---
# Hugging Face checkpoint name used for both the tokenizer and the classifier.
MODEL_NAME = "distilbert-base-uncased"

# Fine-tuned weights are written under the mounted Google Drive, so this cell
# expects the Drive mount from Cell 1 to be in place. The directory is created
# as a side effect of running this cell.
DRIVE_BASE_PATH = "/content/drive/My Drive/"
MODEL_FOLDER_NAME = "Colab_Models/SentimentAnalysisAgent_Airline_Clear" # New folder
MODEL_SAVE_PATH = os.path.join(DRIVE_BASE_PATH, MODEL_FOLDER_NAME)
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

# Kaggle dataset identifier and the local directory it is downloaded/unzipped into
# (also created as a side effect of running this cell).
KAGGLE_DATASET_SLUG = "crowdflower/twitter-airline-sentiment"
KAGGLE_DOWNLOAD_PATH = "/content/kaggle_data/airline_sentiment_clear" # New download path
os.makedirs(KAGGLE_DOWNLOAD_PATH, exist_ok=True)

# Sentiment labels for this dataset
# LABEL_TO_ID maps the CSV's sentiment strings to class ids; ID_TO_LABEL is its inverse.
LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}
ID_TO_LABEL = {v: k for k, v in LABEL_TO_ID.items()}
NUM_LABELS = len(LABEL_TO_ID)

# Training hyperparameters. MAX_LEN is the tokenizer max_length used for padding/truncation.
MAX_LEN = 128 # Tweets are relatively short
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
# NOTE: EPOCHS is reassigned at runtime by gradio_retrain_sentiment_model (global statement).
EPOCHS = 3
LEARNING_RATE = 2e-5 # Common fine-tuning LR
RANDOM_SEED = 42
USE_AMP = torch.cuda.is_available() # Enable Mixed Precision if GPU is available

# --- Setup Random Seeds ---
# Seed Python's `random`, NumPy and PyTorch (CPU, plus all CUDA devices when available).
random.seed(RANDOM_SEED); np.random.seed(RANDOM_SEED); torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(RANDOM_SEED)

# ========================
# --- Dataset Class ---
# ========================
class ReviewDataset(Dataset):
    """Torch dataset pairing review texts with integer labels, tokenized on access.

    Args:
        texts: Review strings (each item is coerced with ``str`` when read).
        labels: Integer class ids, index-aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer in this file).
        max_len: Value passed as ``max_length``; sequences are padded/truncated to it.
    """

    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_len: int):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item_idx):
        """Return a dict with the raw text, flattened token tensors and a long label tensor."""
        raw_text = str(self.texts[item_idx])
        target = self.labels[item_idx]

        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_token_type_ids': False,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output is flattened so each sample is a 1-D tensor
        # (presumably it arrives with a leading batch axis of 1 — TODO confirm).
        sample = {
            'text': raw_text,
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(target, dtype=torch.long),
        }
        return sample

# ========================
# --- Helper Function for Data Processing ---
# ========================
def preprocess_review_text(text: str) -> str:
    """Normalize a tweet/review to lowercase letters separated by single spaces.

    The input is coerced with ``str`` and lowercased, then the substitutions
    below run in order; the result is stripped of surrounding whitespace.
    """
    # (pattern, replacement, flags) applied sequentially — order matters.
    cleanup_passes = (
        (r'@\w+', '', 0),                                      # mentions like @airline
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),        # URLs
        (r'#', '', 0),                                         # hashtag symbol, keep the word
        (r'[^a-z\s]', '', 0),                                  # keep only letters and whitespace
        (r'\s+', ' ', 0),                                      # collapse whitespace runs
    )
    cleaned = str(text).lower()
    for pattern, replacement, regex_flags in cleanup_passes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=regex_flags)
    return cleaned.strip()

def load_and_process_airline_tweets(csv_file_path: str, label_to_id_map: Dict[str, int], max_samples=None) -> Tuple[List[str], List[int]]:
    """Load Tweets.csv, clean the text, and map sentiment strings to label ids.

    Args:
        csv_file_path: Path to the CSV; it must contain ``airline_sentiment`` and ``text`` columns.
        label_to_id_map: Sentiment string -> integer id. Rows whose sentiment is not a key are dropped.
        max_samples: If truthy and smaller than the number of usable rows, a random
            sample of that size is taken (seeded with ``RANDOM_SEED``).

    Returns:
        ``(texts, labels)`` as index-aligned lists. ``([], [])`` is returned when the
        file cannot be read, a required column is missing, or no usable rows remain.
    """
    try:
        raw_df = pd.read_csv(csv_file_path, encoding='latin1') # This dataset often needs latin1
        print(f"Successfully loaded {csv_file_path}. Shape: {raw_df.shape}")
    except Exception as e:
        print(f"🚨 Error reading CSV file {csv_file_path}: {e}")
        return [], []

    # A CSV with an unexpected schema is reported like any other load failure
    # instead of surfacing as a KeyError in the caller.
    required_columns = ['airline_sentiment', 'text']
    missing_columns = [col for col in required_columns if col not in raw_df.columns]
    if missing_columns:
        print(f"🚨 CSV file {csv_file_path} is missing required column(s): {missing_columns}")
        return [], []

    # Select relevant columns and drop rows with missing crucial data
    tweets = raw_df[required_columns].dropna(subset=required_columns)

    # Filter for the sentiments we are interested in (the keys of label_to_id_map)
    tweets = tweets[tweets['airline_sentiment'].isin(list(label_to_id_map))].copy()
    if tweets.empty:
        print(f"🚨 No rows with a recognised sentiment label found in {csv_file_path}.")
        return [], []

    tweets['processed_text'] = tweets['text'].apply(preprocess_review_text)
    tweets = tweets[tweets['processed_text'].str.strip() != ''].copy() # Remove rows with empty text after processing
    tweets['label'] = tweets['airline_sentiment'].map(label_to_id_map)

    if max_samples and max_samples < len(tweets):
        print(f"Sampling {max_samples} rows from available {len(tweets)} rows.")
        tweets = tweets.sample(n=max_samples, random_state=RANDOM_SEED)

    texts = tweets['processed_text'].tolist()
    labels = tweets['label'].tolist()

    found_labels = set(labels)
    print(f"Processed data: {len(texts)} samples. Unique labels found: {found_labels}")
    if len(found_labels) < len(label_to_id_map):
        print(f"⚠️ Warning: Not all expected labels ({len(label_to_id_map)}) were found in the processed data. Found: {len(found_labels)}. This might affect training if some classes are missing.")

    return texts, labels

# ========================
# --- AI Agents ---
# ========================
class DataIngestionAgent:
    """Makes sure the Kaggle dataset's ``Tweets.csv`` is present in a local directory.

    Attributes:
        download_path: Directory the dataset is downloaded and unzipped into.
        dataset_slug: Kaggle dataset identifier passed to the CLI's ``-d`` option.
        csv_file_path: ``<download_path>/Tweets.csv``, the file the rest of the pipeline reads.
    """

    def __init__(self, download_path: str, dataset_slug: str):
        self.download_path = download_path
        self.dataset_slug = dataset_slug
        # Main data file for this dataset
        self.csv_file_path = os.path.join(self.download_path, "Tweets.csv")

    def ensure_data_downloaded(self) -> bool:
        """Download the dataset with the Kaggle CLI unless the CSV already exists.

        Returns:
            True when the CSV is present (already there, or after a successful
            download); False when the CLI call fails or the expected file is
            still missing afterwards. Failures are printed, not raised.
        """
        import subprocess

        if os.path.exists(self.csv_file_path):
            print(f"Dataset file {self.csv_file_path} already exists. Skipping download.")
            return True

        print(f"Downloading Kaggle dataset '{self.dataset_slug}' to '{self.download_path}'...")
        try:
            # This dataset is simple, usually just --unzip works directly.
            download_cmd = [
                'kaggle', 'datasets', 'download',
                '-d', self.dataset_slug,
                '-p', self.download_path,
                '--unzip', '--force',
            ]
            print(f"Executing: {' '.join(download_cmd)}")
            # check=True turns a non-zero exit status into CalledProcessError.
            subprocess.run(download_cmd, capture_output=True, text=True, check=True)

            if os.path.exists(self.csv_file_path):
                print("Kaggle dataset downloaded and unzipped successfully.")
                return True

            print(f"🚨 Expected file {self.csv_file_path} not found after download and unzip.")
            print(f"   Files in download path: {os.listdir(self.download_path)}")
            return False
        except subprocess.CalledProcessError as cli_error:
            print(f"🚨 Kaggle download command failed (code {cli_error.returncode}).")
            print(f"   Stdout: {cli_error.stdout.strip()}")
            print(f"   Stderr: {cli_error.stderr.strip()}")
            return False
        except Exception as other_error:
            print(f"🚨 An error occurred during Kaggle dataset download/unzip: {other_error}")
            return False

class SentimentModelAgent:
    """Owns the tokenizer and sequence-classification model: init/load, fine-tune, save, predict.

    On construction the agent loads a previously saved model from ``model_save_path``
    when a weight file is found there; otherwise it initialises a fresh classifier
    from ``model_name``. ``is_trained_custom`` records which of the two happened
    (and is set to True after ``train_model`` completes).

    Args:
        model_name: Hugging Face checkpoint name for the tokenizer and base model.
        model_save_path: Directory used by ``save_model`` / ``load_model``.
        num_labels: Number of output classes.
        id_to_label: Class id -> label string.
        label_to_id: Label string -> class id.
    """

    # Weight file names looked for in a saved model directory. Both the
    # safetensors and the legacy .bin name are accepted so a model written by
    # ``save_pretrained`` is found regardless of the serialization format used.
    _CHECKPOINT_FILENAMES = ("model.safetensors", "pytorch_model.bin")

    def __init__(self, model_name: str, model_save_path: str, num_labels: int, id_to_label: Dict[int, str], label_to_id: Dict[str, int]):
        self.model_name, self.model_save_path, self.num_labels = model_name, model_save_path, num_labels
        self.id_to_label, self.label_to_id = id_to_label, label_to_id
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"🤖 SentimentModelAgent on: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = None
        self.is_trained_custom = False
        self.scaler = GradScaler(enabled=USE_AMP)
        if self._has_saved_checkpoint(self.model_save_path):
            self.load_model()
        else:
            self._initialize_new_model()

    @staticmethod
    def _has_saved_checkpoint(model_dir: str) -> bool:
        """Return True if ``model_dir`` contains one of the known weight files."""
        return any(
            os.path.isfile(os.path.join(model_dir, filename))
            for filename in SentimentModelAgent._CHECKPOINT_FILENAMES
        )

    def _initialize_new_model(self):
        """Create a fresh classifier from the base checkpoint and mark it as untrained."""
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=self.num_labels,
            id2label=self.id_to_label, label2id=self.label_to_id).to(self.device)
        self.is_trained_custom = False
        print("Initialized new base sentiment model.")

    def load_model(self):
        """Load the saved model from ``model_save_path``; fall back to a new base model on any error."""
        try:
            print(f"Loading sentiment model from {self.model_save_path}...")
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_save_path).to(self.device)
            self.is_trained_custom = True
            print(f"✅ Custom sentiment model loaded.")
        except Exception as e:
            print(f"🚨 Failed to load sentiment model: {e}. Init new.")
            self._initialize_new_model()

    def save_model(self):
        """Persist the model to ``model_save_path``; a no-op unless a trained model exists."""
        if self.model and self.is_trained_custom:
            os.makedirs(self.model_save_path, exist_ok=True)
            self.model.save_pretrained(self.model_save_path)
            print(f"💾 Sentiment model saved to {self.model_save_path}")

    def _create_data_loader(self, texts, labels, batch_size, is_train=True):
        """Wrap texts/labels in a ``ReviewDataset`` DataLoader; shuffles only when ``is_train``."""
        return DataLoader(ReviewDataset(texts, labels, self.tokenizer, MAX_LEN), batch_size=batch_size,
                          shuffle=is_train, num_workers=2, pin_memory=USE_AMP)

    def train_model(self, train_texts, train_labels, val_texts, val_labels):
        """Fine-tune for ``EPOCHS`` epochs, print a validation report per epoch, then save.

        This method expects ALREADY PREPROCESSED texts and mapped labels. Training
        continues from the current model weights; a new base model is created only
        when no model exists or its label count differs from ``num_labels``.
        """
        if not self.model or self.model.config.num_labels != self.num_labels:
            self._initialize_new_model()
        train_dl = self._create_data_loader(train_texts, train_labels, TRAIN_BATCH_SIZE, True)
        val_dl = self._create_data_loader(val_texts, val_labels, VALID_BATCH_SIZE, False)
        opt = AdamW(self.model.parameters(), lr=LEARNING_RATE)
        total_steps = len(train_dl) * EPOCHS
        # Linear schedule with warmup over the first 10% of steps (at least one step).
        sched = get_linear_schedule_with_warmup(opt, num_warmup_steps=max(1, int(total_steps * 0.1)),
                                                num_training_steps=total_steps)
        # Pass explicit label ids to the report so it still works (and stays
        # aligned with target_names) when a class is absent from validation data.
        report_label_ids = sorted(self.id_to_label)
        report_label_names = [self.id_to_label[label_id] for label_id in report_label_ids]
        log_every = max(1, len(train_dl) // 10)  # Log 10x per epoch
        print(f"🚀 Training sentiment model for {EPOCHS} epochs. AMP: {USE_AMP}")
        for epoch in range(EPOCHS):
            self.model.train()
            total_loss = 0.0
            for idx, batch in enumerate(train_dl):
                ids = batch['input_ids'].to(self.device)
                mask = batch['attention_mask'].to(self.device)
                labs = batch['labels'].to(self.device)
                opt.zero_grad()
                with autocast(enabled=USE_AMP):
                    outputs = self.model(ids, attention_mask=mask, labels=labs)
                    loss = outputs.loss
                # Scaled backward/step for mixed precision; the scaler is a pass-through when AMP is off.
                self.scaler.scale(loss).backward()
                self.scaler.step(opt)
                self.scaler.update()
                sched.step()
                total_loss += loss.item()
                if idx > 0 and idx % log_every == 0:
                    print(f"  E{epoch+1} B{idx}/{len(train_dl)} L:{loss.item():.4f}")
            avg_loss = total_loss / len(train_dl) if len(train_dl) else 0
            print(f"\n  E{epoch+1} AvgTrainL:{avg_loss:.4f}")

            self.model.eval()
            all_preds, all_true = [], []
            with torch.no_grad():
                for batch in val_dl:
                    ids = batch['input_ids'].to(self.device)
                    mask = batch['attention_mask'].to(self.device)
                    labs = batch['labels'].to(self.device)
                    with autocast(enabled=USE_AMP):
                        outputs = self.model(ids, attention_mask=mask)
                    all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
                    all_true.extend(labs.cpu().numpy())
            print(f"--- Validation Report E{epoch+1} ---")
            if all_true and all_preds:
                print(classification_report(all_true, all_preds, labels=report_label_ids,
                                            target_names=report_label_names, zero_division=0))
            else:
                print("Not enough data for validation report.")
            print("--- End Report ---")
        self.is_trained_custom = True
        self.save_model()
        print("✅ Sentiment training done.")

    def predict_sentiment(self, preprocessed_text: str) -> Tuple[str, float]:
        """Classify one already-preprocessed text.

        Returns:
            ``(label, confidence)`` where confidence is the softmax probability of
            the predicted class; ``("ERR: Model missing", 0.0)`` when no model is set.
        """
        if not self.model:
            return "ERR: Model missing", 0.0
        self.model.eval()
        enc = self.tokenizer.encode_plus(preprocessed_text, add_special_tokens=True, max_length=MAX_LEN,
                                         padding='max_length', truncation=True,
                                         return_attention_mask=True, return_tensors='pt')
        ids, mask = enc['input_ids'].to(self.device), enc['attention_mask'].to(self.device)
        with torch.no_grad(), autocast(enabled=USE_AMP):
            logits = self.model(ids, attention_mask=mask).logits
        # Cast to float32 before softmax output reaches NumPy (logits may be half precision under AMP).
        probs = torch.softmax(logits, dim=1).cpu().float().numpy()[0]
        # Convert the NumPy integer to a plain int so the id_to_label lookup uses the same key type.
        pred_id = int(np.argmax(probs))
        conf = probs[pred_id]
        return self.id_to_label.get(pred_id, "UNKNOWN"), float(conf)

class ResponseSuggestionAgent:
    """Turns a predicted sentiment (and its confidence) into a follow-up recommendation."""

    def suggest_follow_up(self, sentiment: str, confidence: float) -> str:
        """Return the follow-up text for ``sentiment``.

        Only the "negative" branch looks at ``confidence``: above 0.7 it asks for
        escalation, otherwise for acknowledgement. Unrecognised sentiments get a
        generic fallback message.
        """
        if sentiment == "negative":
            if confidence > 0.7:
                return "This needs attention. Escalate to support for follow-up."
            return "Acknowledge and investigate further."

        # Sentiments whose advice does not depend on confidence.
        fixed_advice = (
            ("positive", "Great to hear! Consider sending a thank you note."),
            ("neutral", "Feedback noted. Monitor for trends."),
        )
        for known_sentiment, advice in fixed_advice:
            if sentiment == known_sentiment:
                return advice
        return "No specific follow-up suggestion."

class CoordinatorAgent:
    """Wires the data-ingestion, model and response agents into a single entry point.

    Constructing the coordinator builds all three agents and, when the model agent
    did not load a previously trained model, immediately runs the full
    download -> preprocess -> train pipeline on all available samples.
    """

    def __init__(self):
        self.data_ingestion_agent = DataIngestionAgent(KAGGLE_DOWNLOAD_PATH, KAGGLE_DATASET_SLUG)
        self.model_agent = SentimentModelAgent(MODEL_NAME, MODEL_SAVE_PATH, NUM_LABELS, ID_TO_LABEL, LABEL_TO_ID)
        self.response_agent = ResponseSuggestionAgent()
        if not self.model_agent.is_trained_custom:
            print("Coordinator: Sentiment model not trained. Initiating training...")
            self.full_data_pipeline_and_train(max_samples=None) # Use all samples from CSV by default
        else:
            print("Coordinator: Custom sentiment model loaded.")

    def full_data_pipeline_and_train(self, max_samples=None):
        """Coordinates data download, processing, and model training.

        Args:
            max_samples: Optional cap on the number of rows used (None = all rows).

        Returns:
            A status string: a success message, or a "🚨 COORDINATOR: ..." message
            describing why training was halted. Pipeline failures are reported
            through this return value rather than raised.
        """
        print(f"Coordinator: Starting full data pipeline (max_samples={max_samples})...")
        if not self.data_ingestion_agent.ensure_data_downloaded():
            msg = "🚨 COORDINATOR: Failed to download data. Training halted."
            print(msg)
            return msg

        # Load and process data using the helper function
        texts, labels = load_and_process_airline_tweets(
            self.data_ingestion_agent.csv_file_path, # Use the path from data_ingestion_agent
            LABEL_TO_ID,
            max_samples=max_samples
        )
        if not texts:
            msg = "🚨 COORDINATOR: Failed to load and process data. Training halted."
            print(msg)
            return msg

        # Split data. A stratified split raises ValueError when it cannot be
        # satisfied (e.g. a class with a single sample); report that like the
        # other halt conditions instead of letting it escape to the caller.
        try:
            train_texts, val_texts, train_labels, val_labels = train_test_split(
                texts, labels, test_size=0.15, random_state=RANDOM_SEED, stratify=labels
            )
        except ValueError as e:
            msg = f"🚨 COORDINATOR: Could not create a stratified train/validation split: {e}. Training halted."
            print(msg)
            return msg

        if not train_texts or len(set(train_labels)) < NUM_LABELS:
            msg = f"🚨 COORDINATOR: Insufficient data or missing classes after split for training. Needed {NUM_LABELS}, got {len(set(train_labels))}. Training halted."
            print(msg)
            return msg

        self.model_agent.train_model(train_texts, train_labels, val_texts, val_labels)
        return "Sentiment model training complete."

    def process_review(self, review_text: str) -> Tuple[str, str]:
        """Classify one raw review and suggest a follow-up.

        Returns:
            ``(assessment, suggestion)`` strings. When the model is not trained the
            assessment is "Model not ready." and the suggestion is empty; when the
            review contains no text after preprocessing, no prediction is made.
        """
        if not self.model_agent or not self.model_agent.is_trained_custom:
            return "Model not ready.", ""
        preprocessed_text = preprocess_review_text(review_text) # Use the global helper
        if not preprocessed_text:
            # Training discards rows that are empty after preprocessing, so a
            # prediction on empty text would not be meaningful.
            return "No analyzable text in review.", ""
        sentiment, confidence = self.model_agent.predict_sentiment(preprocessed_text)
        suggestion = self.response_agent.suggest_follow_up(sentiment, confidence)
        return f"Sentiment: {sentiment.upper()} (Confidence: {confidence:.2f})", suggestion

    def get_model_status(self) -> str:
        """Return a multi-line summary of the model name, training status and save path."""
        if not self.model_agent:
            return "Model agent N/A."
        status = "Trained" if self.model_agent.is_trained_custom else "Base (Not Trained)"
        return f"Model: {MODEL_NAME}\nType: Sentiment Analysis\nStatus: {status}\nSave: {MODEL_SAVE_PATH}"

# ========================
# --- Gradio Interface ---
# ========================
# Build the single module-level coordinator that the Gradio callbacks below use.
# The constructor runs the full download + training pipeline when no trained
# model was loaded, so this line can take a long time on first run.
print("Initializing Sentiment CoordinatorAgent for Gradio...")
coordinator = CoordinatorAgent() # This will trigger data download & training if needed
print("Sentiment CoordinatorAgent Initialized.")

def gradio_analyze_sentiment(review_text):
    """Gradio click handler: forward the review to the module-level coordinator.

    Returns whatever ``coordinator.process_review`` returns (an assessment string
    and a follow-up suggestion), which Gradio maps onto the two output boxes.
    """
    assessment_and_suggestion = coordinator.process_review(review_text)
    return assessment_and_suggestion
def gradio_retrain_sentiment_model(max_samples_str, epochs_str):
    """Gradio click handler: validate the form values and start re-training.

    Args:
        max_samples_str: Text of the "Max Samples" box; blank (or None) means all rows.
        epochs_str: Text of the "Epochs" box.

    Returns:
        A validation message ("Invalid numeric input.", "Min 1000 samples.",
        "Epochs: 1-5.") or the status string from
        ``coordinator.full_data_pipeline_and_train``.

    The module-level ``EPOCHS`` is only updated once every input has passed
    validation, so a rejected request leaves the training configuration untouched.
    """
    global EPOCHS
    try:
        samples_text = "" if max_samples_str is None else str(max_samples_str).strip()
        max_samples = int(samples_text) if samples_text else None
        requested_epochs = int(epochs_str)
    except (TypeError, ValueError):
        return "Invalid numeric input."

    if max_samples is not None and max_samples < 1000:
        return "Min 1000 samples."
    if not 1 <= requested_epochs <= 5:
        return "Epochs: 1-5."

    # Inputs are valid: commit the new epoch count, which train_model reads from the global.
    EPOCHS = requested_epochs
    print(f"UI Retrain Sentiment: max_samples={max_samples}, epochs={EPOCHS}.")
    # Optional: Reset model for full retrain from scratch from UI
    # coordinator.model_agent._initialize_new_model()
    return coordinator.full_data_pipeline_and_train(max_samples=max_samples)

# Build the two-tab UI: one tab to analyze a review, one to inspect and re-train the model.
with gr.Blocks(title="😊 Sentiment Analysis Agent (Clearer)", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("# 😊 Customer Review Sentiment Analysis Agent (Clearer Flow)")
    gr.Markdown(f"Analyzes customer reviews using `{MODEL_NAME}`. Dataset: `{KAGGLE_DATASET_SLUG}`")
    with gr.Tabs():
        with gr.Tab("💬 Analyze Review"):
            review_input = gr.Textbox(label="Enter Customer Review", lines=5, placeholder="e.g., This airline is the worst, lost my luggage!")
            analyze_button = gr.Button("Analyze Sentiment", variant="primary")
            sentiment_output = gr.Textbox(label="Sentiment Assessment", interactive=False)
            suggestion_output = gr.Textbox(label="Suggested Follow-up", interactive=False, lines=2)
        with gr.Tab("⚙️ Model Management"):
            model_status_button = gr.Button("Refresh Model Status")
            model_status_output = gr.Textbox(label="Current Model Status", lines=4, interactive=False)
            gr.Markdown("---"); gr.Markdown("### 🔄 Re-train Model")
            # The data agent skips the download when Tweets.csv is already present and
            # training continues from the current weights, so the text says exactly that.
            gr.Markdown(f"Uses the Kaggle dataset (`{KAGGLE_DATASET_SLUG}`), downloading it only if it is not already cached, and continues fine-tuning the current model.")
            with gr.Row():
                retrain_samples = gr.Textbox(label="Max Samples (blank=all)", placeholder="e.g., 10000")
                retrain_epochs = gr.Textbox(label="Epochs", value=str(EPOCHS))
            retrain_button = gr.Button("Start Full Re-training", variant="stop")
            retrain_status_output = gr.Textbox(label="Re-training Status", interactive=False, lines=2)

    # Wire the buttons to their handlers; the status box is also filled once when the page loads.
    analyze_button.click(gradio_analyze_sentiment, inputs=review_input, outputs=[sentiment_output, suggestion_output])
    model_status_button.click(coordinator.get_model_status, inputs=None, outputs=model_status_output)
    retrain_button.click(gradio_retrain_sentiment_model, inputs=[retrain_samples, retrain_epochs], outputs=retrain_status_output)
    demo.load(coordinator.get_model_status, inputs=None, outputs=model_status_output)

print(f"🚀 Launching Sentiment Analysis Agent System (Dataset: {KAGGLE_DATASET_SLUG}, Clearer Flow)...")
# NOTE(review): share=True publishes a temporary public URL, and the UI exposes the
# re-training button to anyone holding that link — confirm this is intended.
# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True, share=True)

Initializing Sentiment CoordinatorAgent for Gradio...
🤖 SentimentModelAgent on: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  self.scaler = GradScaler(enabled=USE_AMP)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized new base sentiment model.
Coordinator: Sentiment model not trained. Initiating training...
Coordinator: Starting full data pipeline (max_samples=None)...
Downloading Kaggle dataset 'crowdflower/twitter-airline-sentiment' to '/content/kaggle_data/airline_sentiment_clear'...
Executing: kaggle datasets download -d crowdflower/twitter-airline-sentiment -p /content/kaggle_data/airline_sentiment_clear --unzip --force
Kaggle dataset downloaded and unzipped successfully.
Successfully loaded /content/kaggle_data/airline_sentiment_clear/Tweets.csv. Shape: (14640, 15)
Processed data: 14640 samples. Unique labels found: {0, 1, 2}
🚀 Training sentiment model for 3 epochs. AMP: True


  with autocast(enabled=USE_AMP): outputs = self.model(ids,attention_mask=mask,labels=labs); loss = outputs.loss


  E1 B77/778 L:0.9399
  E1 B154/778 L:0.7408
  E1 B231/778 L:0.7236
  E1 B308/778 L:0.8143
  E1 B385/778 L:0.2346
  E1 B462/778 L:0.3682
  E1 B539/778 L:0.4313
  E1 B616/778 L:0.2051
  E1 B693/778 L:0.3747
  E1 B770/778 L:0.5835

  E1 AvgTrainL:0.5757


  with autocast(enabled=USE_AMP): outputs = self.model(ids,attention_mask=mask)


--- Validation Report E1 ---
              precision    recall  f1-score   support

    negative       0.90      0.89      0.89      1377
     neutral       0.67      0.65      0.66       465
    positive       0.72      0.77      0.75       354

    accuracy                           0.82      2196
   macro avg       0.76      0.77      0.77      2196
weighted avg       0.82      0.82      0.82      2196

--- End Report ---


  with autocast(enabled=USE_AMP): outputs = self.model(ids,attention_mask=mask,labels=labs); loss = outputs.loss


  E2 B77/778 L:0.2076
  E2 B154/778 L:0.3799
  E2 B231/778 L:0.2622
  E2 B308/778 L:0.1148
  E2 B385/778 L:0.2793
  E2 B462/778 L:1.0063
  E2 B539/778 L:1.0021
  E2 B616/778 L:0.0708
  E2 B693/778 L:0.2685
  E2 B770/778 L:0.3855

  E2 AvgTrainL:0.3394


  with autocast(enabled=USE_AMP): outputs = self.model(ids,attention_mask=mask)


--- Validation Report E2 ---
              precision    recall  f1-score   support

    negative       0.89      0.91      0.90      1377
     neutral       0.67      0.68      0.68       465
    positive       0.80      0.71      0.75       354

    accuracy                           0.83      2196
   macro avg       0.79      0.77      0.77      2196
weighted avg       0.83      0.83      0.83      2196

--- End Report ---


  with autocast(enabled=USE_AMP): outputs = self.model(ids,attention_mask=mask,labels=labs); loss = outputs.loss


  E3 B77/778 L:0.3381
  E3 B154/778 L:0.0564
  E3 B231/778 L:0.4615
  E3 B308/778 L:0.0392
  E3 B385/778 L:0.2823
  E3 B462/778 L:0.2216
  E3 B539/778 L:0.2098
  E3 B616/778 L:0.4231
  E3 B693/778 L:0.0614
  E3 B770/778 L:0.2270

  E3 AvgTrainL:0.2309


  with autocast(enabled=USE_AMP): outputs = self.model(ids,attention_mask=mask)


--- Validation Report E3 ---
              precision    recall  f1-score   support

    negative       0.89      0.91      0.90      1377
     neutral       0.68      0.68      0.68       465
    positive       0.78      0.72      0.75       354

    accuracy                           0.83      2196
   macro avg       0.79      0.77      0.78      2196
weighted avg       0.83      0.83      0.83      2196

--- End Report ---
💾 Sentiment model saved to /content/drive/My Drive/Colab_Models/SentimentAnalysisAgent_Airline_Clear
✅ Sentiment training done.
Sentiment CoordinatorAgent Initialized.
🚀 Launching Sentiment Analysis Agent System (Dataset: crowdflower/twitter-airline-sentiment, Clearer Flow)...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ecedbae01d723a9809.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio 

  with torch.no_grad(), autocast(enabled=USE_AMP): logits = self.model(ids,attention_mask=mask).logits


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ecedbae01d723a9809.gradio.live


