# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
import copy # Needed to copy the best model weights
import os
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaModel, get_linear_schedule_with_warmup

# ==========================================
# 0. PATH TO YOUR RETRAINED MODEL
# ==========================================
# Change this if your retrained model is at a different path
# Location of the continued-pretraining RoBERTa checkpoint. Edit this value if the
# retrained model lives somewhere else.
RETRAINED_MODEL_PATH = "/content/roberta_pretrained_continued"

# ==========================================
# 1. CONFIGURATION (AGGRESSIVE TUNING)
# ==========================================
CONFIG = dict(
    filename='polygon_news(1).csv',
    model_name='roberta-base',  # retained for compatibility; weights are not loaded from it
    max_len=160,
    batch_size=16,
    epochs=4,                   # one extra epoch to give the run a chance to peak
    learning_rate=3e-5,         # deliberately on the high (aggressive) side
    num_classes=3,
    top_k_attributes=200,
)

# Hand-picked finance vocabulary that is always part of the attribute list,
# regardless of what the statistical selection finds.
MANUAL_ATTRIBUTES = [
    "profit", "loss", "revenue", "margin", "earnings",
    "sales", "net", "income", "debt", "growth",
    "decline", "shares", "stock", "market", "dividend",
    "acquisition", "merger", "quarter", "fiscal", "guidance",
    "forecast", "beat", "missed", "record", "high",
    "low", "bullish", "bearish", "volatility", "inflation",
    "rates", "fed", "rally", "crash", "plunge",
    "surge", "eps", "yield", "default", "liquidity",
    "sec", "ipo", "nasdaq", "dow", "sp500",
]

# ==========================================
# 2. PHASE 1: SMART ATTRIBUTE DISCOVERY
# ==========================================
def generate_smart_attributes(df, top_k=200):
    """Build the keyword attribute list from chi-squared feature selection.

    The title, description and keywords of every row are concatenated into one
    document, vectorised with a bag-of-words ``CountVectorizer`` and scored
    against the ``sentiment`` column with ``chi2``. The ``top_k`` best-scoring
    terms are merged with ``MANUAL_ATTRIBUTES``.

    Args:
        df: DataFrame with ``title``, ``description``, ``keywords`` and
            ``sentiment`` columns.
        top_k: Maximum number of statistically selected terms. Clamped to the
            vocabulary size when fewer terms survive vectorisation.

    Returns:
        A sorted list of unique attribute strings (selected terms plus the
        manual list).
    """
    print(f"--- PHASE 1: Generating Smart Attribute List (Top {top_k}) ---")

    def _keywords_to_text(value):
        # Lists are joined; missing values become "" instead of the literal
        # string "nan", which would otherwise enter the vocabulary as a term.
        if isinstance(value, list):
            return " ".join(value)
        return str(value) if pd.notna(value) else ""

    text_data = (
        df['title'].fillna('').astype(str) + " "
        + df['description'].fillna('').astype(str) + " "
        + df['keywords'].apply(_keywords_to_text)
    ).tolist()

    labels = df['sentiment'].tolist()

    vectorizer = CountVectorizer(stop_words='english', min_df=5, max_features=10000)
    X = vectorizer.fit_transform(text_data)
    feature_names = vectorizer.get_feature_names_out()

    # Asking SelectKBest for more features than exist raises or warns,
    # depending on the scikit-learn version; clamp to the vocabulary size.
    k = min(top_k, X.shape[1])
    selector = SelectKBest(chi2, k=k)
    selector.fit(X, labels)

    top_indices = selector.get_support(indices=True)
    smart_attributes = [str(feature_names[i]) for i in top_indices]

    print(f"Found {len(smart_attributes)} statistically relevant attributes.")
    # Sorted so the result is reproducible between runs (set order is not).
    return sorted(set(smart_attributes) | set(MANUAL_ATTRIBUTES))

# ==========================================
# 3. DATASET CLASS
# ==========================================
class FinancialKeywordDataset(Dataset):
    """Torch dataset yielding tokenised news text plus a keyword mask.

    Each row's title, description and keywords are concatenated into one
    string. ``__getitem__`` tokenises that string and builds ``keyword_mask``,
    a float vector with 1.0 at every token position that belongs to a word
    found in ``attributes`` (compared case-insensitively).

    Args:
        dataframe: DataFrame with ``title``, ``description``, ``keywords`` and
            ``sentiment`` columns. ``sentiment`` must already be encoded as
            integers, since it is converted to a ``torch.long`` tensor.
        tokenizer: A HuggingFace tokenizer. Fast tokenizers (``is_fast`` is
            true) use whole-word matching via character offsets; others fall
            back to comparing individual token strings.
        max_len: Sequence length passed to the tokenizer for padding and
            truncation.
        attributes: Iterable of keyword strings.
    """

    # Word definition used when scanning the raw text for keywords.
    _WORD_PATTERN = re.compile(r"\w+")

    def __init__(self, dataframe, tokenizer, max_len, attributes):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Tokens/words are lower-cased before lookup, so the attributes must be too.
        self.attributes = {str(attr).lower() for attr in attributes}

        titles = dataframe['title'].fillna('').astype(str)
        descs = dataframe['description'].fillna('').astype(str)

        def clean_keywords(val):
            if isinstance(val, list):
                return " ".join(val)
            return str(val) if pd.notna(val) else ""

        keywords = dataframe['keywords'].apply(clean_keywords)

        self.text = (titles + " " + descs + " " + keywords).values
        self.labels = dataframe['sentiment'].values

    def __len__(self):
        return len(self.data)

    def _keyword_positions_from_offsets(self, text, offsets):
        """Return token indices whose character span overlaps a keyword word in ``text``."""
        spans = [
            match.span()
            for match in self._WORD_PATTERN.finditer(text)
            if match.group().lower() in self.attributes
        ]
        if not spans:
            return []

        positions = []
        for position, (start, end) in enumerate(offsets):
            if start == end:
                # Zero-width span: skip tokens that do not cover any text.
                continue
            if any(start < span_end and end > span_start for span_start, span_end in spans):
                positions.append(position)
        return positions

    def _keyword_positions_from_tokens(self, input_ids):
        """Fallback: return indices of single tokens that equal an attribute."""
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids.tolist())
        # 'Ġ' is the byte-level BPE word-start marker; strip it before comparing.
        return [
            position
            for position, token in enumerate(tokens)
            if token.replace('Ġ', '').lower() in self.attributes
        ]

    def __getitem__(self, index):
        """Tokenise row ``index`` and return a dict of tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``keyword_mask``
            (each flattened to one dimension) and ``labels`` (scalar long).
        """
        text = str(self.text[index])
        label = self.labels[index]

        # Offsets are only available from fast tokenizers.
        use_offsets = bool(getattr(self.tokenizer, "is_fast", False))

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_offsets_mapping=use_offsets,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # --- INNOVATION MASK ---
        keyword_mask = torch.zeros_like(input_ids, dtype=torch.float)

        if use_offsets:
            # Whole-word matching: a keyword split into several sub-word pieces
            # is still found, and a piece inside a longer word (e.g. "profit"
            # in "unprofitable") is not mistaken for the keyword.
            offsets = encoding['offset_mapping'].reshape(-1, 2).tolist()
            positions = self._keyword_positions_from_offsets(text, offsets)
        else:
            positions = self._keyword_positions_from_tokens(input_ids)

        if positions:
            keyword_mask[positions] = 1.0
        else:
            # fallback: mark CLS / first token so the pooled vector is never empty
            keyword_mask[0] = 1.0

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'keyword_mask': keyword_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# ==========================================
# 4. MODEL ARCHITECTURE
# ==========================================
class KeywordFocusedRoberta(nn.Module):
    """RoBERTa encoder with a linear head fed by a keyword-weighted mean.

    Instead of using one fixed token, the encoder's last hidden states are
    averaged over the positions selected by ``keyword_mask``; the result goes
    through dropout and a linear layer producing ``n_classes`` outputs.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Encoder weights come from the retrained checkpoint; the architecture is unchanged.
        self.roberta = RobertaModel.from_pretrained(RETRAINED_MODEL_PATH)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.roberta.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, keyword_mask):
        """Return class scores for a batch.

        NOTE(review): assumes ``keyword_mask`` has the same leading dimensions
        as the encoder's ``last_hidden_state`` without the hidden axis —
        confirm against the dataset that feeds this model.
        """
        hidden_states = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        # --- APPLY MASK ---
        # Add a trailing axis so the mask broadcasts over the hidden dimension.
        weights = keyword_mask.unsqueeze(-1)
        weighted_sum = (hidden_states * weights).sum(dim=1)
        # Clamp keeps the division finite when a row selects no positions.
        weight_total = weights.sum(dim=1).clamp(min=1e-9)

        pooled = weighted_sum / weight_total
        return self.out(self.drop(pooled))

# ==========================================
# 5. HELPER: EVALUATION FUNCTION
# ==========================================
def evaluate(model, dataloader, device):
    """Return the accuracy of ``model`` over every batch in ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Each batch
    must provide ``input_ids``, ``attention_mask``, ``keyword_mask`` and
    ``labels``; predictions are the arg-max over dimension 1 of the model output.
    """
    model.eval()
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in dataloader:
            # Move the four tensors the model and the metric need onto the target device.
            on_device = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'keyword_mask', 'labels')
            }

            logits = model(
                on_device['input_ids'],
                on_device['attention_mask'],
                on_device['keyword_mask'],
            )
            batch_preds = torch.argmax(logits, dim=1)

            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(on_device['labels'].cpu().numpy())

    return accuracy_score(expected, predicted)

# ==========================================
# 6. MAIN PIPELINE
# ==========================================
def run_pipeline():
    """Load the news CSV, train the keyword-focused classifier and print test metrics.

    Steps:
      1. Read ``CONFIG['filename']`` (UTF-8, falling back to Latin-1 on a decode
         error), lower-case the column names and keep only rows whose sentiment
         is ``positive``, ``negative`` or ``neutral``.
      2. Hold out 20% of the rows as the test set, then carve a validation set
         out of the remaining rows (``CONFIG['val_size']`` if present, else 10%).
      3. Discover keyword attributes from the training rows only.
      4. Fine-tune for ``CONFIG['epochs']`` epochs and keep the weights of the
         epoch with the highest validation accuracy.
      5. Reload those weights and print loss, accuracy, macro-F1 and a
         classification report for the test set.

    Returns:
        None. Everything is reported via ``print``. The function returns early,
        after printing an error, when the file is missing, a required column is
        absent, or no row carries a valid sentiment label.
    """
    # --- STEP 1: LOAD DATA ---
    filename = CONFIG['filename']
    if not os.path.exists(filename):
        print(f"❌ ERROR: File '{filename}' not found.")
        return

    print(f"Loading dataset: {filename}...")
    try:
        df = pd.read_csv(filename, encoding='utf-8')
    except UnicodeDecodeError:
        print("UTF-8 failed, trying Latin-1...")
        df = pd.read_csv(filename, encoding='latin-1')

    df.columns = [c.lower() for c in df.columns]

    # Fail with a readable message rather than a KeyError deep in the pipeline.
    required_columns = ('title', 'description', 'keywords', 'sentiment')
    missing_columns = [c for c in required_columns if c not in df.columns]
    if missing_columns:
        print(f"❌ ERROR: Missing required column(s): {missing_columns}")
        return

    valid_classes = ['positive', 'negative', 'neutral']
    df = df[df['sentiment'].isin(valid_classes)].copy()
    if df.empty:
        print(f"❌ ERROR: No rows with a sentiment in {valid_classes}.")
        return

    le = LabelEncoder()
    df['sentiment'] = le.fit_transform(df['sentiment'])
    target_names = [str(cls) for cls in le.classes_]
    print(f"Classes: {target_names}")

    # --- STEP 2: SPLIT FIRST ---
    # The test split uses the same parameters as before; the validation split is
    # taken from the training portion so the test rows are never used for
    # attribute discovery or for choosing the best epoch.
    df_train_full, df_test = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['sentiment']
    )
    df_train, df_val = train_test_split(
        df_train_full,
        test_size=CONFIG.get('val_size', 0.1),
        random_state=42,
        stratify=df_train_full['sentiment'],
    )
    print(f"Rows -> train: {len(df_train)} | val: {len(df_val)} | test: {len(df_test)}")

    # --- STEP 3: PHASE 1 (Attributes, training rows only) ---
    # chi2 scoring reads the labels, so fitting it on held-out rows would leak them.
    smart_attributes = generate_smart_attributes(df_train, top_k=CONFIG['top_k_attributes'])

    # --- STEP 4: PREPARE DATASETS ---
    # Use the retrained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(RETRAINED_MODEL_PATH)

    train_dataset = FinancialKeywordDataset(df_train, tokenizer, CONFIG['max_len'], smart_attributes)
    val_dataset = FinancialKeywordDataset(df_val, tokenizer, CONFIG['max_len'], smart_attributes)
    test_dataset = FinancialKeywordDataset(df_test, tokenizer, CONFIG['max_len'], smart_attributes)

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

    # --- STEP 5: PHASE 2 (Training with Save Best) ---
    print("\n--- PHASE 2: Training with 'Save Best' Logic ---")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = KeywordFocusedRoberta(CONFIG['num_classes']).to(device)

    # Aggressive Tuning: Higher LR, Higher Weight Decay
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=0.02)

    total_steps = len(train_loader) * CONFIG['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )
    criterion = nn.CrossEntropyLoss()

    # Start below any reachable accuracy so the first trained epoch always
    # replaces the untrained snapshot taken here.
    best_accuracy = float("-inf")
    best_model_wts = copy.deepcopy(model.state_dict()) # Store best weights

    for epoch in range(CONFIG['epochs']):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            keyword_mask = batch['keyword_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, keyword_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        # Check accuracy at end of epoch on the validation split (not the test set)
        val_acc = evaluate(model, val_loader, device)
        avg_loss = total_loss/len(train_loader)
        print(f"Epoch {epoch+1}/{CONFIG['epochs']} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.4f}")

        # SAVE IF BETTER
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            print(f"  >>> New Best Model Found! (Acc: {best_accuracy:.4f})")

    # --- STEP 6: PHASE 3 (Final Evaluation of BEST Model) ---
    print("\n--- PHASE 3: Final Evaluation (Best Model) ---")

    # Reload the best weights we found
    model.load_state_dict(best_model_wts)
    model.eval()

    all_preds = []
    all_labels = []
    total_test_loss = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            keyword_mask = batch['keyword_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, keyword_mask)
            loss = criterion(outputs, labels)
            total_test_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_test_loss = total_test_loss / len(test_loader)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"\nFinal Test Results (Best Epoch):")
    print(f"Loss = {avg_test_loss:.4f} | Accuracy = {acc:.4f} | F1 Score = {f1:.4f}")
    print("-" * 50)
    print("Classification Report:")
    print(classification_report(all_labels, all_preds, target_names=target_names, zero_division=0))

# Script entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_pipeline()