In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import lightgbm as lgb
import joblib

In [None]:
# CharBERT Model for Embedding Extraction
# Did not use this embedding
class CharBERTEmbedding:
    """
    Sentence-embedding extractor backed by a Hugging Face BERT checkpoint.

    Despite the class name, this loads a regular ``BertTokenizer`` /
    ``BertModel`` pair (``bert-base-uncased`` by default) and represents each
    text by the final-layer hidden state of its first ([CLS]) token.
    """

    def __init__(self, model_name='bert-base-uncased'):
        """
        Load the tokenizer and model and put the model in evaluation mode.

        Args:
            model_name: Checkpoint name or path forwarded to ``from_pretrained``.
        """
        self.tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
        self.model = transformers.BertModel.from_pretrained(model_name)
        self.model.eval()  # Set to evaluation mode

    def extract_embeddings(self, texts, max_length=512, batch_size=32):
        """
        Extract one [CLS] embedding per input text.

        Texts are processed in mini-batches and padded only to the longest
        sequence in each batch (instead of one text at a time padded to
        ``max_length``), which avoids running every forward pass at full
        sequence length. Padding positions are covered by the attention mask
        the tokenizer returns, so the embeddings should match the per-text
        computation up to floating-point rounding.

        Args:
            texts: Iterable of strings (list, pandas Series, ...).
            max_length: Maximum number of tokens per text; longer texts are
                truncated.
            batch_size: Number of texts per forward pass.

        Returns:
            ``np.ndarray`` of shape ``(len(texts), hidden_size)``. An empty
            input yields an array of shape ``(0, hidden_size)`` so the result
            can still be stacked with other feature matrices.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Materialise once: the tokenizer expects a list for batched input and
        # callers commonly pass a pandas Series.
        texts = list(texts)
        if not texts:
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        batch_embeddings = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                inputs = self.tokenizer(
                    texts[start:start + batch_size],
                    return_tensors='pt',
                    max_length=max_length,
                    truncation=True,
                    padding=True  # pad to the longest text in this batch
                )
                outputs = self.model(**inputs)
                # Use [CLS] token embedding (first token) of every sequence
                cls_states = outputs.last_hidden_state[:, 0, :]
                batch_embeddings.append(cls_states.cpu().numpy())

        return np.concatenate(batch_embeddings, axis=0)

In [None]:
# Late Fusion Classifier
class LateFusionClassifier:
    """
    Multi-label classifier over combined BERT-embedding and TF-IDF features.

    The two feature sets are concatenated into a single matrix, and one
    LightGBM binary classifier is trained per label column on that matrix.
    """

    def __init__(self, charbert_embedding, tfidf_vectorizer):
        """
        Initialize Late Fusion Classifier

        Args:
            charbert_embedding: Object exposing ``extract_embeddings(texts)``.
            tfidf_vectorizer: Already-fitted vectorizer exposing
                ``transform(texts)`` whose result supports ``.toarray()``.
        """
        self.charbert_embedding = charbert_embedding
        self.tfidf_vectorizer = tfidf_vectorizer
        self.models = []
        # Label names recorded by train(); None until training has run.
        self.label_names = None

    def prepare_features(self, X_content):
        """
        Prepare features by combining CharBERT and TF-IDF

        Args:
            X_content: Iterable of raw text documents.

        Returns:
            Dense 2-D array: embedding columns followed by TF-IDF columns.
        """
        # Extract CharBERT embeddings
        charbert_features = self.charbert_embedding.extract_embeddings(X_content)

        # Extract TF-IDF features (densified so they can be stacked with the
        # dense embedding matrix)
        tfidf_features = self.tfidf_vectorizer.transform(X_content).toarray()

        # Concatenate features column-wise
        return np.hstack([charbert_features, tfidf_features])

    def train(self, X_content, y, label_names):
        """
        Train one LightGBM binary model per label column.

        Args:
            X_content: Iterable of raw text documents.
            y: Binary indicator matrix of shape ``(n_samples, n_labels)``.
                Lists and DataFrames are accepted; a 1-D vector is treated as
                a single label column.
            label_names: Names of the label columns; stored on
                ``self.label_names`` for later reference.

        Returns:
            ``self``, to allow call chaining.
        """
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        # Prepare combined features
        X_combined = self.prepare_features(X_content)

        # LightGBM parameters (identical for every label, so built once)
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9
        }

        # Build the new model list locally so a failure part-way through does
        # not leave self.models half-populated.
        trained_models = []
        for i in range(y.shape[1]):
            model = lgb.LGBMClassifier(**params)
            model.fit(X_combined, y[:, i])
            trained_models.append(model)

        self.models = trained_models
        self.label_names = None if label_names is None else list(label_names)

        return self

    def predict(self, X_content, threshold=0.5):
        """
        Predict a 0/1 indicator for every label.

        Args:
            X_content: Iterable of raw text documents.
            threshold: A label is predicted positive when its positive-class
                probability is strictly greater than this value.

        Returns:
            Integer array of shape ``(n_samples, n_models)``. When no models
            have been trained the array has zero columns.
        """
        # Prepare combined features
        X_combined = self.prepare_features(X_content)

        if not self.models:
            # Keep the (n_samples, n_labels) contract even with zero labels.
            return np.zeros((X_combined.shape[0], 0), dtype=int)

        # Predict for each label: column 1 of predict_proba is the positive class
        label_columns = [
            (model.predict_proba(X_combined)[:, 1] > threshold).astype(int)
            for model in self.models
        ]

        return np.column_stack(label_columns)

In [None]:
# Preprocessing and Utility Functions (similar to previous implementation)
def parse_techniques(x):
    """
    Normalise a raw ``techniques`` cell into a list of label identifiers.

    Args:
        x: Either a string such as ``"['Name Calling', 'Doubt']"`` (brackets
            and quotes are stripped and the text is split on commas), a list
            of technique names, or anything else (treated as "no techniques").

    Returns:
        List of normalised names. A name found in ``technique_mapping`` is
        replaced by its mapped value; any other name is lower-cased with
        spaces turned into underscores. Empty entries are dropped. If nothing
        remains, ``['no_techniques']`` is returned.
    """
    technique_mapping = {
        'CharBERT + LateFusion': 'charbert_latefusion',
        # Add other mappings as needed
    }

    if isinstance(x, str):
        raw_techs = [
            part.strip().strip("'\"")
            for part in x.strip().strip('[]').split(',')
        ]
    elif isinstance(x, list):
        raw_techs = x
    else:
        raw_techs = []

    mapped_techs = []
    for tech in raw_techs:
        # Strip once and use the same cleaned name for both the mapping lookup
        # and the fallback, so surrounding whitespace never turns into
        # leading/trailing underscores.
        name = tech.strip()
        if not name:
            continue
        mapped_techs.append(
            technique_mapping.get(name, name.lower().replace(' ', '_'))
        )

    return mapped_techs if mapped_techs else ['no_techniques']

def evaluate_multilabel(y_true, y_pred, label_names):
    """
    Print a multi-label F1 report.

    First prints micro- and macro-averaged F1 over the whole label matrix,
    then a binary F1 score for each label column in turn.

    Args:
        y_true: Ground-truth indicator matrix, indexed as ``y_true[:, i]``.
        y_pred: Predicted indicator matrix, indexed the same way.
        label_names: Iterable of label names, one per column.
    """
    rule = "-" * 50

    print("\nOverall Metrics:")
    print(rule)
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    print(f"Micro F1 Score: {micro_f1:.4f}")
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Macro F1 Score: {macro_f1:.4f}")

    print("\nDetailed Label Metrics:")
    print(rule)
    for column, label in enumerate(label_names):
        # Pull out this label's column from both matrices
        truth_column = y_true[:, column]
        predicted_column = y_pred[:, column]

        print(f"\nMetrics for label '{label}':")
        column_f1 = f1_score(truth_column, predicted_column, average='binary')
        print(f"F1 Score: {column_f1:.4f}")

In [None]:
def main(train_df=None):
    """
    Train, evaluate and save the per-label fusion classifier.

    Steps: parse the ``techniques`` column, binarise the labels, split into
    train/validation sets, fit TF-IDF on the training texts, train the
    classifier, print validation F1 scores and dump the fitted objects with
    joblib.

    Args:
        train_df: DataFrame with ``content`` and ``techniques`` columns. When
            omitted, the notebook-level variable ``df`` is used, so an earlier
            cell must have defined it.

    Side effects:
        Overwrites ``train_df['techniques']`` in place with the parsed lists,
        and writes ``late_fusion_classifier.joblib``,
        ``tfidf_vectorizer.joblib`` and ``multilabel_binarizer.joblib`` to the
        current working directory.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.preprocessing import MultiLabelBinarizer

    # Load data
    if train_df is None:
        train_df = df  # falls back to the DataFrame defined earlier in the notebook

    # Preprocess techniques
    train_df['techniques'] = train_df['techniques'].apply(parse_techniques)

    # Prepare TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2)
    )

    # Prepare Multi-Label Binarizer
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_df['techniques'])

    # Initialize CharBERT Embedding
    charbert_embedding = CharBERTEmbedding()

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['content'], y, test_size=0.2, random_state=42
    )

    # Fit TF-IDF on the training split only: fitting on the full corpus would
    # let validation documents shape the vocabulary and IDF weights, leaking
    # validation information into the features and inflating the scores.
    tfidf_vectorizer.fit(X_train)

    # Train Late Fusion Classifier
    late_fusion_classifier = LateFusionClassifier(
        charbert_embedding,
        tfidf_vectorizer
    )
    late_fusion_classifier.train(X_train, y_train, mlb.classes_)

    # Predict and Evaluate
    y_val_pred = late_fusion_classifier.predict(X_val)
    evaluate_multilabel(y_val, y_val_pred, mlb.classes_)

    # Optional: Save models
    joblib.dump(late_fusion_classifier, 'late_fusion_classifier.joblib')
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
    joblib.dump(mlb, 'multilabel_binarizer.joblib')

# Runs the pipeline when executed as a script; in a notebook kernel __name__
# is also "__main__", so running this cell starts training immediately.
if __name__ == "__main__":
    main()