# Topic 9: Offensive Language Detection - SOLUTIONS

Complete solutions for offensive language detection using traditional ML and transformer-based approaches.

In [None]:
# Essential imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Try to import ML libraries
try:
    from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
    from sklearn.pipeline import Pipeline
    SKLEARN_AVAILABLE = True
    print("‚úì Scikit-learn available for traditional ML approaches!")
except ImportError:
    print("‚ö† Scikit-learn not available. Please install: pip install scikit-learn")
    SKLEARN_AVAILABLE = False

try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer, TrainingArguments
    import torch
    TRANSFORMERS_AVAILABLE = True
    print("‚úì Transformers available for BERT-based approaches!")
    print(f"‚úì PyTorch available: {torch.__version__}")
    print(f"‚úì CUDA available: {torch.cuda.is_available()}")
except ImportError:
    print("‚ö† Transformers/PyTorch not available. Please install: pip install transformers torch")
    TRANSFORMERS_AVAILABLE = False

try:
    import joblib
    JOBLIB_AVAILABLE = True
except ImportError:
    JOBLIB_AVAILABLE = False

print()

## Solution 1: Data Preparation and Analysis

In [None]:
def create_sample_dataset():
    """Create a small labelled sample dataset for offensive language detection.

    The texts are mostly German with a handful of English examples appended
    for comparison. The offensive examples are deliberately mild because the
    data is meant for teaching.

    Returns:
        pandas.DataFrame with two columns: 'text' (str) and 'label'
        (0 = non-offensive, 1 = offensive), indexed 0..n-1.
    """

    # NOTE: the umlauts and sharp s below are real UTF-8 characters. The
    # previous literals were mis-decoded ("sch√∂ner" instead of "schöner"),
    # which fed non-German character sequences into every model.
    german_examples = [
        # Non-offensive texts
        ("Das ist ein schöner Tag heute.", 0),
        ("Ich freue mich auf das Wochenende.", 0),
        ("Guten Morgen, wie geht es Ihnen?", 0),
        ("Das Wetter ist heute sehr schön.", 0),
        ("Ich liebe es zu lesen.", 0),
        ("Die Blumen im Garten sind wunderschön.", 0),
        ("Herzlichen Glückwunsch zum Geburtstag!", 0),
        ("Das war eine interessante Diskussion.", 0),
        ("Vielen Dank für Ihre Hilfe.", 0),
        ("Ich bin sehr dankbar.", 0),
        ("Das Essen schmeckt köstlich.", 0),
        ("Die Musik ist entspannend.", 0),
        ("Ich mag dieses Buch sehr.", 0),
        ("Der Film war fantastisch.", 0),
        ("Schöne Grüße an alle.", 0),

        # Mildly negative but not offensive
        ("Ich bin heute etwas müde.", 0),
        ("Das Wetter ist nicht so gut.", 0),
        ("Ich verstehe das nicht ganz.", 0),
        ("Das war nicht meine beste Leistung.", 0),
        ("Ich bin etwas enttäuscht.", 0),

        # Offensive/inappropriate content (mild examples for educational purposes)
        ("Du bist so dumm!", 1),
        ("Das ist totaler Schwachsinn!", 1),
        ("Du Idiot!", 1),
        ("Das ist bescheuert!", 1),
        ("So ein Quatsch!", 1),
        ("Du hast keine Ahnung!", 1),
        ("Das ist völlig blöd!", 1),
        ("Du bist ein Versager!", 1),
        ("Das ist peinlich für dich!", 1),
        ("Du redest nur Unsinn!", 1),

        # Borderline cases
        ("Das ist wirklich ärgerlich.", 0),
        ("Ich bin sauer auf dich.", 0),
        ("Das nervt mich gewaltig.", 0),
        ("Du bist manchmal schwierig.", 0),
        ("Das war nicht nett von dir.", 0),

        # More examples for balance
        ("Heute ist ein wunderbarer Tag zum Spazieren.", 0),
        ("Ich freue mich auf den Urlaub.", 0),
        ("Die Kinder spielen fröhlich im Park.", 0),
        ("Das Konzert war atemberaubend.", 0),
        ("Ich schätze deine Freundschaft sehr.", 0),
    ]

    # A few English examples for comparison
    english_examples = [
        ("This is a beautiful day.", 0),
        ("I love spending time with family.", 0),
        ("Thank you for your kindness.", 0),
        ("You are so stupid!", 1),
        ("This is complete nonsense!", 1),
        ("You don't know anything!", 1),
    ]

    # German rows first, English rows last, with one continuous 0..n-1 index.
    return pd.DataFrame(german_examples + english_examples, columns=['text', 'label'])

def analyze_dataset(df):
    """Print summary statistics for a labelled text dataset and plot an overview.

    Args:
        df: DataFrame with a 'text' column of strings and a 'label' column
            that uses 0 for non-offensive and 1 for offensive samples.

    Returns:
        The same DataFrame object. Two columns are added to it in place:
        'text_length' (characters per text) and 'word_count'
        (whitespace-separated tokens per text).
    """

    print("Dataset Analysis:")
    print("=" * 40)

    total = len(df)
    print(f"Total samples: {total}")
    print(f"Features: {list(df.columns)}")
    print()

    # Per-sample length features (added in place; callers use the returned frame)
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()

    if total == 0:
        # Percentages and histograms are undefined without data.
        print("Dataset is empty - nothing to analyze.")
        return df

    # Reindex to the two expected labels so a missing class shows up as 0
    # instead of raising KeyError, and so the order is always [0, 1].
    label_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)
    print("Label Distribution:")
    print(f"Non-offensive (0): {label_counts[0]} ({label_counts[0]/total*100:.1f}%)")
    print(f"Offensive (1): {label_counts[1]} ({label_counts[1]/total*100:.1f}%)")
    print()

    print("Text Statistics:")
    print(f"Average text length: {df['text_length'].mean():.1f} characters")
    print(f"Average word count: {df['word_count'].mean():.1f} words")
    print()

    # Visualizations
    plt.figure(figsize=(15, 10))

    # Plot 1: Label distribution. The counts are ordered by label, so green is
    # always label 0 and coral is always label 1, whichever class is larger.
    plt.subplot(2, 3, 1)
    label_counts.plot(kind='bar', color=['lightgreen', 'lightcoral'])
    plt.title('Label Distribution')
    plt.xlabel('Label (0=Non-offensive, 1=Offensive)')
    plt.ylabel('Count')
    plt.xticks(rotation=0)

    # Plot 2: Text length distribution
    plt.subplot(2, 3, 2)
    plt.hist(df['text_length'], bins=15, alpha=0.7, color='skyblue')
    plt.title('Text Length Distribution')
    plt.xlabel('Characters')
    plt.ylabel('Frequency')

    # Plot 3: Word count distribution
    plt.subplot(2, 3, 3)
    plt.hist(df['word_count'], bins=10, alpha=0.7, color='lightgreen')
    plt.title('Word Count Distribution')
    plt.xlabel('Words')
    plt.ylabel('Frequency')

    # Plot 4: Text length by label (overlaid histograms)
    plt.subplot(2, 3, 4)
    for label in [0, 1]:
        subset = df[df['label'] == label]
        plt.hist(subset['text_length'], alpha=0.6,
                 label=f'Label {label}', bins=10)
    plt.title('Text Length by Label')
    plt.xlabel('Characters')
    plt.ylabel('Frequency')
    plt.legend()

    # Plot 5: Word count by label
    plt.subplot(2, 3, 5)
    sns.boxplot(data=df, x='label', y='word_count')
    plt.title('Word Count by Label')
    plt.xlabel('Label')
    plt.ylabel('Word Count')

    # Plot 6: Sample texts rendered as a text panel
    plt.subplot(2, 3, 6)
    plt.text(0.1, 0.9, "Sample Texts:", fontsize=12, fontweight='bold', transform=plt.gca().transAxes)

    # Up to three examples per label, truncated to 40 characters
    examples = []
    for label in [0, 1]:
        label_name = "Non-offensive" if label == 0 else "Offensive"
        sample_texts = df[df['label'] == label]['text'].head(3).tolist()
        examples.append(f"{label_name}:")
        for i, text in enumerate(sample_texts, 1):
            examples.append(f"  {i}. {text[:40]}{'...' if len(text) > 40 else ''}")
        examples.append("")

    # Join with a real newline; an escaped "\\n" is drawn literally in the panel.
    example_text = "\n".join(examples)
    plt.text(0.1, 0.8, example_text, fontsize=8, transform=plt.gca().transAxes,
             verticalalignment='top', fontfamily='monospace')
    plt.axis('off')

    plt.tight_layout()
    plt.show()

    return df

def load_or_create_dataset():
    """Load an existing dataset from ./data or fall back to the sample dataset.

    The candidate CSV files are tried in a fixed order. A file is used only if
    it can be parsed and contains both a 'text' and a 'label' column, since
    the rest of the notebook indexes those columns directly. If no candidate
    qualifies, the built-in sample dataset is created and written to
    ./data/sample_offensive_language.csv.

    Returns:
        pandas.DataFrame containing at least 'text' and 'label' columns.
    """

    PROJECT_ROOT = Path.cwd()
    DATA_DIR = PROJECT_ROOT / 'data'
    DATA_DIR.mkdir(exist_ok=True)

    required_columns = {'text', 'label'}

    # Try to load existing dataset
    possible_files = [
        DATA_DIR / 'germ_eval.csv',
        DATA_DIR / 'offensive_language.csv',
        DATA_DIR / 'hate_speech.csv'
    ]

    for file_path in possible_files:
        if not file_path.exists():
            continue

        print(f"Loading existing dataset: {file_path}")
        try:
            df = pd.read_csv(file_path)
        except (OSError, ValueError) as e:
            # pandas parser, empty-file and decoding errors all derive from
            # ValueError; unreadable files surface as OSError.
            print(f"Error loading {file_path}: {e}")
            continue

        missing = required_columns - set(df.columns)
        if missing:
            # A frame without these columns would only fail later, far from the cause.
            print(f"Skipping {file_path.name}: missing required column(s) {sorted(missing)}")
            continue

        print(f"✓ Loaded {len(df)} samples from {file_path.name}")
        return df

    # Create sample dataset if no usable existing data was found
    print("No existing dataset found. Creating sample dataset...")
    df = create_sample_dataset()

    # Save sample dataset
    sample_path = DATA_DIR / 'sample_offensive_language.csv'
    df.to_csv(sample_path, index=False, encoding='utf-8')
    print(f"✓ Sample dataset saved to: {sample_path}")

    return df

# Load and analyze dataset
print("Loading and Analyzing Dataset:")
print("=" * 50)

# Bind df_analyzed up front: later cells test `df_analyzed is not None`, which
# would raise NameError if the name only existed on the success branch.
df_analyzed = None

df = load_or_create_dataset()
if df is not None:
    df_analyzed = analyze_dataset(df)
    print(f"✓ Dataset ready with {len(df_analyzed)} samples")
else:
    print("❌ Could not load or create dataset")

## Solution 2: Traditional Machine Learning Approaches

In [None]:
def preprocess_text_features(texts):
    """Normalize raw texts before feature extraction.

    Each text is lowercased, stripped of URLs, @mentions and #hashtags, and
    has every run of whitespace collapsed to a single space with leading and
    trailing whitespace removed.

    Args:
        texts: Iterable of strings.

    Returns:
        List of cleaned strings, in the same order as the input.
    """

    import re

    # Compiled once per call rather than once per text. The patterns use
    # single backslashes: in a raw string, "\\S" matches a literal backslash
    # followed by "S", so the doubled form never matched real text.
    url_pattern = re.compile(r'http\S+|www\S+')
    mention_hashtag_pattern = re.compile(r'@\w+|#\w+')
    whitespace_pattern = re.compile(r'\s+')

    processed_texts = []

    for text in texts:
        text = text.lower()

        # Remove URLs
        text = url_pattern.sub('', text)

        # Remove user mentions and hashtags (for social media text)
        text = mention_hashtag_pattern.sub('', text)

        # Collapse whitespace left behind by the removals above
        text = whitespace_pattern.sub(' ', text).strip()

        processed_texts.append(text)

    return processed_texts

def create_traditional_ml_models():
    """Build the candidate classical classifiers used for comparison.

    Returns:
        A list of configuration dicts, each with the keys 'name' (display
        name), 'model' (an unfitted scikit-learn estimator) and 'params'
        (a hyper-parameter grid for that estimator). The list is empty when
        scikit-learn is not available.
    """

    if SKLEARN_AVAILABLE:
        logistic_regression = dict(
            name='Logistic Regression',
            model=LogisticRegression(random_state=42, max_iter=1000),
            params=dict(
                C=[0.1, 1.0, 10.0],
                penalty=['l1', 'l2'],
                solver=['liblinear'],
            ),
        )

        random_forest = dict(
            name='Random Forest',
            model=RandomForestClassifier(random_state=42, n_estimators=100),
            params=dict(
                n_estimators=[50, 100, 200],
                max_depth=[5, 10, None],
                min_samples_split=[2, 5, 10],
            ),
        )

        # probability=True so the SVM exposes predict_proba like the others
        support_vector_machine = dict(
            name='Support Vector Machine',
            model=SVC(random_state=42, probability=True),
            params=dict(
                C=[0.1, 1.0, 10.0],
                kernel=['linear', 'rbf'],
                gamma=['scale', 'auto'],
            ),
        )

        naive_bayes = dict(
            name='Naive Bayes',
            model=MultinomialNB(),
            params=dict(alpha=[0.1, 1.0, 10.0]),
        )

        return [logistic_regression, random_forest, support_vector_machine, naive_bayes]

    print("Scikit-learn not available for traditional ML models.")
    return []

def train_traditional_models(df):
    """Train every model/vectorizer combination and score it on a held-out split.

    The texts are preprocessed, split 70/30 (stratified by label), vectorized
    with each feature extractor, and each classifier is fitted and scored.

    Args:
        df: DataFrame with 'text' and 'label' columns, or None.

    Returns:
        A tuple (results, X_test, y_test).
        results maps vectorizer name -> model name -> dict with the keys
        'model', 'vectorizer', 'accuracy', 'precision', 'recall', 'f1',
        'roc_auc', 'predictions' and 'probabilities'. X_test and y_test are
        the held-out texts and labels. When training cannot run (no data or
        no scikit-learn) the tuple is ({}, None, None), so callers can always
        unpack three values.
    """

    # df is tested first so a missing dataset is reported without consulting
    # the optional-dependency flag.
    if df is None or not SKLEARN_AVAILABLE:
        print("Cannot train traditional models.")
        return {}, None, None

    from sklearn.base import clone
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    print("Training Traditional ML Models:")
    print("=" * 50)

    # Preprocess texts
    processed_texts = preprocess_text_features(df['text'].tolist())

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        processed_texts, df['label'], test_size=0.3, random_state=42, stratify=df['label']
    )

    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
    print()

    # Feature extraction methods
    vectorizers = [
        {
            'name': 'TF-IDF (1-2 grams)',
            'vectorizer': TfidfVectorizer(
                ngram_range=(1, 2),
                max_features=5000,
                stop_words='english',  # Could be extended with German stop words
                lowercase=True
            )
        },
        {
            'name': 'Count Vectorizer',
            'vectorizer': CountVectorizer(
                ngram_range=(1, 2),
                max_features=5000,
                stop_words='english',
                lowercase=True
            )
        },
        {
            'name': 'TF-IDF (char-level)',
            'vectorizer': TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                max_features=5000,
                lowercase=True
            )
        }
    ]

    # Models
    models = create_traditional_ml_models()

    results = {}

    # Train each combination
    for vec_config in vectorizers:
        vec_name = vec_config['name']
        vectorizer = vec_config['vectorizer']

        print(f"Feature Extraction: {vec_name}")
        print("-" * 30)

        # Transform texts to features
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        print(f"Feature matrix shape: {X_train_vec.shape}")

        vec_results = {}

        for model_config in models:
            model_name = model_config['name']
            # Fit a fresh copy for every vectorizer. Refitting the shared
            # template would leave each stored result pointing at an estimator
            # trained on the last vectorizer's feature space.
            model = clone(model_config['model'])

            print(f"  Training {model_name}...", end=' ')

            try:
                # Train model
                model.fit(X_train_vec, y_train)

                # Predictions
                y_pred = model.predict(X_test_vec)
                y_pred_proba = model.predict_proba(X_test_vec)[:, 1] if hasattr(model, 'predict_proba') else None

                # Calculate metrics (zero_division=0 scores a never-predicted class as 0)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

                # ROC AUC if probabilities are available. It is undefined when
                # the test split holds a single class; keep the other metrics.
                roc_auc = None
                if y_pred_proba is not None:
                    try:
                        roc_auc = roc_auc_score(y_test, y_pred_proba)
                    except ValueError:
                        roc_auc = None

                vec_results[model_name] = {
                    'model': model,
                    'vectorizer': vectorizer,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'roc_auc': roc_auc,
                    'predictions': y_pred,
                    'probabilities': y_pred_proba
                }

                print(f"Accuracy: {accuracy:.3f}")

            except Exception as e:
                # Best effort: one failing combination must not abort the comparison.
                print(f"Error: {e}")
                continue

        results[vec_name] = vec_results
        print()

    return results, X_test, y_test

def evaluate_traditional_models(results, X_test, y_test):
    """Tabulate, rank and plot the scores of the trained traditional models.

    Args:
        results: Mapping of vectorizer name -> model name -> result dict with
            the keys 'model', 'vectorizer', 'accuracy', 'precision',
            'recall', 'f1', 'roc_auc', 'predictions' and 'probabilities'.
        X_test: Held-out texts. Not used here; kept for interface
            compatibility.
        y_test: True labels of the held-out texts.

    Returns:
        A DataFrame with one row per model/vectorizer combination, or None
        when there is nothing to evaluate.
    """

    if not results:
        print("No results to evaluate.")
        return

    # Flatten the nested results into one row per combination
    all_results = []

    for vec_name, vec_results in results.items():
        for model_name, model_result in vec_results.items():
            roc_auc = model_result['roc_auc']
            all_results.append({
                'Vectorizer': vec_name,
                'Model': model_name,
                'Accuracy': model_result['accuracy'],
                'Precision': model_result['precision'],
                'Recall': model_result['recall'],
                'F1-Score': model_result['f1'],
                # Test against None explicitly: a genuine AUC of 0.0 is falsy
                # and must not be reported as missing.
                'ROC-AUC': roc_auc if roc_auc is not None else 'N/A'
            })

    if not all_results:
        # Every vectorizer entry was empty, i.e. no model trained successfully.
        print("No results to evaluate.")
        return

    print("Model Evaluation Results:")
    print("=" * 50)

    # Create results DataFrame
    results_df = pd.DataFrame(all_results)

    print("Performance Comparison:")
    print(results_df.to_string(index=False))
    print()

    # Find best model
    best_f1 = results_df.loc[results_df['F1-Score'].idxmax()]
    print(f"Best Model (by F1-Score): {best_f1['Model']} with {best_f1['Vectorizer']}")
    print(f"F1-Score: {best_f1['F1-Score']:.3f}")
    print()

    best_vec_name = best_f1['Vectorizer']
    best_model_name = best_f1['Model']
    best_entry = results[best_vec_name][best_model_name]

    # Visualizations
    plt.figure(figsize=(16, 12))

    # Plot 1: Performance comparison (one column of points per metric)
    plt.subplot(2, 3, 1)
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    for i, metric in enumerate(metrics):
        values = [r[metric] for r in all_results]
        plt.scatter([i] * len(values), values, alpha=0.7, s=60)

    plt.xticks(range(len(metrics)), metrics)
    plt.ylabel('Score')
    plt.title('Model Performance Comparison')
    plt.grid(True, alpha=0.3)

    # Plot 2: F1-Score comparison
    plt.subplot(2, 3, 2)
    f1_scores = results_df['F1-Score']
    # Real newline so the vectorizer name wraps onto a second line
    model_labels = [f"{row['Model'][:8]}\n({row['Vectorizer'][:8]})" for _, row in results_df.iterrows()]

    bars = plt.bar(range(len(f1_scores)), f1_scores, color='lightblue')
    plt.xticks(range(len(f1_scores)), model_labels, rotation=45, ha='right')
    plt.ylabel('F1-Score')
    plt.title('F1-Score by Model')

    # Add value labels on bars
    for bar, score in zip(bars, f1_scores):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{score:.3f}', ha='center', va='bottom', fontsize=8)

    # Plot 3: Confusion Matrix for best model
    plt.subplot(2, 3, 3)
    best_predictions = best_entry['predictions']

    cm = confusion_matrix(y_test, best_predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix\n{best_model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # Plot 4: ROC Curve for best model (if available)
    plt.subplot(2, 3, 4)
    best_probabilities = best_entry['probabilities']

    if best_probabilities is not None:
        fpr, tpr, _ = roc_curve(y_test, best_probabilities)
        auc_score = roc_auc_score(y_test, best_probabilities)

        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve - Best Model')
        plt.legend()
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No probabilities\navailable', ha='center', va='center',
                 transform=plt.gca().transAxes, fontsize=12)
        plt.title('ROC Curve - Not Available')

    # Plot 5: Feature importance (if available)
    plt.subplot(2, 3, 5)
    best_model = best_entry['model']
    best_vectorizer = best_entry['vectorizer']

    importance = None
    if hasattr(best_model, 'coef_'):
        # Linear models: rank features by absolute weight
        coef = best_model.coef_
        if hasattr(coef, 'toarray'):
            # Some estimators expose sparse coefficients; densify before indexing
            coef = coef.toarray()
        importance = np.abs(np.asarray(coef)[0])
    elif hasattr(best_model, 'feature_importances_'):
        # Tree-based models
        importance = np.asarray(best_model.feature_importances_)

    if importance is not None:
        feature_names = best_vectorizer.get_feature_names_out()

        # Top 10 features, most important first
        top_indices = importance.argsort()[-10:][::-1]
        top_features = [feature_names[i] for i in top_indices]
        top_importance = importance[top_indices]

        plt.barh(range(len(top_features)), top_importance)
        plt.yticks(range(len(top_features)), top_features)
        plt.xlabel('Importance')
        plt.title('Top 10 Features')
    else:
        plt.text(0.5, 0.5, 'Feature importance\nnot available', ha='center', va='center',
                 transform=plt.gca().transAxes, fontsize=12)
        plt.title('Feature Importance')

    # Plot 6: Model comparison heatmap
    plt.subplot(2, 3, 6)
    pivot_data = results_df.pivot(index='Model', columns='Vectorizer', values='F1-Score')
    sns.heatmap(pivot_data, annot=True, fmt='.3f', cmap='RdYlBu_r')
    plt.title('F1-Score Heatmap')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

    return results_df

# Train and evaluate traditional models
# Bind the result names up front so later cells can reference them even when
# training is skipped or cannot run.
traditional_results, X_test_trad, y_test_trad = {}, None, None
results_comparison = None

if df_analyzed is not None:
    print("\nTraining Traditional Machine Learning Models:")
    print("=" * 60)

    training_outcome = train_traditional_models(df_analyzed)

    # Only unpack a full (results, X_test, y_test) triple; anything else
    # (for example an empty dict) means training did not run.
    if isinstance(training_outcome, tuple) and len(training_outcome) == 3:
        traditional_results, X_test_trad, y_test_trad = training_outcome

    if traditional_results:
        results_comparison = evaluate_traditional_models(traditional_results, X_test_trad, y_test_trad)
else:
    print("No dataset available for traditional ML training.")

## Solution 3: BERT-based Approach

In [None]:
def setup_bert_model(model_name='bert-base-german-cased'):
    """Load a tokenizer and a two-label sequence classification model.

    The requested checkpoint is tried first. If it cannot be loaded, a fixed
    list of fallback checkpoints is tried in order, and the first one that
    loads is returned.

    Args:
        model_name: Checkpoint identifier passed to `from_pretrained`.

    Returns:
        A (tokenizer, model) tuple, or (None, None) when the transformers
        library is unavailable or no checkpoint could be loaded.
    """

    if not TRANSFORMERS_AVAILABLE:
        print("Transformers library not available for BERT approach.")
        return None, None

    alternative_models = [
        'bert-base-multilingual-cased',
        'distilbert-base-multilingual-cased',
        'bert-base-uncased'
    ]
    # Requested checkpoint first, then the fallbacks (without trying it twice)
    candidates = [model_name] + [alt for alt in alternative_models if alt != model_name]

    for position, candidate in enumerate(candidates):
        is_primary = position == 0
        if is_primary:
            print(f"Loading BERT model: {candidate}")
        else:
            print(f"Trying {candidate}...")

        try:
            # Load tokenizer and model
            tokenizer = AutoTokenizer.from_pretrained(candidate)
            model = AutoModelForSequenceClassification.from_pretrained(
                candidate,
                num_labels=2,  # Binary classification
                output_attentions=False,
                output_hidden_states=False
            )
        except Exception as e:
            # Best-effort fallback: report why this checkpoint failed and move
            # on. `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            if is_primary:
                print(f"Error loading BERT model: {e}")
                print("Trying alternative model...")
            else:
                print(f"Error loading {candidate}: {e}")
            continue

        print(f"✓ Successfully loaded {candidate}")
        print(f"✓ Model parameters: {model.num_parameters():,}")
        return tokenizer, model

    print("Could not load any BERT model.")
    return None, None

def prepare_bert_data(texts, labels, tokenizer, max_length=128):
    """Tokenize texts and wrap them with their labels in a torch Dataset.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class labels, aligned with `texts`. Any
            iterable works (list, tuple, pandas Series); it is copied into a
            list so items are addressed by position.
        tokenizer: Callable tokenizer that accepts the texts plus
            `truncation`, `padding`, `max_length` and `return_tensors`
            keyword arguments and returns a mapping of tensors.
        max_length: Maximum token length; longer texts are truncated.

    Returns:
        A `torch.utils.data.Dataset` whose items are dicts holding the
        tokenizer outputs for one sample plus a 'labels' tensor (torch.long),
        or None when no tokenizer is given.

    Raises:
        ValueError: If `texts` and `labels` differ in length.
    """

    if tokenizer is None:
        return None

    # Materialize both inputs: positional indexing on a pandas Series with a
    # non-default index would otherwise look up by index label and fail.
    texts = list(texts)
    labels = list(labels)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} and {len(labels)})"
        )

    # Tokenize texts
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )

    class OffensiveLanguageDataset(torch.utils.data.Dataset):
        """Pairs pre-tokenized encodings with their class labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            # The encodings are already tensors; clone them instead of
            # re-wrapping with torch.tensor(), which warns on tensor input.
            item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
            return item

        def __len__(self):
            return len(self.labels)

    return OffensiveLanguageDataset(encodings, labels)

def train_bert_model(df, tokenizer, model):
    """Fine-tune a sequence classification model on the labelled texts.

    The data is split 80/20 (stratified by label), tokenized, and trained for
    three epochs with the Hugging Face `Trainer`, evaluating and saving once
    per epoch.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        tokenizer: Tokenizer matching `model`.
        model: Sequence classification model to fine-tune.

    Returns:
        A (trainer, val_texts, val_labels) tuple on success, or None when a
        component is missing, the datasets cannot be built, or training
        raises.
    """

    # The arguments are tested first so a missing component is reported
    # without consulting the optional-dependency flag.
    if tokenizer is None or model is None or not TRANSFORMERS_AVAILABLE:
        print("Cannot train BERT model - missing components.")
        return None

    import inspect
    import math

    print("Training BERT Model:")
    print("=" * 40)

    # Prepare data
    texts = df['text'].tolist()
    labels = df['label'].tolist()

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")

    # Create datasets
    train_dataset = prepare_bert_data(train_texts, train_labels, tokenizer)
    val_dataset = prepare_bert_data(val_texts, val_labels, tokenizer)

    if train_dataset is None or val_dataset is None:
        print("Could not prepare datasets.")
        return None

    num_epochs = 3
    batch_size = 8

    # Scale the warmup to the schedule length. A fixed 100 warmup steps is
    # longer than the whole run on a small dataset, so the learning rate
    # would never leave its warmup ramp. (Single-device estimate.)
    total_steps = math.ceil(len(train_texts) / batch_size) * num_epochs
    warmup_steps = min(100, total_steps // 10)

    training_kwargs = dict(
        output_dir='./results',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        remove_unused_columns=False,
    )

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    accepted_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_param = 'eval_strategy' if 'eval_strategy' in accepted_params else 'evaluation_strategy'
    training_kwargs[strategy_param] = "epoch"

    training_args = TrainingArguments(**training_kwargs)

    # Custom metrics
    def compute_metrics(eval_pred):
        """Turn (logits, labels) into accuracy and weighted precision/recall/F1."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        from sklearn.metrics import accuracy_score, precision_recall_fscore_support

        accuracy = accuracy_score(labels, predictions)
        # zero_division=0 scores a never-predicted class as 0
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='weighted', zero_division=0
        )

        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train model
    print("Starting training...")
    try:
        trainer.train()
        print("✓ Training completed!")

        # Evaluate
        eval_results = trainer.evaluate()
        print("\nValidation Results:")
        for key, value in eval_results.items():
            if key.startswith('eval_'):
                metric_name = key.replace('eval_', '').replace('_', ' ').title()
                print(f"{metric_name}: {value:.4f}")

        return trainer, val_texts, val_labels

    except Exception as e:
        # Best effort: report the failure and let the notebook continue.
        print(f"Training error: {e}")
        return None

def evaluate_bert_model(trainer, tokenizer, val_texts, val_labels):
    """Report the fine-tuned model's performance on the validation texts.

    Prints a classification report and shows a confusion matrix next to a
    histogram of prediction confidences.

    Args:
        trainer: Trained Hugging Face `Trainer`.
        tokenizer: Tokenizer used to encode the validation texts.
        val_texts: Validation texts.
        val_labels: True labels (0 = non-offensive, 1 = offensive).

    Returns:
        A (y_pred, confidence_scores) tuple of numpy arrays: the predicted
        label and the highest softmax probability for each validation text.
        Returns None when the trainer or tokenizer is missing.
    """

    if not trainer or not tokenizer:
        print("Cannot evaluate BERT model.")
        return

    print("\nEvaluating BERT Model:")
    print("=" * 40)

    # Make predictions
    val_dataset = prepare_bert_data(val_texts, val_labels, tokenizer)
    predictions = trainer.predict(val_dataset)

    # Get predicted labels
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = val_labels

    # Calculate metrics
    from sklearn.metrics import classification_report, confusion_matrix

    class_ids = [0, 1]
    class_names = ['Non-offensive', 'Offensive']

    # Pin the label set: without it, a validation split in which only one
    # class appears makes the report reject the two target names and shrinks
    # the confusion matrix below the 2x2 grid the tick labels assume.
    print("Classification Report:")
    print(classification_report(
        y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0
    ))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    plt.figure(figsize=(12, 5))

    # Plot 1: Confusion Matrix
    plt.subplot(1, 2, 1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('BERT Model - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # Plot 2: Prediction confidence distribution
    plt.subplot(1, 2, 2)
    probabilities = torch.softmax(torch.tensor(predictions.predictions), dim=1)
    # Confidence = probability assigned to the predicted (arg-max) class
    confidence_scores = torch.max(probabilities, dim=1)[0].numpy()
    mean_confidence = np.mean(confidence_scores)

    plt.hist(confidence_scores, bins=20, alpha=0.7, color='lightgreen')
    plt.axvline(mean_confidence, color='red', linestyle='--',
                label=f'Mean: {mean_confidence:.3f}')
    plt.title('Prediction Confidence Distribution')
    plt.xlabel('Confidence Score')
    plt.ylabel('Frequency')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return y_pred, confidence_scores

def create_bert_pipeline(tokenizer, model):
    """Wrap a tokenizer and model in a Hugging Face text-classification pipeline.

    Args:
        tokenizer: Tokenizer to use for inference.
        model: Sequence classification model to use for inference.

    Returns:
        The pipeline object, or None when either component is missing or the
        pipeline could not be constructed.
    """

    if tokenizer and model:
        try:
            # First CUDA device when one is available, otherwise CPU (-1)
            if torch.cuda.is_available():
                target_device = 0
            else:
                target_device = -1

            return pipeline(
                "text-classification",
                model=model,
                tokenizer=tokenizer,
                device=target_device,
            )
        except Exception as e:
            print(f"Error creating pipeline: {e}")
            return None

    print("Cannot create BERT pipeline.")
    return None

# Set up and train BERT model
print("\nSetting up BERT-based Approach:")
print("=" * 60)

# Bind the outputs up front so later cells can reference them on every path.
bert_pipeline = None
bert_predictions = None

if df_analyzed is not None and len(df_analyzed) > 10:  # Ensure sufficient data
    tokenizer, bert_model = setup_bert_model()

    if tokenizer and bert_model:
        # Train BERT model
        bert_trainer = train_bert_model(df_analyzed, tokenizer, bert_model)

        if bert_trainer:
            trainer, val_texts, val_labels = bert_trainer
            bert_predictions = evaluate_bert_model(trainer, tokenizer, val_texts, val_labels)

            # Create inference pipeline from the fine-tuned weights
            bert_pipeline = create_bert_pipeline(tokenizer, trainer.model)

            if bert_pipeline:
                print("\n✓ BERT pipeline ready for inference!")
        else:
            print("❌ BERT training failed")
    else:
        print("❌ Could not set up BERT model")
else:
    print("❌ Insufficient data for BERT training")

## Solution 4: Model Comparison and Testing

In [None]:
def test_models_on_examples(traditional_results, bert_pipeline):
    """Run the best traditional model and the BERT pipeline on fixed example texts.

    Args:
        traditional_results: Nested mapping
            ``{vectorizer_name: {model_name: result}}`` where each ``result``
            provides at least ``'f1'``, ``'model'`` and ``'vectorizer'``.
            ``None`` or empty skips the traditional model.
        bert_pipeline: Callable that, given a text, returns a sequence whose
            first item has ``'label'`` and ``'score'`` keys. Falsy skips BERT.

    Returns:
        A list with one dict per example text. Each dict always has ``'text'``
        and, depending on which models ran, ``'traditional_prediction'``,
        ``'traditional_model'``, ``'traditional_confidence'``,
        ``'bert_prediction'`` and ``'bert_confidence'``. A prediction is the
        string ``'Error'`` when that model raised for the text.
    """
    print("Testing Models on Example Texts:")
    print("=" * 50)

    # Test examples (German and English)
    test_examples = [
        "Das ist ein wundersch√∂ner Tag!",
        "Du bist so dumm!",
        "Ich freue mich auf das Wochenende.",
        "Das ist totaler Schwachsinn!",
        "Vielen Dank f√ºr deine Hilfe.",
        "Du redest nur Unsinn!",
        "This is a beautiful day.",
        "You are so stupid!",
        "I appreciate your help."
    ]

    # Pick the highest-F1 traditional model once: the choice does not depend
    # on the example text, so it is not repeated inside the loop below.
    # Only models with F1 strictly above 0 are considered; ties keep the first.
    best_vec = None
    best_model = None
    best_entry = None
    best_f1 = 0
    if traditional_results:
        for vec_name, vec_results in traditional_results.items():
            for model_name, model_result in vec_results.items():
                if model_result['f1'] > best_f1:
                    best_f1 = model_result['f1']
                    best_vec = vec_name
                    best_model = model_name
                    best_entry = model_result

    results = []

    for text in test_examples:
        print(f"\nText: '{text}'")
        print("-" * (len(text) + 10))

        result = {'text': text}

        # Traditional model (best performing one)
        if best_vec and best_model:
            try:
                model = best_entry['model']
                vectorizer = best_entry['vectorizer']

                # Preprocess and predict
                processed_text = preprocess_text_features([text])[0]
                text_vec = vectorizer.transform([processed_text])
                trad_pred = model.predict(text_vec)[0]
                # Not every classifier exposes probabilities.
                trad_proba = model.predict_proba(text_vec)[0] if hasattr(model, 'predict_proba') else None

                result['traditional_prediction'] = trad_pred
                result['traditional_model'] = f"{best_model} + {best_vec}"
                result['traditional_confidence'] = trad_proba[1] if trad_proba is not None else 'N/A'

                label = "Offensive" if trad_pred == 1 else "Non-offensive"
                conf_str = f" (conf: {trad_proba[1]:.3f})" if trad_proba is not None else ""
                print(f"Traditional ML: {label}{conf_str}")

            except Exception as e:
                # Best effort: one failing example must not abort the run.
                print(f"Traditional ML: Error - {e}")
                result['traditional_prediction'] = 'Error'

        # BERT model
        if bert_pipeline:
            try:
                bert_result = bert_pipeline(text)
                bert_label = bert_result[0]['label']
                bert_score = bert_result[0]['score']

                # Labels containing NEGATIVE or LABEL_1 count as offensive;
                # every other label counts as non-offensive.
                if 'NEGATIVE' in bert_label.upper() or 'LABEL_1' in bert_label.upper():
                    bert_pred = 1
                    label = "Offensive"
                else:
                    bert_pred = 0
                    label = "Non-offensive"

                result['bert_prediction'] = bert_pred
                result['bert_confidence'] = bert_score

                print(f"BERT Model: {label} (conf: {bert_score:.3f})")

            except Exception as e:
                print(f"BERT Model: Error - {e}")
                result['bert_prediction'] = 'Error'

        results.append(result)

    return results

def compare_model_performance(traditional_results, bert_results=None):
    """Tabulate and plot metrics for the traditional models and, optionally, BERT.

    Args:
        traditional_results: Nested mapping
            ``{vectorizer_name: {model_name: result}}``; each ``result`` must
            provide ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
            ``None`` or empty contributes no rows.
        bert_results: Optional mapping with the same four metric keys. A
            missing key is shown as ``'N/A'`` in the table.

    Returns:
        The summary ``pandas.DataFrame`` (one row per model), or ``None`` when
        there is nothing to compare. Also prints the table and shows a 2x2
        matplotlib figure.
    """
    print("\n\nModel Performance Comparison:")
    print("=" * 50)

    comparison_data = []

    # Traditional models
    if traditional_results:
        for vec_name, vec_results in traditional_results.items():
            for model_name, model_result in vec_results.items():
                comparison_data.append({
                    'Model Type': 'Traditional ML',
                    'Model': f"{model_name}",
                    'Features': vec_name,
                    'Accuracy': model_result['accuracy'],
                    'F1-Score': model_result['f1'],
                    'Precision': model_result['precision'],
                    'Recall': model_result['recall']
                })

    # BERT results (if available)
    if bert_results:
        comparison_data.append({
            'Model Type': 'Deep Learning',
            'Model': 'BERT',
            'Features': 'Transformer Embeddings',
            'Accuracy': bert_results.get('accuracy', 'N/A'),
            'F1-Score': bert_results.get('f1', 'N/A'),
            'Precision': bert_results.get('precision', 'N/A'),
            'Recall': bert_results.get('recall', 'N/A')
        })

    if not comparison_data:
        print("No comparison data available.")
        return None

    comparison_df = pd.DataFrame(comparison_data)

    print("\nPerformance Summary:")
    print(comparison_df.to_string(index=False))

    # The table may hold the 'N/A' placeholder for missing BERT metrics.
    # Plotting needs numbers, so work on numeric copies (placeholder -> NaN)
    # and leave the returned table untouched.
    f1_scores = pd.to_numeric(comparison_df['F1-Score'], errors='coerce')
    accuracies = pd.to_numeric(comparison_df['Accuracy'], errors='coerce')
    is_traditional = comparison_df['Model Type'] == 'Traditional ML'
    is_bert = comparison_df['Model Type'] == 'Deep Learning'
    model_names = comparison_df['Model'].tolist()
    positions = range(len(comparison_df))

    plt.figure(figsize=(15, 10))

    # Plot 1: F1-Score comparison
    plt.subplot(2, 2, 1)
    colors = ['lightblue' if trad else 'lightcoral' for trad in is_traditional]
    # Missing scores are drawn as zero-height bars and labelled 'N/A' below.
    bars = plt.bar(positions, f1_scores.fillna(0), color=colors)

    plt.xticks(positions, model_names, rotation=45, ha='right')
    plt.ylabel('F1-Score')
    plt.title('F1-Score Comparison')
    plt.grid(True, alpha=0.3)

    # Add value labels
    for bar, score in zip(bars, f1_scores):
        value_text = 'N/A' if pd.isna(score) else f'{score:.3f}'
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 value_text, ha='center', va='bottom', fontsize=8)

    # Plot 2: Accuracy vs F1-Score
    plt.subplot(2, 2, 2)
    if is_traditional.any():
        plt.scatter(accuracies[is_traditional], f1_scores[is_traditional],
                    label='Traditional ML', s=100, alpha=0.7, color='lightblue')

    if is_bert.any():
        plt.scatter(accuracies[is_bert], f1_scores[is_bert],
                    label='BERT', s=100, alpha=0.7, color='lightcoral')

    plt.xlabel('Accuracy')
    plt.ylabel('F1-Score')
    plt.title('Accuracy vs F1-Score')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Plot 3: Model complexity vs performance
    # Complexity is a fixed ordinal: 1 for traditional ML, 3 for deep learning.
    plt.subplot(2, 2, 3)
    model_complexity = [1 if trad else 3 for trad in is_traditional]
    performance = f1_scores.fillna(0).tolist()  # a missing F1 is plotted as 0

    colors = ['lightblue' if comp == 1 else 'lightcoral' for comp in model_complexity]
    plt.scatter(model_complexity, performance, s=100, alpha=0.7, c=colors)

    for name, comp, perf in zip(model_names, model_complexity, performance):
        plt.annotate(name, (comp, perf),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.xlabel('Model Complexity')
    plt.ylabel('F1-Score')
    plt.title('Complexity vs Performance')
    plt.xticks([1, 3], ['Traditional ML', 'Deep Learning'])
    plt.grid(True, alpha=0.3)

    # Plot 4: Training time comparison (hard-coded estimates, not measurements)
    plt.subplot(2, 2, 4)
    estimated_times = []
    for trad, name in zip(is_traditional, model_names):
        if not trad:  # BERT
            seconds = 300  # 5 minutes
        elif 'SVM' in name:
            seconds = 30
        elif 'Random Forest' in name:
            seconds = 45
        else:
            seconds = 15
        estimated_times.append(seconds)

    colors = ['lightblue' if seconds < 100 else 'lightcoral' for seconds in estimated_times]
    plt.bar(positions, estimated_times, color=colors)

    plt.xticks(positions, model_names, rotation=45, ha='right')
    plt.ylabel('Estimated Training Time (seconds)')
    plt.title('Training Time Comparison')
    plt.yscale('log')

    plt.tight_layout()
    plt.show()

    return comparison_df

def save_best_model(traditional_results, bert_pipeline):
    """Persist the best traditional model and the BERT pipeline, plus a usage guide.

    Everything is written below ``<current working directory>/models``:
    ``best_traditional_model.joblib`` (model, vectorizer, names and metrics of
    the highest-F1 traditional model), ``bert_offensive_language_model/`` (the
    pipeline's model and tokenizer) and ``model_usage_guide.md``.

    Args:
        traditional_results: Nested mapping
            ``{vectorizer_name: {model_name: result}}``; each ``result`` must
            provide ``'model'``, ``'vectorizer'``, ``'accuracy'``, ``'f1'``,
            ``'precision'`` and ``'recall'``. Ignored when joblib is missing.
        bert_pipeline: Pipeline object exposing ``model`` and ``tokenizer``
            attributes with ``save_pretrained``; falsy skips the BERT export.

    Returns:
        A list of human-readable descriptions of the artifacts that were
        saved, or ``None`` when neither joblib nor a BERT pipeline is available.
    """
    if not JOBLIB_AVAILABLE and not bert_pipeline:
        print("Cannot save models - joblib not available and no BERT pipeline.")
        return None

    print("\nSaving Best Model:")
    print("=" * 30)

    PROJECT_ROOT = Path.cwd()
    MODEL_DIR = PROJECT_ROOT / 'models'
    MODEL_DIR.mkdir(exist_ok=True)
    traditional_path = MODEL_DIR / 'best_traditional_model.joblib'
    bert_path = MODEL_DIR / 'bert_offensive_language_model'

    models_saved = []
    # Defined up front so the usage guide can refer to it on every path.
    best_config = None
    bert_saved = False

    # Save best traditional model
    if traditional_results and JOBLIB_AVAILABLE:
        # Only models with F1 strictly above 0 qualify; ties keep the first.
        best_f1 = 0

        for vec_name, vec_results in traditional_results.items():
            for model_name, model_result in vec_results.items():
                if model_result['f1'] > best_f1:
                    best_f1 = model_result['f1']
                    best_config = (vec_name, model_name, model_result)

        if best_config:
            vec_name, model_name, model_result = best_config

            # Bundle model and vectorizer so they are always loaded together.
            model_package = {
                'model': model_result['model'],
                'vectorizer': model_result['vectorizer'],
                'model_name': model_name,
                'vectorizer_name': vec_name,
                'performance': {
                    'accuracy': model_result['accuracy'],
                    'f1': model_result['f1'],
                    'precision': model_result['precision'],
                    'recall': model_result['recall']
                }
            }

            joblib.dump(model_package, traditional_path)

            models_saved.append(f"Traditional ML: {traditional_path}")
            print(f"‚úì Saved traditional model: {model_name} + {vec_name}")
            print(f"  Performance - F1: {best_f1:.3f}")

    # Save BERT model (if trained)
    if bert_pipeline:
        try:
            bert_path.mkdir(exist_ok=True)

            # Actually write the weights and tokenizer files: the usage guide
            # below loads the pipeline from this directory.
            bert_pipeline.model.save_pretrained(bert_path)
            bert_pipeline.tokenizer.save_pretrained(bert_path)

            bert_saved = True
            models_saved.append(f"BERT model: {bert_path}")
            print(f"‚úì BERT model saved to: {bert_path}")

        except Exception as e:
            # Best effort: a failed BERT export must not lose the traditional model.
            print(f"Error saving BERT model: {e}")

    # Create usage example
    if models_saved:
        if best_config:
            traditional_summary = (
                f"Best Traditional Model: {best_config[1]} + {best_config[0]} "
                f"(F1: {best_config[2]['f1']:.3f})"
            )
        else:
            traditional_summary = "No traditional model available"

        if bert_saved:
            bert_summary = "BERT Model: Available for inference"
        else:
            bert_summary = "BERT Model: Not available"

        # POSIX-style paths keep the generated string literals valid even when
        # the native path contains backslashes.
        traditional_literal = traditional_path.as_posix()
        bert_literal = bert_path.as_posix()

        usage_example = f'''# Model Usage Example

## Loading and Using Saved Models

### Traditional ML Model
```python
import joblib

# Load the model
model_package = joblib.load('{traditional_literal}')
model = model_package['model']
vectorizer = model_package['vectorizer']

# Make predictions
def predict_offensive(text):
    # Preprocess text (same as training)
    processed = preprocess_text_features([text])[0]
    
    # Vectorize
    text_vec = vectorizer.transform([processed])
    
    # Predict
    prediction = model.predict(text_vec)[0]
    probability = model.predict_proba(text_vec)[0][1] if hasattr(model, 'predict_proba') else None
    
    return {{
        'prediction': 'Offensive' if prediction == 1 else 'Non-offensive',
        'confidence': probability
    }}

# Example usage
result = predict_offensive("Das ist ein sch√∂ner Tag!")
print(result)
```

### BERT Model
```python
from transformers import pipeline

# Load the pipeline (if BERT was trained)
classifier = pipeline("text-classification", model="{bert_literal}")

# Make predictions
result = classifier("Das ist ein sch√∂ner Tag!")
print(result)
```

## Model Performance Summary
{traditional_summary}
{bert_summary}

## Deployment Notes
- Traditional models are lightweight and fast
- BERT models provide better accuracy but require more resources
- Consider ensemble methods for production use
- Implement proper input validation and preprocessing
'''

        usage_path = MODEL_DIR / 'model_usage_guide.md'
        with open(usage_path, 'w', encoding='utf-8') as f:
            f.write(usage_example)

        print(f"‚úì Created usage guide: {usage_path}")

    return models_saved

# Run the example tests, the metric comparison and the model export, but only
# if the traditional-ML results exist in this namespace.
print("\\n\\nTesting and Comparing Models:")
print("=" * 60)

if 'traditional_results' not in locals():
    print("‚ùå No models were trained successfully.")
else:
    # Example predictions; BERT is passed only if the name was ever defined.
    test_results = test_models_on_examples(traditional_results,
                                           locals().get('bert_pipeline'))

    # Metric table and plots
    comparison_summary = compare_model_performance(traditional_results)

    # Persist the best models
    saved_models = save_best_model(traditional_results,
                                   locals().get('bert_pipeline'))

    print("\\n\\nüéâ Offensive Language Detection Solutions Completed!")
    print("=" * 60)
    print(
        "\\nüìä What was accomplished:",
        "‚Ä¢ Dataset creation and analysis",
        "‚Ä¢ Traditional ML models (Logistic Regression, Random Forest, SVM, Naive Bayes)",
        "‚Ä¢ Multiple feature extraction methods (TF-IDF, Count Vectorizer, Char-level)",
        "‚Ä¢ BERT-based deep learning approach",
        "‚Ä¢ Comprehensive model evaluation and comparison",
        "‚Ä¢ Model persistence and deployment preparation",
        sep="\n",
    )

    if saved_models:
        print("\\nüíæ Saved models:")
        for model_path in saved_models:
            print(f"  ‚Ä¢ {model_path}")

    print(
        "\\nüöÄ Key techniques covered:",
        "‚Ä¢ Text preprocessing and feature engineering",
        "‚Ä¢ Traditional machine learning pipelines",
        "‚Ä¢ Transformer-based classification",
        "‚Ä¢ Model evaluation and comparison",
        "‚Ä¢ Production deployment considerations",
        sep="\n",
    )