In [None]:
import re
import sys
import logging
import numpy as np
from typing import Dict, Any

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm.auto import tqdm

# Root-logger setup for this module: records at INFO and above are written
# both to the file "testing.log" and to a stream handler, formatted as
# "<timestamp> - <level> - <message>".
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers, so whichever module calls it first in a process wins.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler("testing.log"), logging.StreamHandler()],
)

class CybercrimeClassifier:
    def __init__(self, min_samples_per_class: int = 2) -> None:
        """Download NLTK resources and set up encoders and vectorizer settings.

        Args:
            min_samples_per_class: Smallest number of rows a label needs in
                order to survive ``filter_rare_classes``.
        """
        # "punkt_tab" is the tokenizer data that word_tokenize loads on
        # NLTK 3.9 and later; "punkt" is kept for older releases. Without the
        # right one, tokenization raises LookupError, which preprocess_text
        # swallows, turning every text into an empty string.
        for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
            nltk.download(resource, quiet=True)

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words("english"))
        # Courtesy words that carry no signal about the type of complaint.
        self.stop_words.update(['please', 'help', 'thank', 'thanks', 'sir', 'madam', 'kindly'])

        # One encoder and one model slot per prediction target.
        self.label_encoders = {
            "category": LabelEncoder(),
            "sub_category": LabelEncoder(),
        }
        self.models = {
            "category": None,
            "sub_category": None
        }
        self.min_samples_per_class = min_samples_per_class
        # Keyword arguments forwarded to TfidfVectorizer by build_model.
        self.vectorizer_params = {
            'max_features': 10000,
            'ngram_range': (1, 3),  # unigrams up to trigrams
            'min_df': 2,
            'max_df': 0.95,
            'analyzer': 'word',
            'token_pattern': r'\b\w+\b',  # also keeps one-character tokens
        }

    def preprocess_text(self, text: str) -> str:
        """Normalise a complaint text and append joined n-gram tokens.

        NOTE(review): a second ``preprocess_text`` is defined further down in
        this class. Being later in the class body it replaces this
        definition, so this version is not the one bound at runtime.

        Steps: lower-case; strip URLs, e-mail addresses and phone-like digit
        runs; collapse whitespace; drop every character except ASCII letters,
        whitespace and ``! ? .``; tokenize; drop stop words and tokens of two
        characters or fewer; lemmatize; append ``a_b`` bigrams and ``a_b_c``
        trigrams built from the surviving tokens.

        Returns:
            The space-joined tokens and n-grams, or "" when fewer than three
            items remain or when any step raises (the error is logged).
        """
        try:
            cleaned = text if isinstance(text, str) else str(text)
            cleaned = cleaned.lower()

            # Order matters: URLs, e-mails and phone numbers must go before
            # the final character filter removes digits and symbols.
            substitutions = (
                (r'http\S+|www\S+', ''),
                (r'\S+@\S+', ''),
                (r'\+?\d{10,}|\+?\d{3}[-\s]?\d{3}[-\s]?\d{4}', ''),
                (r'\s+', ' '),
                (r'[^a-zA-Z\s!?.]', ''),
            )
            for pattern, replacement in substitutions:
                cleaned = re.sub(pattern, replacement, cleaned)

            words = []
            for candidate in word_tokenize(cleaned):
                if len(candidate) > 2 and candidate not in self.stop_words:
                    words.append(self.lemmatizer.lemmatize(candidate))

            # Adjacent tokens glued with "_" so they count as single terms.
            pairs = [f"{first}_{second}" for first, second in zip(words, words[1:])]
            triples = [
                f"{first}_{second}_{third}"
                for first, second, third in zip(words, words[1:], words[2:])
            ]

            processed_text = ' '.join(words + pairs + triples)

            # Too little text left to be useful.
            if len(processed_text.split()) < 3:
                return ""

            return processed_text

        except Exception as exc:
            logging.error(f"Error preprocessing text: {str(exc)}")
            return ""

    def build_model(self, class_labels: np.ndarray) -> GridSearchCV:
        """Build an unfitted grid search over a TF-IDF + random-forest pipeline.

        Args:
            class_labels: Integer-encoded training labels; used only to derive
                per-class weights.

        Returns:
            A ``GridSearchCV`` (5-fold, scored with ``f1_weighted``) wrapping
            the pipeline. The grid holds 48 parameter combinations, so a fit
            trains 240 candidate pipelines before the final refit.
        """
        # Count only the labels that actually occur. np.bincount would return
        # max_label + 1 entries, so a missing label id would give a zero count
        # (infinite weight) and shift every later weight onto the wrong class.
        present_classes, class_counts = np.unique(class_labels, return_counts=True)
        n_samples = len(class_labels)
        n_classes = len(present_classes)

        if n_classes > 1:
            # Inverse-frequency weighting: rarer classes get larger weights.
            class_weights = dict(zip(
                present_classes,
                n_samples / (n_classes * class_counts)
            ))
        else:
            class_weights = None

        pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(**self.vectorizer_params)),
            ("classifier", RandomForestClassifier(
                n_estimators=200,
                max_depth=20,  # caps tree depth to limit overfitting
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1,
                class_weight=class_weights,
                bootstrap=True
            ))
        ])

        # Values here override the pipeline defaults above during the search.
        param_grid = {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [10, 20, None],
            'classifier__min_samples_split': [2, 5],
            'tfidf__max_features': [5000, 10000],
            'tfidf__ngram_range': [(1, 2), (1, 3)]
        }

        return GridSearchCV(
            pipeline,
            param_grid,
            cv=5,
            n_jobs=-1,
            verbose=1,
            scoring='f1_weighted'
        )

    def analyze_class_distribution(self, df):
        """Analyze and print class distribution information"""
        print("\nClass Distribution Analysis:", file=sys.stderr)
        
        for column in ['category', 'sub_category']:
            counts = df[column].value_counts()
            print(f"\n{column.upper()} Distribution:", file=sys.stderr)
            print(f"Total unique classes: {len(counts)}", file=sys.stderr)
            print(f"Classes with only one sample: {sum(counts == 1)}", file=sys.stderr)
            print("\nTop 5 most common classes:", file=sys.stderr)
            print(counts.head().to_string(), file=sys.stderr)
            print("\nClasses with less than minimum samples:", file=sys.stderr)
            print(counts[counts < self.min_samples_per_class].to_string(), file=sys.stderr)
        
        return counts

    def filter_rare_classes(self, df):
        """Filter out classes with too few samples"""
        print("\nFiltering rare classes...", file=sys.stderr)
        
        original_len = len(df)
        
        # Filter both category and sub_category
        for column in ['category', 'sub_category']:
            counts = df[column].value_counts()
            valid_classes = counts[counts >= self.min_samples_per_class].index
            df = df[df[column].isin(valid_classes)]
        
        filtered_len = len(df)
        print(f"Removed {original_len - filtered_len} samples with rare classes", file=sys.stderr)
        print(f"Remaining samples: {filtered_len}", file=sys.stderr)
        
        if filtered_len == 0:
            raise ValueError("No samples remaining after filtering rare classes. Consider lowering min_samples_per_class.")
        
        return df

    def preprocess_text(self, text):
        """Clean and preprocess the text data"""
        try:
            # Convert to string if not already
            if not isinstance(text, str):
                text = str(text)

            # Convert to lowercase
            text = text.lower()

            # Remove special characters and numbers
            text = re.sub(r"[^a-zA-Z\s]", "", text)

            # Tokenize
            tokens = word_tokenize(text)

            # Remove stopwords and lemmatize
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in tokens
                if token not in self.stop_words and len(token) > 2
            ]

            return " ".join(tokens)

        except Exception as e:
            print(f"Error preprocessing text: {str(e)}", file=sys.stderr)
            return ""  # Return empty string in case of error

    def prepare_data(self, df):
        """Filter, preprocess and label-encode a frame for training.

        Expects the columns ``category``, ``sub_category`` and
        ``crimeaditionalinfo``. The caller's frame is not modified.

        NOTE: the label encoders are refitted on every call, so calling this
        on a second frame replaces the mapping learned from the first.

        Returns:
            A new frame with the added columns ``processed_text``,
            ``category_encoded`` and ``sub_category_encoded``.

        Raises:
            ValueError: From ``filter_rare_classes`` when no rows remain
                (including when ``df`` is empty).
        """
        # Add one synthetic 'Unknown' row per target that lacks that label.
        # It only survives the rare-class filter below when
        # min_samples_per_class is 1. Skipped for an empty frame, where
        # df.iloc[0] would raise IndexError before the clearer ValueError.
        for column in ['category', 'sub_category']:
            if len(df) > 0 and 'Unknown' not in df[column].unique():
                unknown_row = df.iloc[0].copy()
                unknown_row[column] = 'Unknown'
                unknown_row['crimeaditionalinfo'] = 'unknown case'
                df = pd.concat([df, pd.DataFrame([unknown_row])], ignore_index=True)

        # Analyze initial class distribution
        self.analyze_class_distribution(df)

        # Filter rare classes; copy so the column assignments below write to
        # an independent frame instead of a slice of the input.
        df = self.filter_rare_classes(df).copy()

        # Analyze class distribution after filtering
        print("\nClass distribution after filtering:", file=sys.stderr)
        self.analyze_class_distribution(df)

        # Preprocess the text data
        print("\nPreprocessing text data...", file=sys.stderr)
        df['processed_text'] = [
            self.preprocess_text(text)
            for text in tqdm(df['crimeaditionalinfo'], desc="Preprocessing Text")
        ]

        # Remove rows whose text was reduced to nothing
        df = df[df['processed_text'].str.len() > 0].copy()
        print(f"Samples after removing empty processed texts: {len(df)}", file=sys.stderr)

        # Encode labels
        print("\nEncoding labels...", file=sys.stderr)
        for column in ['category', 'sub_category']:
            df[f'{column}_encoded'] = self.label_encoders[column].fit_transform(df[column])
            print(f"Number of unique {column}s: {len(self.label_encoders[column].classes_)}", file=sys.stderr)

        return df

    def train(self, train_df: pd.DataFrame, test_df: pd.DataFrame = None) -> bool:
        """Fit one grid-searched model per target and report held-out quality.

        Args:
            train_df: Raw training frame; passed through ``prepare_data``.
            test_df: Optional raw evaluation frame. When omitted, a stratified
                20% split of the prepared training data is held out instead
                (one independent split per target).

        Returns:
            True when both models were trained.

        Raises:
            ValueError: If too few rows remain after preparation.
            Exception: Any other failure is logged and re-raised.
        """
        target_columns = ['category', 'sub_category']
        try:
            # Prepare training data
            prepared_train_df = self.prepare_data(train_df)

            if len(prepared_train_df) < self.min_samples_per_class * 2:
                raise ValueError(f"Not enough samples for training. Need at least {self.min_samples_per_class * 2} samples.")

            X = prepared_train_df['processed_text']
            X_train, X_val, y_train_dict, y_val_dict = {}, {}, {}, {}

            if test_df is None:
                # Stratified split to maintain class distribution
                for column in target_columns:
                    y = prepared_train_df[f'{column}_encoded']
                    (X_train[column], X_val[column],
                     y_train_dict[column], y_val_dict[column]) = train_test_split(
                        X, y, test_size=0.2, random_state=42, stratify=y
                    )
            else:
                # prepare_data refits the label encoders, so remember the
                # classes learned from the training data and restore them
                # afterwards. Otherwise evaluation and predict() would decode
                # ids with a mapping the models were never trained on.
                train_classes = {
                    column: self.label_encoders[column].classes_.copy()
                    for column in target_columns
                }
                try:
                    prepared_test_df = self.prepare_data(test_df)
                finally:
                    for column in target_columns:
                        self.label_encoders[column].fit(train_classes[column])

                for column in target_columns:
                    encoder = self.label_encoders[column]

                    # Labels never seen in training cannot be encoded.
                    known = prepared_test_df[column].isin(encoder.classes_)
                    unseen_rows = int((~known).sum())
                    if unseen_rows:
                        logging.warning(
                            f"Skipping {unseen_rows} evaluation rows whose {column} "
                            f"label was not seen during training"
                        )
                    evaluable = prepared_test_df[known]

                    X_val[column] = evaluable['processed_text']
                    y_val_dict[column] = encoder.transform(evaluable[column])
                    X_train[column] = X
                    y_train_dict[column] = prepared_train_df[f'{column}_encoded']

            # Train and evaluate models for each target
            for column in target_columns:
                print(f"\nTraining {column} model...", file=sys.stderr)

                # Build and train the model with grid search
                self.models[column] = self.build_model(y_train_dict[column])
                self.models[column].fit(X_train[column], y_train_dict[column])

                # Print best parameters
                print(f"\nBest parameters for {column}:", file=sys.stderr)
                print(self.models[column].best_params_, file=sys.stderr)

                if len(X_val[column]) == 0:
                    logging.warning(f"No evaluation rows available for {column}; skipping report")
                    continue

                # Make predictions and evaluate
                y_pred = self.models[column].predict(X_val[column])
                class_names = self.label_encoders[column].classes_

                # Passing labels explicitly keeps target_names aligned even
                # when some class is absent from the held-out rows.
                print(f"\n{column.upper()} Classification Report:", file=sys.stderr)
                print(classification_report(
                    y_val_dict[column],
                    y_pred,
                    labels=np.arange(len(class_names)),
                    target_names=class_names,
                    zero_division=1
                ))

                # Error analysis
                self._perform_error_analysis(
                    X_val[column],
                    y_val_dict[column],
                    y_pred,
                    class_names,
                    column
                )

            return True

        except Exception as e:
            logging.error(f"Error during training: {str(e)}")
            raise

    def _perform_error_analysis(self, X_val, y_true, y_pred, class_names, column):
        """Analyze prediction errors to identify patterns"""
        # Convert encoded labels back to original names
        y_true_names = [class_names[i] for i in y_true]
        y_pred_names = [class_names[i] for i in y_pred]
        
        # Find misclassified examples
        errors = [(true, pred, text) for true, pred, text in zip(y_true_names, y_pred_names, X_val) if true != pred]
        
        if errors:
            print(f"\nError Analysis for {column}:", file=sys.stderr)
            print(f"Total errors: {len(errors)}", file=sys.stderr)
            
            # Analyze common misclassifications
            misclass_pairs = [(true, pred) for true, pred, _ in errors]
            common_errors = pd.DataFrame(misclass_pairs, columns=['True', 'Predicted']).value_counts().head()
            
            print("\nMost common misclassifications:", file=sys.stderr)
            print(common_errors.to_string(), file=sys.stderr)

    def predict(self, text: str) -> Dict[str, Any]:
        """Enhanced prediction with confidence thresholds and error handling"""
        try:
            processed_text = self.preprocess_text(text)
            if not processed_text:
                return self._get_unknown_prediction()

            results = {}
            for column in ['category', 'sub_category']:
                if self.models[column] is None:
                    raise ValueError(f"Model for {column} is not trained")
                
                try:
                    # Get prediction probabilities
                    probas = self.models[column].predict_proba([processed_text])[0]
                    max_proba = max(probas)
                    prediction = self.models[column].predict([processed_text])[0]
                    
                    # Use confidence threshold
                    if max_proba < 0.3:  # Adjusted confidence threshold
                        results[column] = "Unknown"
                        results[f"{column}_confidence"] = 0.0
                    else:
                        results[column] = self.label_encoders[column].classes_[prediction]
                        results[f"{column}_confidence"] = float(max_proba)
                
                except Exception as e:
                    logging.error(f"Error predicting {column}: {str(e)}")
                    results.update(self._get_unknown_prediction(column))

            return results

        except Exception as e:
            logging.error(f"Error during prediction: {str(e)}")
            return self._get_unknown_prediction()

    def _get_unknown_prediction(self, column: str = None) -> Dict[str, Any]:
        """Helper method to return unknown prediction"""
        if column:
            return {
                column: "Unknown",
                f"{column}_confidence": 0.0
            }
        return {
            "category": "Unknown",
            "category_confidence": 0.0,
            "sub_category": "Unknown",
            "sub_category_confidence": 0.0
        }

In [None]:
import logging
import sys
import time
import json
from datetime import datetime
from pathlib import Path
from typing import Tuple, Dict, Any

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import joblib

from cybercrime_classifier import CybercrimeClassifier

# Configure logging: records at INFO and above go to the file "training.log"
# and to a stream handler, with the source file name and line number included
# in every record.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers. If the `cybercrime_classifier` module imported above is the
# classifier code that itself calls basicConfig at import time, this call is
# likely ignored and records keep going to that module's handlers. Confirm,
# and consider `force=True` (Python 3.8+) if this configuration should win.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)

# Module-level logger used by the training pipeline functions in this file.
logger = logging.getLogger(__name__)

class DatasetAnalyzer:
    """Analyzes dataset characteristics and quality."""
    
    @staticmethod
    def analyze_text_length(df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze text length distribution."""
        text_lengths = df['crimeaditionalinfo'].str.len()
        return {
            'min_length': int(text_lengths.min()),
            'max_length': int(text_lengths.max()),
            'mean_length': float(text_lengths.mean()),
            'median_length': float(text_lengths.median()),
            'std_length': float(text_lengths.std())
        }
    
    @staticmethod
    def analyze_class_distribution(df: pd.DataFrame) -> Dict[str, Dict[str, int]]:
        """Analyze class distribution for categories and sub-categories."""
        return {
            'category': df['category'].value_counts().to_dict(),
            'sub_category': df['sub_category'].value_counts().to_dict()
        }
    
    @staticmethod
    def analyze_class_overlap(train_df: pd.DataFrame, test_df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze class overlap between train and test sets."""
        train_categories = set(train_df['category'].unique())
        test_categories = set(test_df['category'].unique())
        train_subcategories = set(train_df['sub_category'].unique())
        test_subcategories = set(test_df['sub_category'].unique())
        
        return {
            'category': {
                'train_only': list(train_categories - test_categories),
                'test_only': list(test_categories - train_categories),
                'common': list(train_categories & test_categories)
            },
            'sub_category': {
                'train_only': list(train_subcategories - test_subcategories),
                'test_only': list(test_subcategories - train_subcategories),
                'common': list(train_subcategories & test_subcategories)
            }
        }
    
    @staticmethod
    def detect_potential_issues(df: pd.DataFrame) -> Dict[str, Any]:
        """Detect potential data quality issues."""
        issues = {
            'duplicate_texts': int(df['crimeaditionalinfo'].duplicated().sum()),
            'very_short_texts': int((df['crimeaditionalinfo'].str.len() < 20).sum()),
            'very_long_texts': int((df['crimeaditionalinfo'].str.len() > 1000).sum()),
            'potential_noise': int((df['crimeaditionalinfo'].str.contains(r'[^\w\s.,!?@#$%&*()]')).sum())
        }
        return issues

    def prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare and filter data, ensuring minimum samples per class."""
        try:
            prepared_df = df.copy()
            
            # Preprocess text
            prepared_df['processed_text'] = prepared_df['crimeaditionalinfo'].apply(self.preprocess_text)
            
            # Remove rows where preprocessing resulted in empty strings
            prepared_df = prepared_df[prepared_df['processed_text'].str.len() > 0]
            
            # Filter out classes with insufficient samples
            for column in ['category', 'sub_category']:
                # Get class counts
                class_counts = prepared_df[column].value_counts()
                
                # Get valid classes (those with enough samples)
                valid_classes = class_counts[class_counts >= self.min_samples_per_class].index
                
                if len(valid_classes) == 0:
                    raise ValueError(f"No classes have the minimum required {self.min_samples_per_class} samples")
                
                # Filter dataframe to keep only valid classes
                prepared_df = prepared_df[prepared_df[column].isin(valid_classes)]
                
                # Fit label encoder on valid classes
                self.label_encoders[column].fit(valid_classes)
                
                # Encode labels
                prepared_df[f'{column}_encoded'] = self.label_encoders[column].transform(prepared_df[column])
                
                # Log removed classes
                removed_classes = set(class_counts.index) - set(valid_classes)
                if removed_classes:
                    logging.warning(f"Removed {len(removed_classes)} {column} classes with fewer than "
                                f"{self.min_samples_per_class} samples: {removed_classes}")
            
            return prepared_df

        except Exception as e:
            logging.error(f"Error preparing data: {str(e)}")
            raise

def clean_dataframe(df: pd.DataFrame, is_training: bool = True) -> Tuple[pd.DataFrame, list]:
    """Clean the raw complaint frame and report the effect of every step.

    Args:
        df: Frame with a ``crimeaditionalinfo`` text column and, optionally,
            ``category`` / ``sub_category`` label columns. Not modified.
        is_training: When True, duplicate texts and texts shorter than 20
            characters are removed as well.

    Returns:
        The cleaned frame and a list with one dict per step holding ``step``,
        ``description``, ``rows_before``, ``rows_after`` and ``rows_removed``.
    """
    df = df.copy()

    # Missing values are filled and the text is cast to str BEFORE empty
    # texts are removed. The other order lets NaN texts through the
    # "is it empty?" check (NaN != '' is True) only to turn them into ''
    # afterwards, and `.str` fails outright on a non-string column.
    # Empty-text removal runs last so texts that become blank once special
    # characters are replaced are dropped too.
    cleaning_steps = [
        ('Initial shape', lambda x: x, 'Initial dataset loaded'),
        ('Fill missing values', lambda x: x.fillna({'category': 'Unknown', 'sub_category': 'Unknown', 'crimeaditionalinfo': ''}), 'Filled missing values'),
        ('Convert to string', lambda x: x.assign(crimeaditionalinfo=x['crimeaditionalinfo'].astype(str)), 'Converted text to string'),
        ('Clean special characters', lambda x: x.assign(
            crimeaditionalinfo=x['crimeaditionalinfo'].str.replace(r'[^\w\s.,!?@#$%&*()]', ' ', regex=True)
        ), 'Cleaned special characters'),
        ('Remove empty texts', lambda x: x[x['crimeaditionalinfo'].str.strip() != ''], 'Removed empty texts'),
    ]

    # Only remove duplicates and very short texts from training data
    if is_training:
        cleaning_steps.extend([
            ('Remove duplicates', lambda x: x.drop_duplicates(subset='crimeaditionalinfo'), 'Removed duplicate texts'),
            ('Remove very short texts', lambda x: x[x['crimeaditionalinfo'].str.len() >= 20], 'Removed very short texts')
        ])

    cleaning_report = []

    for step_name, step_func, step_desc in tqdm(cleaning_steps, desc="Cleaning Data"):
        rows_before = df.shape[0]
        df = step_func(df)
        rows_after = df.shape[0]

        cleaning_report.append({
            'step': step_name,
            'description': step_desc,
            'rows_before': rows_before,
            'rows_after': rows_after,
            'rows_removed': rows_before - rows_after
        })

    return df, cleaning_report

def save_metrics(metrics: Dict[str, Any], filename: str):
    """Write ``metrics`` to ``filename`` as JSON indented by two spaces.

    An existing file is overwritten.
    """
    with Path(filename).open('w') as sink:
        json.dump(metrics, sink, indent=2)

def train_and_save_model(
    train_path: str = 'data/train.csv',
    test_path: str = 'data/test.csv',
    min_samples_per_class: int = 5,
    output_dir: str = 'output'
) -> Tuple[str, str]:
    """Run the training pipeline and persist the model and a metrics file.

    Args:
        train_path: CSV file with the training data.
        test_path: CSV file with the test data.
        min_samples_per_class: Minimum rows a label needs to be kept.
        output_dir: Directory for the model and metrics files; created
            (including parents) when missing.

    Returns:
        The paths of the saved model file and of the metrics JSON file.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        start_time = time.time()
        logger.info("Starting enhanced training pipeline...")

        # parents=True so a nested output path does not fail.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Load data
        logger.info("Loading datasets...")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Analyze initial class distribution
        logger.info("Analyzing class distribution...")
        for column in ['category', 'sub_category']:
            class_counts = train_df[column].value_counts()
            logger.info(f"\n{column} distribution:")
            logger.info(f"Classes with 1 sample: {sum(class_counts == 1)}")
            logger.info(f"Classes with 2-4 samples: {sum((class_counts >= 2) & (class_counts < 5))}")
            logger.info(f"Classes with 5+ samples: {sum(class_counts >= 5)}")

        # Clean data
        train_df_cleaned, train_cleaning_report = clean_dataframe(train_df, is_training=True)
        test_df_cleaned, test_cleaning_report = clean_dataframe(test_df, is_training=False)

        # Label overlap is taken on the cleaned frames, where missing labels
        # have already been replaced, so every entry is JSON-friendly.
        class_overlap = DatasetAnalyzer.analyze_class_overlap(train_df_cleaned, test_df_cleaned)

        # Initialize classifier with minimum samples requirement
        classifier = CybercrimeClassifier(min_samples_per_class=min_samples_per_class)

        # Prepare and filter data
        logger.info(f"Preparing data with minimum {min_samples_per_class} samples per class...")
        try:
            train_df_filtered = classifier.prepare_data(train_df_cleaned)
            test_df_filtered = classifier.prepare_data(test_df_cleaned)

            logger.info(f"Training data shape after filtering: {train_df_filtered.shape}")
            logger.info(f"Test data shape after filtering: {test_df_filtered.shape}")

            # Create validation set
            train_final, val_df = train_test_split(
                train_df_filtered,
                test_size=0.1,
                random_state=42
            )

            # Train model
            logger.info("Training model...")
            classifier.train(train_final, val_df)

            # Save model and metrics
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = output_dir / f'cybercrime_classifier_{timestamp}.joblib'
            metrics_filename = output_dir / f'training_metrics_{timestamp}.json'

            joblib.dump(classifier, model_filename)

            # Grid-search results per target. getattr keeps this safe if a
            # model object does not expose these attributes.
            model_performance = {}
            for column, model in classifier.models.items():
                best_score = getattr(model, 'best_score_', None)
                model_performance[column] = {
                    'best_cv_score': None if best_score is None else float(best_score),
                    'best_params': getattr(model, 'best_params_', None)
                }

            # 'model_performance' and 'data_analysis' are the sections the
            # script entry point reads back after training.
            metrics = {
                'timestamp': timestamp,
                'training_time': time.time() - start_time,
                'data_shapes': {
                    'initial_train': list(train_df.shape),
                    'filtered_train': list(train_df_filtered.shape),
                    'validation': list(val_df.shape),
                    'test': list(test_df_filtered.shape)
                },
                'class_counts': {
                    'category': train_df_filtered['category'].value_counts().to_dict(),
                    'sub_category': train_df_filtered['sub_category'].value_counts().to_dict()
                },
                'model_performance': model_performance,
                'data_analysis': {
                    'initial': {'class_overlap': class_overlap},
                    'cleaning': {
                        'train': train_cleaning_report,
                        'test': test_cleaning_report
                    }
                }
            }

            save_metrics(metrics, str(metrics_filename))

            return str(model_filename), str(metrics_filename)

        except ValueError as ve:
            logger.error(f"Data preparation failed: {str(ve)}")
            raise

    except Exception as e:
        logger.error(f"Training failed: {str(e)}")
        raise

if __name__ == "__main__":
    # Script entry point: train with the default paths, then print a short
    # summary read back from the metrics file. Any failure exits with code 1.
    try:
        model_file, metrics_file = train_and_save_model()
        print("\nTraining completed successfully!")
        print(f"Model saved as: {model_file}")
        print(f"Metrics saved as: {metrics_file}")

        # Load and display key metrics
        with open(metrics_file, 'r') as f:
            metrics = json.load(f)

        print("\nKey Performance Metrics:")
        print(f"Training Time: {metrics['training_time']:.2f} seconds")

        # Optional sections are read with .get so that a metrics file without
        # them does not turn a successful training run into a failure.
        performance = metrics.get('model_performance')
        if performance is not None:
            print("\nModel Performance:")
            print(json.dumps(performance, indent=2))

        # Print class overlap warnings if any
        overlap = metrics.get('data_analysis', {}).get('initial', {}).get('class_overlap', {})
        unseen_categories = overlap.get('category', {}).get('test_only', [])
        unseen_sub_categories = overlap.get('sub_category', {}).get('test_only', [])
        if unseen_categories or unseen_sub_categories:
            print("\nWarning: Test set contains classes not seen during training:")
            if unseen_categories:
                print(f"Categories: {', '.join(map(str, unseen_categories))}")
            if unseen_sub_categories:
                print(f"Sub-categories: {', '.join(map(str, unseen_sub_categories))}")
    except Exception as e:
        print(f"Training failed: {str(e)}")
        sys.exit(1)