In [1]:
import os
# Move the kernel's working directory one level up (presumably to the project
# root, so that relative config/artifact paths resolve — confirm).
# NOTE(review): the target is relative to the current directory, so running
# this cell again without restarting the kernel moves up one more level.
os.chdir("../")
%pwd

'd:\\AI\\NLP\\HandsOn\\sentiment-analysis'

---

# Model Baseline

In [None]:
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from typing import Dict, List, Tuple, Union, Optional, Any

@dataclass(frozen=True)
class ModelDevelopmentConfig:
    """Immutable settings for the model-development stage."""
    # Directory under which each pipeline run creates its timestamped output folder.
    root_dir: Path
    # Directory the pipeline reads X_train.npy, X_test.npy, y_train.npy and
    # y_test.npy from.
    data_files_path: Path


@dataclass
class TrainingResult:
    """Container for the outcome of training and evaluating a single model."""
    # Name the model was trained under.
    model_name: str
    # Evaluation metrics; TrainingManager stores the dict returned by
    # classification_report(..., output_dict=True) in this field.
    # NOTE(review): that report appears to nest one dict per class label, so the
    # values are probably not all plain floats — confirm and widen the annotation.
    metrics: Dict[str, float]
    # Predictions the model produced for the test split.
    predictions: np.ndarray
    # Hyperparameters as returned by the model's get_params().
    parameters: Dict[str, Any]

In [None]:
from SentiScope.constants import (CONFIG_FILE_PATH,
                                  PARAMS_FILE_PATH)
from SentiScope.utils.file_utils import (create_directories,
                                            get_size)
from SentiScope.utils.config_utils import (read_yaml,
                                           Settings,
                                           get_settings)

In [None]:
import json
class ConfigurationManager:
    """Reads the project YAML files and builds stage-specific config objects.

    The configuration returned by ``read_yaml`` is accessed by attribute
    (``config.artifacts_root``, ``config.feature_transformation``,
    ``config.model_development``).
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_latest_report(self) -> Dict:
        """Load ``metadata.json`` from the newest feature_transformation run.

        Run folders are compared by name, so the "newest" run is the one whose
        folder name sorts last (this assumes sortable timestamp names).

        Returns:
            The parsed content of the latest ``metadata.json``.

        Raises:
            FileNotFoundError: If the feature_transformation directory does not
                exist, contains no sub-folders, or the newest sub-folder has no
                ``metadata.json``.
        """
        config = self.config.feature_transformation
        transformation_dir = Path(config.root_dir)

        if not transformation_dir.exists():
            raise FileNotFoundError(
                f"feature_transformation directory not found: {transformation_dir}"
            )

        # Every sub-folder of feature_transformation is one timestamped run.
        timestamp_dirs = [d for d in transformation_dir.iterdir() if d.is_dir()]

        if not timestamp_dirs:
            raise FileNotFoundError("No timestamp folders found in feature_transformation.")

        # The lexicographically largest folder name is the most recent run;
        # max() picks it without sorting the whole list.
        latest_dir = max(timestamp_dirs, key=lambda d: d.name)
        metadata_path = latest_dir / "metadata.json"

        if not metadata_path.exists():
            raise FileNotFoundError(f"metadata.json not found in {latest_dir}.")

        with open(metadata_path, "r", encoding="utf-8") as f:
            report_data = json.load(f)

        return report_data

    def get_model_development_config(self) -> ModelDevelopmentConfig:
        """Build the model-development config for the latest transformed data.

        Creates the model-development root directory and points
        ``data_files_path`` at the sub-folder named after the ``timestamp``
        recorded in the latest feature_transformation metadata.

        Returns:
            A ``ModelDevelopmentConfig`` whose fields are ``Path`` objects.

        Raises:
            FileNotFoundError: Propagated from ``get_latest_report``.
            KeyError: If the metadata has no ``timestamp`` entry.
        """
        config = self.config.model_development
        report_data = self.get_latest_report()

        create_directories([config.root_dir])

        if "timestamp" not in report_data:
            raise KeyError(
                "'timestamp' is missing from the latest feature_transformation metadata.json"
            )
        timestamp = report_data["timestamp"]

        # The transformed arrays live in a folder named after that timestamp.
        data_files_path = Path(config.data_files_path) / f"{timestamp}"

        return ModelDevelopmentConfig(
            # Coerce to Path so the value matches the dataclass annotation.
            root_dir=Path(config.root_dir),
            data_files_path=data_files_path,
        )

In [None]:
from abc import ABC, abstractmethod
from typing import Dict, Any
import numpy as np

class BaseModel(ABC):
    """Contract every model handled by the training code must fulfil.

    All four methods are abstract: a subclass can only be instantiated once it
    overrides each of them.
    """

    @abstractmethod
    def train(self, X_train: Any, y_train: Any) -> None:
        """Fit the model to the features ``X_train`` and labels ``y_train``."""
        ...

    @abstractmethod
    def predict(self, X: Any) -> np.ndarray:
        """Return the model's predictions for the samples in ``X``."""
        ...

    @abstractmethod
    def get_params(self) -> Dict[str, Any]:
        """Return the parameters the model is currently configured with."""
        ...

    @abstractmethod
    def set_params(self, **params) -> None:
        """Update the model's parameters from keyword arguments."""
        ...

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


class SklearnModelWrapper(BaseModel):
    """Adapter that exposes a scikit-learn model through the BaseModel interface.

    Every method forwards to the wrapped object stored in ``self.model`` and
    returns whatever that object returns.
    """

    def __init__(self, model):
        # The wrapped estimator; all other methods delegate to it.
        self.model = model

    def train(self, X_train, y_train):
        """Fit the wrapped model and return the result of its ``fit`` call."""
        fit_result = self.model.fit(X_train, y_train)
        return fit_result

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        predicted = self.model.predict(X)
        return predicted

    def get_params(self):
        """Return the wrapped model's ``get_params()`` result."""
        current = self.model.get_params()
        return current

    def set_params(self, **params):
        """Forward ``params`` to the wrapped model's ``set_params``."""
        updated = self.model.set_params(**params)
        return updated

class LogisticRegressionModel(SklearnModelWrapper):
    """Multinomial logistic regression wrapped for the training pipeline."""

    def __init__(self):
        # NOTE(review): newer scikit-learn releases deprecate the `multi_class`
        # argument — confirm against the installed version.
        estimator = LogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            max_iter=1000,
        )
        super().__init__(estimator)

class SVMModel(SklearnModelWrapper):
    """Linear-kernel support vector classifier wrapped for the training pipeline."""

    def __init__(self):
        # probability=True is requested on the underlying SVC.
        # NOTE(review): no random_state is passed — confirm whether results
        # need to be reproducible across runs.
        estimator = SVC(
            kernel='linear',
            probability=True,
        )
        super().__init__(estimator)

In [None]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Any, List
from sklearn.metrics import classification_report
import numpy as np
from scipy.sparse import issparse
from scipy import sparse
import os
import json
from datetime import datetime
from SentiScope.logging import logger
from datetime import datetime


class TrainingManager:
    """Trains models, evaluates them on the test split and persists the results.

    Each successful run is appended to ``training_history`` and written to
    ``<output_dir>/<model_name>/`` as ``predictions.npy`` and ``results.json``.
    """

    # Keys every data_split dictionary has to provide.
    _REQUIRED_KEYS = ("X_train", "X_test", "y_train", "y_test")
    # Methods a model object has to expose to be handled by this manager.
    _REQUIRED_METHODS = ("train", "predict", "get_params")

    def __init__(self, output_dir: Path):
        """Remember where results go and start with an empty history.

        Args:
            output_dir: Directory below which one folder per model is created.
                Strings are accepted and converted to ``Path``.
        """
        self.training_history: List[TrainingResult] = []
        self.output_dir = Path(output_dir)
        logger.info(f"Initialized TrainingManager with output directory: {output_dir}")

    @staticmethod
    def _n_samples(data) -> int:
        """Return the number of rows of an array, sparse matrix or sequence."""
        shape = getattr(data, "shape", None)
        if shape is not None:
            # Dense arrays and scipy sparse matrices both expose .shape.
            return shape[0]
        return len(data)

    @staticmethod
    def _json_default(value):
        """Convert values the ``json`` module cannot serialize on its own.

        NumPy scalars become Python scalars, arrays become lists, and anything
        else falls back to its string representation.
        """
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return str(value)

    def _validate_data_split(self, data_split):
        """Check that ``data_split`` has all four parts with matching lengths.

        Args:
            data_split: Dictionary with the keys ``X_train``, ``X_test``,
                ``y_train`` and ``y_test``.

        Raises:
            TypeError: If ``data_split`` is not a dictionary.
            ValueError: If a key is missing or a feature matrix and its label
                vector disagree on the number of samples.
        """
        logger.info("Validating data split...")

        if not isinstance(data_split, dict):
            msg = f"data_split must be a dictionary, got {type(data_split)}"
            logger.error(msg)
            raise TypeError(msg)

        missing = [key for key in self._REQUIRED_KEYS if key not in data_split]
        if missing:
            msg = f"Missing required keys in data_split: {missing}"
            logger.error(msg)
            raise ValueError(msg)

        n_train_samples = self._n_samples(data_split["X_train"])
        n_test_samples = self._n_samples(data_split["X_test"])

        if n_train_samples != len(data_split["y_train"]):
            msg = (f"Mismatch in training set dimensions: X_train has {n_train_samples} "
                  f"samples but y_train has {len(data_split['y_train'])} samples")
            logger.error(msg)
            raise ValueError(msg)

        if n_test_samples != len(data_split["y_test"]):
            msg = (f"Mismatch in test set dimensions: X_test has {n_test_samples} "
                  f"samples but y_test has {len(data_split['y_test'])} samples")
            logger.error(msg)
            raise ValueError(msg)

        logger.info("Data split validation completed successfully")

    def _save_result(self, result: TrainingResult) -> Path:
        """Write predictions and metrics of ``result`` below the output directory.

        Returns:
            The directory the files were written to.
        """
        model_dir = self.output_dir / result.model_name
        # parents=True: do not depend on the caller having created output_dir.
        model_dir.mkdir(parents=True, exist_ok=True)

        np.save(model_dir / 'predictions.npy', result.predictions)

        with open(model_dir / 'results.json', 'w') as f:
            json.dump(
                {
                    'metrics': result.metrics,
                    'parameters': result.parameters
                },
                f,
                indent=4,
                # Parameters may hold values json cannot encode natively.
                default=self._json_default,
            )
        return model_dir

    def train_and_evaluate(self, model, model_name: str, data_split) -> TrainingResult:
        """Train ``model``, evaluate it on the test split and persist the outcome.

        Args:
            model: Object exposing ``train``, ``predict`` and ``get_params``.
            model_name: Name used in log messages and as the result folder name.
            data_split: Dictionary with ``X_train``, ``X_test``, ``y_train``
                and ``y_test``.

        Returns:
            The ``TrainingResult`` that was also appended to ``training_history``.

        Raises:
            TypeError, ValueError: If ``data_split`` is invalid.
            AttributeError: If ``model`` lacks a required method.
            Exception: Any error raised during training, prediction, metric
                computation or saving is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Starting training and evaluation for {model_name}")

            self._validate_data_split(data_split)

            for method in self._REQUIRED_METHODS:
                if not hasattr(model, method):
                    msg = f"Model lacks required method: {method}"
                    logger.error(msg)
                    raise AttributeError(msg)

            logger.info(f"Training {model_name}...")
            model.train(data_split["X_train"], data_split["y_train"])

            logger.info(f"Making predictions for {model_name}...")
            predictions = model.predict(data_split["X_test"])

            logger.info(f"Calculating metrics for {model_name}...")
            metrics = classification_report(
                data_split["y_test"],
                predictions,
                output_dict=True
            )

            # Query the parameters once and reuse them for the result and the file.
            result = TrainingResult(
                model_name=model_name,
                metrics=metrics,
                predictions=predictions,
                parameters=model.get_params()
            )

            self._save_result(result)
            self.training_history.append(result)

            logger.info(f"Successfully completed training and evaluation for {model_name}")
            return result

        except Exception as e:
            logger.error(f"Error in train_and_evaluate for {model_name}: {str(e)}")
            raise


In [None]:

class SentimentPipeline:
    """Coordinates data loading, model training and result persistence.

    Each instance writes into its own ``<root_dir>/<timestamp>/`` directory,
    where the timestamp is taken when the instance is created.
    """

    def __init__(self, config: ModelDevelopmentConfig):
        """Create the run directory, the training manager and the model registry.

        Args:
            config: Provides ``root_dir`` (parent of the run directory) and
                ``data_files_path`` (directory holding the ``.npy`` inputs).
        """
        self.config = config
        # Coerce so the "/" joins in prepare_data also work for plain strings.
        self.data_files_path = Path(self.config.data_files_path)

        # Create timestamped output directory
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = Path(self.config.root_dir) / self.timestamp
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.training_manager = TrainingManager(self.output_dir)
        # Registry of models to train; callers may add entries before train_models().
        self.models = {
            'logistic_regression': LogisticRegressionModel()
        }

        logger.info(f"Initialized SentimentPipeline with output directory: {self.output_dir}")

    @staticmethod
    def _json_default(value):
        """Convert values the ``json`` module cannot serialize on its own.

        NumPy scalars become Python scalars, arrays become lists, and anything
        else falls back to its string representation.
        """
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return str(value)

    @staticmethod
    def _load_feature_matrix(path):
        """Load a feature matrix stored with ``np.save`` and return it as CSR.

        Handles both a 0-d object array wrapping a single object (what
        ``np.save`` produces for a scipy sparse matrix) and a regular dense
        array.
        """
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # load artifacts produced by a trusted pipeline stage.
        loaded = np.load(path, allow_pickle=True)
        if loaded.dtype == object and loaded.ndim == 0:
            # Unwrap the stored object explicitly. The previous `.all()` call is
            # a logical-AND reduction that only returned the object by accident
            # and collapsed a dense matrix to a single boolean.
            loaded = loaded.item()
        return sparse.csr_matrix(loaded)

    def prepare_data(self):
        """Load the train/test arrays and record their shapes and sources.

        Reads ``X_train.npy``, ``X_test.npy``, ``y_train.npy`` and
        ``y_test.npy`` from ``data_files_path`` and writes ``metadata.json``
        into the run directory.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` where the feature
            matrices are CSR sparse matrices and the labels are NumPy arrays.

        Raises:
            Exception: Any loading or writing error is logged and re-raised.
        """
        try:
            logger.info("Loading prepared data from artifacts...")

            # Construct paths to the data files
            x_train_path = self.data_files_path / "X_train.npy"
            x_test_path = self.data_files_path / "X_test.npy"
            y_train_path = self.data_files_path / "y_train.npy"
            y_test_path = self.data_files_path / "y_test.npy"

            X_train_sparse = self._load_feature_matrix(x_train_path)
            X_test_sparse = self._load_feature_matrix(x_test_path)
            # NOTE(security): see _load_feature_matrix regarding allow_pickle.
            y_train = np.load(y_train_path, allow_pickle=True)
            y_test = np.load(y_test_path, allow_pickle=True)

            logger.info("Data successfully loaded and converted to sparse format")

            # Save data info to metadata
            metadata = {
                'timestamp': self.timestamp,
                'data_shapes': {
                    'X_train': X_train_sparse.shape,
                    'X_test': X_test_sparse.shape,
                    'y_train': y_train.shape,
                    'y_test': y_test.shape
                },
                'data_source': {
                    'X_train': str(x_train_path),
                    'X_test': str(x_test_path),
                    'y_train': str(y_train_path),
                    'y_test': str(y_test_path)
                }
            }

            with open(self.output_dir / 'metadata.json', 'w') as f:
                json.dump(metadata, f, indent=4, default=self._json_default)

            return X_train_sparse, X_test_sparse, y_train, y_test

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def train_models(self):
        """Train every model in ``self.models`` and write a run summary.

        Returns:
            Dictionary mapping each model name to the result returned by
            ``TrainingManager.train_and_evaluate``.

        Raises:
            Exception: Any error from data preparation, training or writing
                the summary is logged and re-raised.
        """
        try:
            logger.info("Starting model training pipeline")

            x_train, x_test, y_train, y_test = self.prepare_data()
            data_split = {
                "X_train": x_train,
                "X_test": x_test,
                "y_train": y_train,
                "y_test": y_test
            }

            results = {}
            for name, model in self.models.items():
                logger.info(f"Training model: {name}")
                results[name] = self.training_manager.train_and_evaluate(
                    model, name, data_split
                )

            # Save final summary
            summary = {
                'timestamp': self.timestamp,
                'models_trained': list(self.models.keys()),
                'results': {
                    name: {
                        'metrics': result.metrics,
                        'parameters': result.parameters
                    }
                    for name, result in results.items()
                }
            }

            with open(self.output_dir / 'training_summary.json', 'w') as f:
                # Model parameters may hold values json cannot encode natively.
                json.dump(summary, f, indent=4, default=self._json_default)

            logger.info("Model training pipeline completed successfully")
            return results

        except Exception as e:
            logger.error(f"Error in train_models: {str(e)}")
            raise

In [None]:
# Build the model-development configuration, then train every model that
# SentimentPipeline registers by default.
config = ConfigurationManager()
model_development_config = config.get_model_development_config()
pipeline = SentimentPipeline(config=model_development_config)
results = pipeline.train_models()
# Prints the name -> TrainingResult mapping, including each prediction array.
print(results)

-----

# Experimenting with more advanced models (XGBoost)

In [None]:
import xgboost as xgb
class XGBoostModel(SklearnModelWrapper):
    """XGBoost classifier (scikit-learn API) wrapped for the training pipeline."""

    def __init__(self, num_classes=3, params=None):
        """Build an ``xgb.XGBClassifier`` from defaults plus optional overrides.

        Args:
            num_classes: Value used for the ``num_class`` setting.
            params: Optional settings that replace or extend the defaults;
                ignored when empty or ``None``.
        """
        settings = dict(
            objective='multi:softmax',
            num_class=num_classes,
            eta=0.3,
            max_depth=6,
            eval_metric='merror',
        )
        # Caller-supplied values win over the defaults above.
        overrides = params if params else {}
        settings.update(overrides)

        classifier = xgb.XGBClassifier(**settings)
        super().__init__(classifier)

In [None]:
# Define XGBoost parameters
# Define XGBoost parameters
# These values match the defaults XGBoostModel.__init__ builds for
# num_classes=3; edit them here to experiment with other settings.
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'merror'
}

In [None]:
# Build a fresh pipeline, register XGBoost next to the default model, and train
# every registered model.
config = ConfigurationManager()
model_development_config = config.get_model_development_config()
pipeline = SentimentPipeline(config=model_development_config)
# Adding to pipeline.models makes train_models() train this model as well.
pipeline.models['xgboost'] = XGBoostModel(num_classes=3, params=xgb_params)
results = pipeline.train_models()
# Prints the name -> TrainingResult mapping, including each prediction array.
print(results)

-----

# Advanced Modeling