In [1]:
import os
# Step up one directory level; presumably this makes the project root the
# working directory so the relative config/artifact paths used below resolve.
# NOTE(review): not idempotent - every re-run of this cell climbs one more
# level, so run it exactly once per kernel session.
os.chdir("../")
# IPython magic: display the resulting working directory as the cell output.
%pwd

'd:\\AI\\NLP\\HandsOn\\sentiment-analysis'

In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Union, Optional

@dataclass(frozen=True)
class ModelDevelopmentConfig:
    """Immutable settings for the model-development stage."""
    # Root directory of the model-development stage.
    root_dir: Path
    # Directory from which the pipeline loads X_train.npy, X_test.npy,
    # y_train.npy and y_test.npy.
    data_files_path: Path


In [3]:
from SentiScope.constants import (CONFIG_FILE_PATH,
                                  PARAMS_FILE_PATH)
from SentiScope.utils.file_utils import (create_directories,
                                            get_size)
from SentiScope.utils.config_utils import (read_yaml,
                                           Settings,
                                           get_settings)

In [4]:
import json
class ConfigurationManager:
    """Load the project YAML configuration and build stage-specific configs.

    Constructing the manager reads the config and params YAML files and
    ensures the top-level artifacts directory exists.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the ``artifacts_root`` directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_latest_report(self) -> Dict:
        """Return the parsed ``metadata.json`` of the newest feature-transformation run.

        The newest run is the sub-directory of ``feature_transformation.root_dir``
        whose name sorts last, which assumes the folder names are timestamps in a
        lexicographically sortable format.

        Returns:
            The decoded JSON content of ``metadata.json``.

        Raises:
            FileNotFoundError: If the feature-transformation directory is missing,
                contains no sub-directories, or the newest one has no
                ``metadata.json``.
        """
        config = self.config.feature_transformation
        transformation_dir = Path(config.root_dir)

        # Fail with an explicit message instead of a raw iterdir() error.
        if not transformation_dir.is_dir():
            raise FileNotFoundError(
                f"feature_transformation directory not found: {transformation_dir}"
            )

        # Every sub-directory is treated as one timestamped run.
        timestamp_dirs = [d for d in transformation_dir.iterdir() if d.is_dir()]
        if not timestamp_dirs:
            raise FileNotFoundError("No timestamp folders found in feature_transformation.")

        # The lexicographically greatest name is the most recent timestamp.
        latest_dir = max(timestamp_dirs, key=lambda d: d.name)
        metadata_path = latest_dir / "metadata.json"

        if not metadata_path.exists():
            raise FileNotFoundError(f"metadata.json not found in {latest_dir}.")

        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(metadata_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def get_model_development_config(self) -> ModelDevelopmentConfig:
        """Build the model-development config for the latest transformed data.

        The data directory is ``model_development.data_files_path`` joined with
        the ``timestamp`` recorded in the latest ``metadata.json``. The stage
        root directory is created as a side effect.

        Returns:
            A ``ModelDevelopmentConfig`` whose fields are both ``Path`` objects.

        Raises:
            FileNotFoundError: Propagated from ``get_latest_report``.
            KeyError: If the metadata has no ``timestamp`` entry.
        """
        config = self.config.model_development
        report_data = self.get_latest_report()

        create_directories([config.root_dir])

        if "timestamp" not in report_data:
            raise KeyError(
                "metadata.json of the latest feature_transformation run has no 'timestamp' entry"
            )
        timestamp = report_data["timestamp"]
        data_files_path = Path(config.data_files_path) / str(timestamp)

        # Coerce root_dir so the dataclass really holds the Path it declares.
        return ModelDevelopmentConfig(
            root_dir=Path(config.root_dir),
            data_files_path=data_files_path,
        )

### BaseModel

In [5]:
from abc import ABC, abstractmethod
from typing import Dict, Any
import numpy as np

class BaseModel(ABC):
    """Contract that every model used by the pipeline has to fulfil.

    All four methods are abstract, so a subclass can only be instantiated
    once it overrides each of them.
    """

    @abstractmethod
    def train(self, X_train: Any, y_train: Any) -> None:
        """Fit the model to the given training features and labels."""

    @abstractmethod
    def predict(self, X: Any) -> np.ndarray:
        """Return predictions for the given samples."""

    @abstractmethod
    def get_params(self) -> Dict[str, Any]:
        """Return the parameters the model is currently configured with."""

    @abstractmethod
    def set_params(self, **params) -> None:
        """Update the model's parameters from keyword arguments."""

### Model Baseline

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

class SklearnModelWrapper(BaseModel):
    """Adapter that exposes a scikit-learn style estimator through BaseModel.

    Every method forwards to the wrapped estimator and returns whatever the
    estimator's own method returns.
    """

    def __init__(self, model):
        # The wrapped estimator; all calls below are delegated to it.
        self.model = model

    def train(self, X_train, y_train):
        """Fit the wrapped estimator and return the result of its ``fit``."""
        fit_result = self.model.fit(X_train, y_train)
        return fit_result

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        estimator = self.model
        return estimator.predict(X)

    def get_params(self):
        """Return the wrapped estimator's parameters."""
        estimator = self.model
        return estimator.get_params()

    def set_params(self, **params):
        """Forward ``params`` to the wrapped estimator's ``set_params``."""
        update_result = self.model.set_params(**params)
        return update_result

class LogisticRegressionModel(SklearnModelWrapper):
    """Logistic Regression implementation (lbfgs solver, up to 1000 iterations)."""

    def __init__(self):
        # `multi_class` is intentionally not passed: the parameter is deprecated
        # since scikit-learn 1.5 and triggers a FutureWarning on every fit.
        # With the lbfgs solver the default already fits a multinomial model
        # whenever there are three or more classes.
        # NOTE(review): for purely binary labels the default fits a plain binary
        # model rather than a two-class softmax - confirm that is acceptable.
        super().__init__(LogisticRegression(solver='lbfgs', max_iter=1000))

class SVMModel(SklearnModelWrapper):
    """Support vector classifier with a linear kernel and probability estimates enabled."""

    def __init__(self):
        # Build the estimator first, then hand it to the generic wrapper.
        estimator = SVC(kernel='linear', probability=True)
        super().__init__(estimator)

### Training Manager

In [7]:
from typing import Dict, Any, List
from dataclasses import dataclass
from sklearn.metrics import classification_report
import numpy as np
from scipy.sparse import issparse

@dataclass
class TrainingResult:
    """Container for the outcome of one train-and-evaluate run."""
    # Identifier the model was trained under.
    model_name: str
    # Output of classification_report(..., output_dict=True): per-label and
    # averaged entries are nested dicts, while "accuracy" is a plain float.
    metrics: Dict[str, Any]
    # Predictions the model produced for the test features.
    predictions: np.ndarray
    # Value returned by the model's get_params() after training.
    parameters: Dict[str, Any]

class TrainingManager:
    """Handles model training and evaluation.

    Each successful ``train_and_evaluate`` call appends its result to
    ``training_history``.
    """

    def __init__(self):
        # One entry per successful train_and_evaluate call, in call order.
        self.training_history: List[TrainingResult] = []

    @staticmethod
    def _count_samples(features) -> int:
        """Return the number of rows of a feature container.

        ``shape[0]`` is preferred because SciPy sparse matrices do not support
        ``len()``; plain sequences fall back to ``len()``.
        """
        if hasattr(features, "shape"):
            return features.shape[0]
        return len(features)

    def _validate_data_split(self, data_split):
        """Check that ``data_split`` is a complete, dimensionally consistent split.

        Args:
            data_split: Expected to be a dict with the keys ``X_train``,
                ``X_test``, ``y_train`` and ``y_test``.

        Raises:
            TypeError: If ``data_split`` is not a dictionary.
            ValueError: If a required key is missing, or a feature container and
                its label container disagree on the number of samples.
        """
        if not isinstance(data_split, dict):
            raise TypeError(f"data_split must be a dictionary, got {type(data_split)}")

        required_keys = ["X_train", "X_test", "y_train", "y_test"]

        # Check all required keys exist
        missing = [key for key in required_keys if key not in data_split]
        if missing:
            raise ValueError(f"Missing required keys in data_split: {missing}")

        # Check matching dimensions
        n_train_samples = self._count_samples(data_split["X_train"])
        n_test_samples = self._count_samples(data_split["X_test"])

        if n_train_samples != len(data_split["y_train"]):
            raise ValueError(f"Mismatch in training set dimensions: "
                           f"X_train has {n_train_samples} samples but y_train has {len(data_split['y_train'])} samples")

        if n_test_samples != len(data_split["y_test"]):
            raise ValueError(f"Mismatch in test set dimensions: "
                           f"X_test has {n_test_samples} samples but y_test has {len(data_split['y_test'])} samples")

    def _log_data_split(self, data_split) -> None:
        """Print the keys, value types and shapes/lengths of a data-split dict."""
        print("Data split keys:", data_split.keys())
        print("Data split types:", {k: type(v) for k, v in data_split.items()})
        print("Data split shapes:", {
            k: v.shape if hasattr(v, 'shape') else len(v) if hasattr(v, '__len__') else None
            for k, v in data_split.items()
        })

    def train_and_evaluate(self, model, model_name: str,
                          data_split) -> "TrainingResult":
        """
        Train a model and evaluate its performance

        Args:
            model: A model object with train, predict, and get_params methods
            model_name: String identifier for the model
            data_split: Dictionary containing X_train, X_test, y_train, y_test

        Returns:
            TrainingResult object containing metrics and predictions

        Raises:
            TypeError: If data_split is not a dictionary.
            ValueError: If data_split is incomplete or its dimensions mismatch.
            AttributeError: If model lacks train, predict or get_params.
        """
        # Only describe the split when it is a dict; any other input must reach
        # the validation below so the caller gets the intended TypeError rather
        # than an AttributeError from calling .keys() here.
        if isinstance(data_split, dict):
            self._log_data_split(data_split)

        try:
            # Validate the data split
            self._validate_data_split(data_split)

            # Verify model has required methods
            required_methods = ['train', 'predict', 'get_params']
            for method in required_methods:
                if not hasattr(model, method):
                    raise AttributeError(f"Model lacks required method: {method}")

            # Train the model
            print(f"Training {model_name}...")
            model.train(data_split["X_train"], data_split["y_train"])

            # Make predictions
            print(f"Making predictions for {model_name}...")
            predictions = model.predict(data_split["X_test"])

            # Calculate metrics
            print(f"Calculating metrics for {model_name}...")
            metrics = classification_report(
                data_split["y_test"],
                predictions,
                output_dict=True
            )

            # Create and store result
            result = TrainingResult(
                model_name=model_name,
                metrics=metrics,
                predictions=predictions,
                parameters=model.get_params()
            )
            self.training_history.append(result)

            print(f"Successfully completed training and evaluation for {model_name}")
            return result

        except Exception as e:
            # Report which model failed, then let the original error propagate.
            print(f"Error in train_and_evaluate for {model_name}: {str(e)}")
            raise

In [8]:
from scipy import sparse
import numpy as np
import os

class SentimentPipeline:
    """Coordinates all components of the sentiment analysis system.

    Loads the prepared train/test arrays from ``config.data_files_path`` and
    trains every model registered in ``self.models`` through a TrainingManager.
    """

    def __init__(self, config: "ModelDevelopmentConfig"):
        self.config = config
        self.data_files_path = self.config.data_files_path
        self.training_manager = TrainingManager()
        # Registry of models to train, keyed by the name used in the results.
        self.models = {
            'logistic_regression': LogisticRegressionModel()
        }

    @staticmethod
    def _load_feature_matrix(path):
        """Load a feature matrix from a ``.npy`` file and return it as CSR.

        Handles both storage layouts: a sparse matrix that was passed to
        ``np.save`` (it comes back as a 0-d object array wrapping the matrix)
        and a regular dense array.
        """
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # load files that were produced by a trusted source.
        loaded = np.load(path, allow_pickle=True)
        if loaded.dtype == object and loaded.ndim == 0:
            # Unwrap the single pickled object explicitly. The previous
            # `.all()` call only returned it by accident of object-dtype
            # reduction and collapsed genuine dense arrays to one boolean.
            loaded = loaded.item()
        return sparse.csr_matrix(loaded)

    def prepare_data(self):
        """
        Load the already-split training and test data from ``data_files_path``.

        Reads X_train.npy, X_test.npy, y_train.npy and y_test.npy. No splitting
        is performed here.

        Returns:
            tuple: (X_train, X_test, y_train, y_test). The feature matrices are
            SciPy CSR matrices; the labels are returned as loaded by ``np.load``.

        Raises:
            Exception: Any error raised while loading is printed and re-raised.
        """
        try:
            # Construct paths to the data files
            x_train_path = os.path.join(self.data_files_path, "X_train.npy")
            x_test_path = os.path.join(self.data_files_path, "X_test.npy")
            y_train_path = os.path.join(self.data_files_path, "y_train.npy")
            y_test_path = os.path.join(self.data_files_path, "y_test.npy")

            # Load the features as CSR matrices and the labels as stored
            X_train_sparse = self._load_feature_matrix(x_train_path)
            X_test_sparse = self._load_feature_matrix(x_test_path)
            y_train = np.load(y_train_path, allow_pickle=True)
            y_test = np.load(y_test_path, allow_pickle=True)

            print("Data successfully loaded.")
            return X_train_sparse, X_test_sparse, y_train, y_test
        except Exception as e:
            print(f"Error loading data: {e}")
            raise

    def train_models(self):
        """Train and evaluate all registered models.

        Returns:
            dict: Maps each model name to the result returned by the training
            manager's ``train_and_evaluate``.
        """
        results = {}
        x_train, x_test, y_train, y_test = self.prepare_data()
        data_split = {
            "X_train": x_train,
            "X_test": x_test,
            "y_train": y_train,
            "y_test": y_test
        }

        for name, model in self.models.items():
            results[name] = self.training_manager.train_and_evaluate(
                model, name, data_split
            )

        return results


In [9]:
# Build the model-development config from the YAML files, then train and
# evaluate every model registered in the pipeline.
# NOTE: `config` holds the ConfigurationManager itself, not a config object.
config = ConfigurationManager()
model_development_config = config.get_model_development_config()
pipeline = SentimentPipeline(config=model_development_config)
results = pipeline.train_models()
# Prints the full repr of each TrainingResult (metrics, predictions, parameters).
print(results)

[2025-01-12 06:37:33,151: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2025-01-12 06:37:33,153: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2025-01-12 06:37:33,154: INFO: file_utils: created directory at: artifacts]
[2025-01-12 06:37:33,157: INFO: file_utils: created directory at: artifacts/model_development]
Data successfully loaded.
Data split keys: dict_keys(['X_train', 'X_test', 'y_train', 'y_test'])
Data split types: {'X_train': <class 'scipy.sparse._csr.csr_matrix'>, 'X_test': <class 'scipy.sparse._csr.csr_matrix'>, 'y_train': <class 'numpy.ndarray'>, 'y_test': <class 'numpy.ndarray'>}
Data split shapes: {'X_train': (175435, 5000), 'X_test': (43859, 5000), 'y_train': (175435,), 'y_test': (43859,)}
Training logistic_regression...




Making predictions for logistic_regression...
Calculating metrics for logistic_regression...
Successfully completed training and evaluation for logistic_regression
{'logistic_regression': TrainingResult(model_name='logistic_regression', metrics={'0': {'precision': 0.8866193144218467, 'recall': 0.9321860939746741, 'f1-score': 0.9088319088319088, 'support': 21559.0}, '1': {'precision': 0.8566114933978055, 'recall': 0.8223531512229959, 'f1-score': 0.8391328110766989, 'support': 11202.0}, '2': {'precision': 0.7506227246598965, 'recall': 0.7059830600108128, 'f1-score': 0.7276188707280832, 'support': 11098.0}, 'accuracy': 0.846895734056864, 'macro avg': {'precision': 0.8312845108265162, 'recall': 0.8201741017361609, 'f1-score': 0.8251945302122303, 'support': 43859.0}, 'weighted avg': {'precision': 0.844542710662312, 'recall': 0.846895734056864, 'f1-score': 0.8451762944784097, 'support': 43859.0}}, predictions=array([0, 2, 1, ..., 2, 0, 2]), parameters={'C': 1.0, 'class_weight': None, 'dual':