In [1]:
import os
from pathlib import Path
from ensure import ensure_annotations

In [2]:
%pwd

'd:\\CDAC\\Machine Learning\\sentiment-analyzer\\notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\CDAC\\Machine Learning\\sentiment-analyzer'

In [14]:
from dataclasses import dataclass

In [15]:
@dataclass(frozen=True)
class ModelTrainingConfig:
    model_dir: Path
    train_data_file: Path
    test_data_file: Path
    # Model Parameters
    C: float
    max_iter: int
    n_jobs: int
    penalty: str
    solver: str
    class_weight: str

In [16]:
from sentimentAnalyzer.constant import *
from sentimentAnalyzer.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    def __init__(self, config_path = CONFIG_FILE_PATH, params_path = PARAMS_FILE_PATH):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_trainer
        parmas = self.params.model_params
        create_directories([config.model_dir])

        model_trianing_config = ModelTrainingConfig(
            model_dir = Path(config.model_dir),
            train_data_file = Path(config.train_data_file),
            test_data_file = Path(config.test_data_file),

            C = parmas.C,
            max_iter = parmas.max_iter,
            n_jobs = parmas.n_jobs,
            penalty = parmas.penalty,
            solver = parmas.solver,
            class_weight = parmas.class_weight
        )

        return model_trianing_config

In [18]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sentimentAnalyzer.utils.common import load_transformed_data_file, DataInfo
from sentimentAnalyzer.logging import logger

In [20]:
class ModelTraining:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config

    def train_model(self):
        X_train, y_train = load_transformed_data_file(path=self.config.train_data_file, 
                                               data_info=DataInfo.TRAINING)
        
        X_test, y_test = load_transformed_data_file(path=self.config.test_data_file,
                                                    data_info=DataInfo.TESTING)
        
        # Getting Model Parameters
        C = self.config.C
        max_iter = self.config.max_iter
        n_jobs = self.config.n_jobs
        penalty = self.config.penalty
        solver = self.config.solver
        class_weight = self.config.class_weight


        lrmodel = LogisticRegression(C = C, 
                                     max_iter=max_iter, 
                                     n_jobs=n_jobs, 
                                     penalty=penalty, 
                                     solver=solver, 
                                     class_weight=class_weight)
        # Model Training
        lrmodel.fit(X_train, y_train)
        logger.info("Model Training Completed")
        # Classification Report for now
        y_pred = lrmodel.predict(X_test)
        print(f">>>>>>> Classifiaction Report <<<<<<< \n{classification_report(y_test, y_pred)}")

        # Saving the Model
        if self.config.model_dir.exists():
            joblib.dump(lrmodel, f"{self.config.model_dir}/trained_model.pkl")
            logger.info(f'Trained Model is Saved in {self.config.model_dir}')
        else:
            logger.info("Directory not found and model not Saved")
            raise FileNotFoundError("Directory is Not Found")

In [21]:
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training = ModelTraining(config=model_training_config)
    model_training.train_model()
except Exception as e:
    raise e

[2025-01-04 19:13:43,653: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-01-04 19:13:43,654: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-04 19:13:43,654: INFO: common: created directory at: artifacts]
[2025-01-04 19:13:43,654: INFO: common: created directory at: artifacts/train_model]
[2025-01-04 19:13:45,121: INFO: common: Input Transformed Training Data has been load from artifacts\data_transformation\train\X_train.npz]
[2025-01-04 19:13:45,148: INFO: common: Output Transformed Training Data is load from artifacts\data_transformation\train\y_train.npy]
[2025-01-04 19:13:45,693: INFO: common: Input Transformed Testing Data has been load from artifacts\data_transformation\test\X_test.npz]
[2025-01-04 19:13:45,721: INFO: common: Output Transformed Testing Data is load from artifacts\data_transformation\test\y_test.npy]
[2025-01-04 19:14:37,271: INFO: 174323616: Model Training Completed]
>>>>>>> Classifiaction Report <<<<<<< 
              p

In [39]:
import os
from enum import Enum
from box.exceptions import BoxValueError
import yaml
from sentimentAnalyzer.logging import logger
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any, Union
import pandas as pd
import numpy as np
from scipy.sparse import save_npz, load_npz, csr_matrix
from sentimentAnalyzer.utils.common import DataInfo

In [40]:
@ensure_annotations
def load_transformed_data_file(path: Path, data_info: DataInfo) -> tuple:
    """
    Load transformed input and output data from files in a directory.

    Args:
        path (Path): Path to the directory containing transformed data files.
        data_info (DataInfo): Metadata about whether this data is for training or testing.

    Returns:
        dict: A dictionary with keys "input" (csr_matrix) and "output" (np.ndarray).
              Example:
              {
                  "input": <csr_matrix>,
                  "output": <ndarray>
              }

    Raises:
        FileNotFoundError: If the directory does not contain any valid files.
        ValueError: If a file has an unsupported extension.

    Notes:
        - The function expects at least one `.npz` file (for input data) and one `.npy` file
          (for output data) in the provided directory.
        - If the directory is empty, an exception is raised.
        - Unsupported file extensions will result in a ValueError.
    """

    path_file_lst = list(path.glob("*"))
    result_dic = dict()

    if not path_file_lst:
        logger.info(f"{path_file_lst} does not exist")
        raise FileNotFoundError(f"File not found: {path_file_lst}")
    
    for file_path in path_file_lst:
        
        if file_path.suffix == ".npz":
            data = load_npz(file_path)
            result_dic["input"] = data
            logger.info(f"Input {data_info.value} has been load from {file_path}")
        elif file_path.suffix == ".npy":
            data = np.load(file_path)
            result_dic['output'] = data
            logger.info(f"Output {data_info.value} is load from {file_path}")
        else:
            logger.info(f"{file_path} file extension not supported")
            raise ValueError(f"Unsupported file extension: {file_path.suffix}")
    
    return result_dic.get('input'), result_dic.get('output')

In [42]:
a, b = load_transformed_data_file(Path("artifacts/data_transformation/train"), data_info=DataInfo.TRAINING)
c, d = load_transformed_data_file(Path("artifacts/data_transformation/test"), data_info=DataInfo.TESTING)


[2025-01-04 16:52:18,836: INFO: 2216929817: Input Transformed Training Data has been load from artifacts\data_transformation\train\X_train.npz]
[2025-01-04 16:52:18,836: INFO: 2216929817: Output Transformed Training Data is load from artifacts\data_transformation\train\y_train.npy]
[2025-01-04 16:52:19,074: INFO: 2216929817: Input Transformed Testing Data has been load from artifacts\data_transformation\test\X_test.npz]
[2025-01-04 16:52:19,076: INFO: 2216929817: Output Transformed Testing Data is load from artifacts\data_transformation\test\y_test.npy]


In [44]:
type(a)

scipy.sparse._csr.csr_matrix

In [45]:
type(b)

numpy.ndarray

In [55]:
X_train = a.get('input')
y_train = a.get('output')
X_test = b.get('input')
y_test = b.get('output')

In [56]:
def model_evaluation(model):
    y_pred = model.predict(X_test)
    print(f">>>>>>> Classifiaction Report <<<<<<< \n{classification_report(y_test, y_pred)}")
    cf_matrix = confusion_matrix(y_test, y_pred)
    print(f">>>>>>> Confusion Matrix <<<<<< \n {cf_matrix}")

In [57]:
lrmodel = LogisticRegression(C = 2, max_iter=1000, n_jobs=-1, penalty='l2', solver='saga', class_weight='balanced')
lrmodel.fit(X_train, y_train)
model_evaluation(lrmodel)

              precision    recall  f1-score   support

           0       0.80      0.78      0.79    159494
           1       0.79      0.81      0.80    160506

    accuracy                           0.80    320000
   macro avg       0.80      0.80      0.80    320000
weighted avg       0.80      0.80      0.80    320000

[[124162  35332]
 [ 30129 130377]]


In [51]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12996765 stored elements and shape (1280000, 300000)>

In [52]:
y_train

array([1, 1, 1, ..., 0, 0, 0], shape=(1280000,))

In [53]:
def model_evaluation(model):

    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    cf_matrix = confusion_matrix(y_test, y_pred)

    print(cf_matrix)

In [None]:
file_paths = list(path.glob("*"))

[WindowsPath('artifacts/data_transformation/train/X_train.npz'),
 WindowsPath('artifacts/data_transformation/train/y_train.npy')]

In [43]:
if file_paths:
    print(file_paths)
    raise FileNotFoundError(f"No files found in directory: {file_paths}")

[WindowsPath('artifacts/data_transformation/train/X_train.npz'), WindowsPath('artifacts/data_transformation/train/y_train.npy')]


FileNotFoundError: No files found in directory: [WindowsPath('artifacts/data_transformation/train/X_train.npz'), WindowsPath('artifacts/data_transformation/train/y_train.npy')]

In [45]:
a = load_transformed_data_file(Path("artifacts/data_transformation/train"), data_info=DataInfo.TRAINING)

AttributeError: 'list' object has no attribute 'exists'

In [None]:
print(a)

In [None]:
class DataIngestion:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config

    def train_model(self):
        pass