In [1]:
%pwd

'c:\\Users\\44787\\Desktop\\projects\\end-to-end-SMS-Spam-classifier\\research'

In [2]:
import os

os.chdir('../')
%pwd

'c:\\Users\\44787\\Desktop\\projects\\end-to-end-SMS-Spam-classifier'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class ModelTrainingConfig:
    root_dir: Path
    df: Path

In [4]:
from src.SMSClassifier.constants import CONFIG_PATH
from src.SMSClassifier.utils.common import read_yaml, create_directories

In [5]:
class ConfiguratoinManager:
    def __init__(self, config=CONFIG_PATH):
        self.config = read_yaml(config)

    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_training
        create_directories([config.root_dir])

        model_training_config = ModelTrainingConfig(
            root_dir=config.root_dir,
            df=config.df,
        )
        return model_training_config

In [6]:
from src.SMSClassifier.logging import logger
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from src.SMSClassifier.utils.common import save_bin
import joblib
import pandas as pd

In [7]:
class ModelTraining:
    def __init__(self, config: ModelTrainingConfig) -> None:
        self.config = config

    def training_pipeline(self):
        df = pd.read_csv(self.config.df, encoding='latin-1')
        # print(df.head(2))

        # tfidf vectorizer
        tfv = TfidfVectorizer(max_features=2500)
        X = tfv.fit_transform(df['msg'].values.astype('U'))

        y = df.target

        # balance the dataset
        sampler = RandomOverSampler(random_state=42)
        X_sm, y_sm = sampler.fit_resample(X, y)

        # train test split
        X_train, X_test, y_train, y_test = train_test_split(
            X_sm, y_sm, test_size=0.20, random_state=42)

        # train the model
        model = MultinomialNB()
        model.fit(X_train, y_train)

        # model evaluation
        y_pred = model.predict(X_test)
        logger.info(confusion_matrix(y_test, y_pred))

        logger.info(classification_report(y_test, y_pred))

        # save the model
        # save_bin(model, os.path.join(self.config.root_dir, 'model.joblib'))
        joblib.dump(model, os.path.join(self.config.root_dir, 'model.joblib'))
        joblib.dump(model, os.path.join(self.config.root_dir, 'tfidfv.joblib'))
        

In [8]:
try:
    config = ConfiguratoinManager()
    model_training_config = config.get_model_training_config()
    model = ModelTraining(model_training_config)
    model.training_pipeline()
except Exception as e:
    raise e

[2024-01-31 16:46:48,558: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-31 16:46:48,561: INFO: common: created directory at: artifacts/model_training]
[2024-01-31 16:46:48,669: INFO: 2608674256: [[842  42]
 [ 38 885]]]
[2024-01-31 16:46:48,691: INFO: 2608674256:               precision    recall  f1-score   support

           0       0.96      0.95      0.95       884
           1       0.95      0.96      0.96       923

    accuracy                           0.96      1807
   macro avg       0.96      0.96      0.96      1807
weighted avg       0.96      0.96      0.96      1807
]
