In [13]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    params_epochs: int
    params_batch_size: int
    params_is_augmentation: bool
    params_image_size: list
    params_data_size: int
    trained_stopwords_path:Path
    trained_vectorizer_path:Path
    trained_stemmer_path: Path


In [14]:
from sentiment_analysis.constants import *
from sentiment_analysis.utils.common import read_yaml, create_directories

import tensorflow as tf


In [16]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

        

    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params
        # training_data = os.path.join(self.config.data_ingestion.unzip_dir, "Chest--Scan-data")
        training_data = os.path.join(self.config.data_ingestion.unzip_dir, "Reviews.csv")
        create_directories([
            Path(training.root_dir)
        ])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            trained_stopwords_path=Path(training.trained_stop_words_path),
            trained_vectorizer_path = Path(training.trained_vectorizer_path),
            trained_stemmer_path=Path(training.trained_stemmer_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(training_data),
            params_epochs=params.EPOCHS,
            params_batch_size=params.BATCH_SIZE,
            params_is_augmentation=params.AUGMENTATION,
            params_image_size=params.IMAGE_SIZE,
            params_data_size=params.DATA_SIZE

        )

        return training_config

In [17]:
import os
import urllib.request as request
from zipfile import ZipFile
import tensorflow as tf
import time

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
plt.style.use('ggplot')
nltk.download('stopwords')
print(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhisekh.agarwala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhisekh.agarwala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config

    
    def get_base_model(self):
        # self.model = tf.keras.models.load_model(
        #     self.config.updated_base_model_path
        # )
        self.model = LogisticRegression()

    def train_valid_generator(self):

        # datagenerator_kwargs = dict(
        #     rescale = 1./255,
        #     validation_split=0.20
        # )

        # dataflow_kwargs = dict(
        #     target_size=self.config.params_image_size[:-1],
        #     batch_size=self.config.params_batch_size,
        #     interpolation="bilinear"
        # )

        # valid_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(
        #     **datagenerator_kwargs
        # )

        # self.valid_generator = valid_datagenerator.flow_from_directory(
        #     directory=self.config.training_data,
        #     subset="validation",
        #     shuffle=False,
        #     **dataflow_kwargs
        # )

        # if self.config.params_is_augmentation:
        #     train_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(
        #         rotation_range=40,
        #         horizontal_flip=True,
        #         width_shift_range=0.2,
        #         height_shift_range=0.2,
        #         shear_range=0.2,
        #         zoom_range=0.2,
        #         **datagenerator_kwargs
        #     )
        # else:
        #     train_datagenerator = valid_datagenerator

        # self.train_generator = train_datagenerator.flow_from_directory(
        #     directory=self.config.training_data,
        #     subset="training",
        #     shuffle=True,
        #     **dataflow_kwargs
        # )
        pass

    def training_data_preparation(self):
        df = pd.read_csv(self.config.training_data)
        df = df[['Text', 'Score']]
        df = df.sample(n=self.config.params_data_size)
        df = df.loc[df['Score']!=3]
        df = df.loc[df['Score']!=4]
        def category(score):
            return 0 if score==1 or score==2 else 1
        df['Sentiment']= df['Score'].apply(category)
        self.stop_words = stopwords.words('english')
        self.stemmer = PorterStemmer()
        def text_preprocessing(text):
            lower_casing = text.lower()
            tokens = word_tokenize(lower_casing)
            tokens = [self.stemmer.stem(token) for token in tokens if token not in self.stop_words and token not in string.punctuation]
            return " ".join(tokens)
        
        df['Text'] = df['Text'].apply(text_preprocessing)
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2)

        self.vectorizer = TfidfVectorizer()

        self.X_train_vect = self.vectorizer.fit_transform(self.X_train)
        
        self.X_test_vect = self.vectorizer.transform(self.X_test)
    
    @staticmethod
    def save_model(path: Path, model: tf.keras.Model):
        # model.save(path)
        pass
    
    @staticmethod
    def save_artifacts(obj, path: Path):
        with open(path,'wb') as f:
            pickle.dump(obj,f)


    
    def train(self):

        # self.steps_per_epoch = self.train_generator.samples // self.train_generator.batch_size
        # self.validation_steps = self.valid_generator.samples // self.valid_generator.batch_size

        # self.model.fit(
        #     self.train_generator,
        #     epochs=self.config.params_epochs,
        #     steps_per_epoch=self.steps_per_epoch,
        #     validation_steps=self.validation_steps,
        #     validation_data=self.valid_generator
        # )

        # self.save_model(
        #     path=self.config.trained_model_path,
        #     model=self.model
        # )

        self.model.fit(self.X_train_vect, self.y_train)
        predictions = self.model.predict(self.X_test_vect)
        accuracy_score = accuracy_score(self.y_test, predictions)
        confusion_metrix = confusion_matrix(y_test, y_pred)
        logger.info(f"model accuracy {accuracy_score}")
        logger.infor(f"model confusion matrix{confusion_metrix}")
        self.save_artifacts(self.model, self.config.trained_model_path)
        self.save_artifacts(self.stop_words, self.config.trained_stopwords_path)
        self.save_artifacts(self.stemmer, self.config.trained_stemmer_path)
        self.save_artifacts(self.vectorizer, self.config.trained_vectorizer_path)




In [28]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.get_base_model()
    # training.train_valid_generator()
    training.training_data_preparation()
    training.train()
    
except Exception as e:
    raise e

2024-08-06 23:02:59,052: Sentiment-Analysis: INFO: common.py: read_yaml:- yaml file: config/config.yaml loaded successfully
2024-08-06 23:02:59,053: Sentiment-Analysis: INFO: common.py: read_yaml:- yaml file: params.yaml loaded successfully
2024-08-06 23:02:59,054: Sentiment-Analysis: INFO: common.py: create_directories:- created directory at: artifacts
2024-08-06 23:02:59,054: Sentiment-Analysis: INFO: common.py: create_directories:- created directory at: artifacts/training


AttributeError: 'LogisticRegression' object has no attribute 'accuracy_score'