In [2]:
%pwd

'c:\\Users\\akish.pothuri\\python\\SentimentAnalysis\\SentimentAPI\\research'

In [3]:
import os
os.chdir("../")
%pwd

'c:\\Users\\akish.pothuri\\python\\SentimentAnalysis\\SentimentAPI'

In [4]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import pad_sequences

In [11]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    vocab_size: int
    oov_tok: str
    embedding_dim: int
    max_length: int # choose based on statistics, for example 150 to 200
    padding_type: str
    trunc_type: str
    units: int #units: The number of hidden units in the layer.
    hidden_dense: str
    last_dense: str
    loss: str
    optimizer: str
    metrics: str
    num_epochs: int
    verbose: int
    validation_split: int
    last_layer: int
    dense_layers: int

In [12]:
from Sentiment.constants import *
from Sentiment.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            model_ckpt = config.model_ckpt,
            vocab_size = params.vocab_size,
            oov_tok = params.oov_tok,
            embedding_dim = params.embedding_dim,
            max_length = params.max_length, # choose based on statistics, for example 150 to 200
            padding_type =  params.padding_type,
            trunc_type = params.trunc_type,
            units = params.units, #units: The number of hidden units in the layer.
            hidden_dense = params.hidden_dense,
            last_dense = params.last_dense,
            loss = params.loss,
            optimizer = params.optimizer,
            metrics = params.metrics,
            num_epochs = params.num_epochs,
            verbose = params.verbose,
            validation_split = params.validation_split,
            dense_layers = params.dense_layers,
            last_layer = params.last_layer
        )

        return model_trainer_config

In [14]:
from sklearn.model_selection import train_test_split
import keras
from Sentiment import logger

In [15]:
# data = pd.read_excel('artifacts\data_transformation\sentimentDataset\TransformedData.xlsx')
# data.columns

In [16]:
# data['sentiment'].isnull().sum()

In [17]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
    
    def loadData(self):
        sentiData = pd.read_excel(self.config.data_path)
        sentiData.drop('Unnamed: 0',axis=1,inplace=True)
        sentiData.dropna(how="any", inplace=True)
        return sentiData
    
    def convertToLower(self,data):
        '''
        Converting data to the lowercase
        '''
        print(data.head())
        data.loc[:, 'Review'] = data.loc[:, 'Review'].str.lower()
        logger.info(f"Data converted to lowercase")
        return data

    def splitData(self,sentiData):
        '''
        Splitting the data into train and test.
        '''
        trainSentences, testSentences, trainLabels, testLabels = train_test_split(sentiData['Review'],sentiData['sentiment'] , stratify = sentiData['sentiment'])
        return trainSentences, testSentences, trainLabels, testLabels
    
    def TextToNumeric(self,trainSentences,testSentences):
        '''
        Converting the text to numeric
        '''
        tokenizer = Tokenizer(num_words = self.config.vocab_size, oov_token=self.config.oov_tok)
        tokenizer.fit_on_texts(trainSentences)
        word_index = tokenizer.word_index
        # convert train dataset to sequence and pad sequences
        trainSequences = tokenizer.texts_to_sequences(trainSentences)
        trainPadded = pad_sequences(trainSequences, padding=self.config.padding_type, maxlen=self.config.max_length)
        # convert Test dataset to sequence and pad sequences
        testSequences = tokenizer.texts_to_sequences(testSentences)
        testPadded = pad_sequences(testSequences, padding=self.config.padding_type, maxlen=self.config.max_length)
        return trainPadded,testPadded
    
    def prepareModel(self):
        '''
        model preparation
        '''
        # model initialization
        model = keras.Sequential([
            keras.layers.Embedding(self.config.vocab_size, self.config.embedding_dim, input_length=self.config.max_length),
            keras.layers.Bidirectional(keras.layers.LSTM(self.config.units)),
            keras.layers.Dense(self.config.dense_layers, activation=self.config.hidden_dense),
            keras.layers.Dense(self.config.last_layer, activation=self.config.last_dense)
        ])
        # compile model
        model.compile(loss=self.config.loss,
                    optimizer=self.config.optimizer,
                    metrics=[self.config.metrics])
        logger.info(model.summary())
        return model
    
    def trainModel(self,model,trainPadded,trainLabels):
            model.fit(trainPadded, trainLabels,epochs=self.config.num_epochs, verbose=self.config.verbose,validation_split=self.config.validation_split)
            return model
    
    def saveModel(self,model) -> bool:
         model.save(os.path.join(self.config.root_dir,"LSTMModel"+"V1"+".h5"))
         model.save(os.path.join(self.config.model_ckpt,"LSTMModel"+"V1"+".h5"))
         return True

In [18]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    logger.info(f"data loading Started")
    sentiData = model_trainer_config.loadData()
    logger.info(f"data loading Completed")
    logger.info(f"data Lowercase convertion Started")
    sentiData = model_trainer_config.convertToLower(sentiData)
    logger.info(f"data Lowercase convertion Completed")
    logger.info(f"data Splitting Started")
    trainSentences, testSentences, trainLabels, testLabels = model_trainer_config.splitData(sentiData)
    logger.info(f"data Splitting Completed")
    logger.info(f"data TextToNumeric Started")
    trainPadded,testPadded = model_trainer_config.TextToNumeric(trainSentences,testSentences)
    logger.info(f"data TextToNumeric Completed")
    logger.info(f"data prepareModel Started")
    modelArchitectute = model_trainer_config.prepareModel()
    logger.info(f"data prepareModel Completed")
    logger.info(f"data Model Training Started")
    trainedModel = model_trainer_config.trainModel(modelArchitectute,trainPadded,trainLabels)
    logger.info(f"data Model Training Completed")
    model_trainer_config.saveModel(trainedModel)
    logger.info(f"model saved successfully")
except Exception as e:
    raise e

[2024-01-04 00:48:55,933: INFO: common: yaml file: {yaml_file}]
[2024-01-04 00:48:55,938: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-04 00:48:55,939: INFO: common: yaml file: {yaml_file}]
[2024-01-04 00:48:55,944: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-04 00:48:55,946: INFO: common: created directory at: artifacts]
[2024-01-04 00:48:55,948: INFO: common: created directory at: artifacts/model_trainer]
[2024-01-04 00:48:55,949: INFO: 113507594: data loading Started]
[2024-01-04 00:49:13,899: INFO: 113507594: data loading Completed]
[2024-01-04 00:49:13,900: INFO: 113507594: data Lowercase convertion Started]
                                              Review  sentiment
0  Food ok. They good Italian sandwiches. Takes l...          1
1  My husband I tried Le Peep twice now. The firs...          1
2  This place decent coffee. It's right next movi...          1
3  Service great displeased meal. The long line l...          0
4  I've

In [None]:
import os
import json
import gzip
import nltk
import pandas as pd
from urllib.request import urlopen
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import keras
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import tensorflow as tf
from tensorflow import keras as ks
import time

In [None]:
def Tokens(plaintext):
        text = []
        # print(plaintext)
        text.append(plaintext)
        # print(text)
        vocab_size = 3000 # choose based on statistics
        oov_tok = ''
        max_length = 200
        # text = ["This is good i have ever ate"]
        tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
        tokenizer.fit_on_texts(text)
        sequences = tokenizer.texts_to_sequences(text)
        # print("sequences",sequences)
        # pad the sequence
        padded = pad_sequences(sequences, padding='post', maxlen=max_length)
        # print(padded)
        return padded