In [1]:
import pandas as pd
import codecs
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
from string import punctuation
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import speech_recognition as sr
import os
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from PIL import Image
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import easyocr

torch.backends.cudnn.deterministic = True

  torch.utils._pytree._register_pytree_node(


## Text Data

In [2]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [3]:
def pre_process(df):
    all_reviews=list()
    for text in df:
        text = text.lower()
        text = "".join([ch for ch in text if ch not in punctuation])
        text = " ".join([c for c in text.split() if c not in stop_words and c.isalpha()])
        all_reviews.append(text)
    all_text = " ".join(all_reviews)
    all_words = all_text.split()
    return all_reviews, all_words

In [4]:
def create_vocab(words):
    from collections import Counter 
    # Count all the words using Counter Method
    count_words = Counter(words)
    total_words=len(words)
    sorted_words=count_words.most_common(total_words)
    print("Top ten occuring words : ",sorted_words[:10])

    vocab_to_int={w:i+1 for i,(w,c) in enumerate(sorted_words)}
    print(len(vocab_to_int))

    return vocab_to_int

In [5]:
def encode_features(reviews, vocab):
    encoded_reviews=list()
    for review in reviews:
      encoded_review=list()
      for word in review.split():
        if word not in vocab.keys():
          #if word is not available in vocab_to_int put 0 in that place
          encoded_review.append(0)
        else:
          encoded_review.append(vocab[word])
      encoded_reviews.append(encoded_review)
    return encoded_reviews

In [6]:
def pad_features(encoded_reviews):
    sequence_length=150
    features=np.zeros((len(encoded_reviews), sequence_length), dtype=int)
    for i, review in enumerate(encoded_reviews):
        review_len=len(review)
        if (review_len<=sequence_length):
            zeros=list(np.zeros(sequence_length-review_len))
            new=zeros+review
        else:
            new=review[:sequence_length]
        features[i,:]=np.array(new)
    return features

In [7]:
with codecs.open('/Users/aakaash_kb/Developer/Final Project/Dataset/yelp_review_full_csv/train.csv', 'r', encoding='ISO-8859-1') as f:
    df_train = pd.read_csv(f)

df_train.columns = ["Label", "Text"]
df_train = df_train.loc[df_train.Label.isin([1, 3, 5])]
df = df_train.Text.tolist()

reviews, words = pre_process(df)
vocab = create_vocab(words)
vocab = dict(list(vocab.items())[:20000])
with codecs.open('/Users/aakaash_kb/Developer/Final Project/demo/reviews.csv', 'r', encoding='ISO-8859-1') as f:
    df_test = pd.read_csv(f)

df_test = df_test["reviews"][:50].tolist()

test_encode = encode_features(df_test, vocab)
test_features = pad_features(test_encode)

Top ten occuring words :  [('place', 218664), ('food', 214831), ('good', 202116), ('like', 177057), ('get', 161923), ('one', 157546), ('would', 146896), ('time', 144596), ('service', 139934), ('great', 134896)]
517775


In [8]:
class LSTM0(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers):    
        super().__init__()
        self.output_size=output_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim
        
        #Embedding and LSTM layers
        self.embedding=nn.Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        
        #Linear and sigmoid layer
        self.fc1=nn.Linear(hidden_dim, output_size)
        
        
    def forward(self, x, hidden):
        batch_size=x.size()
        
        #Embedding and LSTM output
        x = x.long()
        embedd=self.embedding(x)
        lstm_out, hidden=self.lstm(embedd, hidden)
        
        #stack up the lstm output
        lstm_out=lstm_out.contiguous().view(-1, self.hidden_dim)
        
        #dropout and fully connected layers
        out=self.fc1(lstm_out)
        soft_out = out.reshape(list(batch_size)+[3])
        soft_out = soft_out[:, -1, :]
        return soft_out, hidden
    
    def init_hidden(self, batch_size, device):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        
        return hidden

In [9]:
torch.manual_seed(42)
vocab_size = 20000+1 # +1 for the 0 padding
output_size = 3
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = LSTM0(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(model)

LSTM0(
  (embedding): Embedding(20001, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=256, out_features=3, bias=True)
)


In [10]:
model.load_state_dict(torch.load("/Users/aakaash_kb/Developer/Final Project/demo/updated_model/lstm_04.pth"))

<All keys matched successfully>

In [11]:
inp_features = torch.FloatTensor(test_features)
model.eval()
device="cpu"
batch_size = 50
h = model.init_hidden(batch_size, device)
sm = torch.nn.Softmax(dim=1)
with torch.inference_mode():
    h = tuple([each.data for each in h])
    pred, h = model(inp_features, h)
    res = sm(pred).argmax(dim=1).numpy()
    print(res)

[2 0 2 0 0 0 2 2 2 0 2 0 2 2 2 2 2 2 1 2 0 0 2 2 2 0 2 2 2 1 2 2 2 2 1 1 2
 2 2 2 0 2 2 0 2 2 2 1 1 2]


In [12]:
text_op = {"Negative Response":0,"Neutral Response":0,"Positive Response":0}
for i in list(res):
    if i == 0:
        text_op["Negative Response"] += 1
    elif i == 1:
        text_op["Neutral Response"] += 1
    else:
        text_op["Positive Response"] += 1
print(text_op)

{'Negative Response': 11, 'Neutral Response': 6, 'Positive Response': 33}


## Audio Data

In [13]:
def convert_audio_to_text(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)  # Read the entire audio file
    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")


In [14]:
# Path to the directory containing audio files
directory_path = "/Users/aakaash_kb/Developer/Final Project/demo/audio data"

audio_semantics = []

audio_extensions = [".mp3", ".wav", ".ogg", ".flac"]  
for filename in os.listdir(directory_path):
    if any(filename.endswith(ext) for ext in audio_extensions):
        full_path = os.path.join(directory_path, filename)
        print(full_path)
        semantic = convert_audio_to_text(full_path)
        audio_semantics.append(semantic)

print(audio_semantics)

/Users/aakaash_kb/Developer/Final Project/demo/audio data/16.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/17.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/15.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/29.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/sgdfhasgfdhgasf.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/28.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/14.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/1714922670028ofscl0o4-voicemaker.in-speech.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/17149228163560yuy9pej-voicemaker.in-speech.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/10.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/11.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/1714922693198r57hc04-voicemaker.in-speech.wav
/Users/aakaash_kb/Developer/Final Project/demo/audio data/13.wav
/Users/aakaash_kb/Devel

In [15]:
len(audio_semantics)

50

In [16]:
audio_encode = encode_features(audio_semantics, vocab)
audio_features = pad_features(audio_encode)

In [17]:
inp_features = torch.FloatTensor(audio_features)
model.eval()
device="cpu"
batch_size = 50
h = model.init_hidden(batch_size, device)
sm = torch.nn.Softmax(dim=1)
with torch.inference_mode():
    h = tuple([each.data for each in h])
    pred, h = model(inp_features, h)
    ares = sm(pred).argmax(dim=1).numpy()
    print(ares)


[2 1 2 0 0 1 1 1 2 1 1 0 1 0 2 1 2 2 2 2 0 1 2 0 0 2 1 0 1 0 0 1 2 1 1 1 2
 2 2 1 1 2 2 2 1 2 2 0 2 2]


In [18]:
audio_op = {"Negative Response":0,"Neutral Response":0,"Positive Response":0}
for i in list(ares):
    if i == 0:
        audio_op["Negative Response"] += 1
    elif i == 1:
        audio_op["Neutral Response"] += 1
    else:
        audio_op["Positive Response"] += 1
print(audio_op)

{'Negative Response': 11, 'Neutral Response': 18, 'Positive Response': 21}


## Image

In [20]:
model_dir = "/Users/aakaash_kb/Developer/Final Project/caption_model"
caption_model = VisionEncoderDecoderModel.from_pretrained(model_dir)
cfeature_extractor = ViTFeatureExtractor.from_pretrained(model_dir)
ctokenizer = AutoTokenizer.from_pretrained(model_dir)
device = torch.device("mps" if torch.cuda.is_available() else "cpu")
caption_model.to(device)
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}


  torch.utils._pytree._register_pytree_node(


In [21]:
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = cfeature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = caption_model.generate(pixel_values, **gen_kwargs)

  preds = ctokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds

In [22]:
directory_path = "/Users/aakaash_kb/Developer/Final Project/demo/image data"

img_paths = []
ocrs = []
img_extensions = [".jpeg", ".jpg", ".png"]  
for filename in os.listdir(directory_path):
    if any(filename.endswith(ext) for ext in img_extensions):
        full_path = os.path.join(directory_path, filename)
        reader = easyocr.Reader(['en'])
        result = reader.readtext(full_path)
        if len(result) != 0:
            ocrs.append(result[0][1])
        else:
            ocrs.append("")
        img_paths.append(full_path)
captions = predict_step(img_paths)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [23]:
image_semantics = []
for i in range(len(captions)):
    iop = captions[i] + ". " + ocrs[i]
    image_semantics.append(iop)

In [24]:
image_encode = encode_features(image_semantics, vocab)
image_features = pad_features(image_encode)

In [25]:
inp_features = torch.FloatTensor(image_features)
model.eval()
device="cpu"
batch_size = 50
h = model.init_hidden(batch_size, device)
sm = torch.nn.Softmax(dim=1)
with torch.inference_mode():
    h = tuple([each.data for each in h])
    pred, h = model(inp_features, h)
    ires = sm(pred).argmax(dim=1).numpy()
    print(ires)


[2 0 2 1 1 2 2 2 1 0 2 2 2 2 0 0 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 2 2 2 2 0
 2 2 1 0 2 0 2 0 1 0 2 1 1]


In [26]:
image_op = {"Negative Response":0,"Neutral Response":0,"Positive Response":0}
for i in list(ires):
    if i == 0:
        image_op["Negative Response"] += 1
    elif i == 1:
        image_op["Neutral Response"] += 1
    else:
        image_op["Positive Response"] += 1
print(image_op)

{'Negative Response': 20, 'Neutral Response': 7, 'Positive Response': 23}


## Ensembled Final Result

In [27]:
def normalize_dict(dictionary):
    total = sum(dictionary.values())
    normalized_dict = {key: value / total for key, value in dictionary.items()}
    return normalized_dict


In [28]:
def ensemble_dicts(text_op, audio_op, image_op, ratios):
    normalized_dict_text = normalize_dict(text_op)
    normalized_dict_audio = normalize_dict(audio_op)
    normalized_dict_image = normalize_dict(image_op)
    
    weighted_dict_text = {key: normalized_dict_text[key] * ratios[0] for key in normalized_dict_text}
    weighted_dict_audio = {key: normalized_dict_audio[key] * ratios[1] for key in normalized_dict_audio}
    weighted_dict_image = {key: normalized_dict_image[key] * ratios[2] for key in normalized_dict_image}
    
    ensemble_output = {}
    for key in weighted_dict_text:
        ensemble_output[key] = weighted_dict_text[key] + weighted_dict_audio[key] + weighted_dict_image[key]
    
    total_sum = sum(ensemble_output.values())
    ensemble_output_percentage = {key: round((value / total_sum) * 100, 2) for key, value in ensemble_output.items()}
    
    return ensemble_output_percentage

ratios = [3, 1, 2]

ensemble_op = ensemble_dicts(text_op, audio_op, image_op, ratios)


In [29]:
def repr(d):
    for i in d:
        print(i+" : "+str(d[i])+"%", end="  |  ")
    print()

In [30]:
print("-"*90)
print("\nTextual Data Summary:")
repr(text_op)
print("-"*90)
print("\nAudio Data Summary:")
repr(audio_op)
print("-"*90)
print("\nImage Data Summary:")
repr(image_op)
print("-"*90)
print("\nFinal Report:")
repr(ensemble_op)
print("-"*90)


------------------------------------------------------------------------------------------

Textual Data Summary:
Negative Response : 11%  |  Neutral Response : 6%  |  Positive Response : 33%  |  
------------------------------------------------------------------------------------------

Audio Data Summary:
Negative Response : 11%  |  Neutral Response : 18%  |  Positive Response : 21%  |  
------------------------------------------------------------------------------------------

Image Data Summary:
Negative Response : 20%  |  Neutral Response : 7%  |  Positive Response : 23%  |  
------------------------------------------------------------------------------------------

Final Report:
Negative Response : 28.0%  |  Neutral Response : 16.67%  |  Positive Response : 55.33%  |  
------------------------------------------------------------------------------------------
