These are the final models we will be using in our algorithm.

In [7]:
import re
import spacy
from spacy.training.example import Example
from spacy.tokens import DocBin
from tqdm import tqdm
import json
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import io
import numpy as np

`preprocess` is used to normalize the input and remove any undesirable characters.

In [2]:
def preprocess(text):
    """Normalize raw text before it is fed to the models.

    Steps, in order:
      1. drop every character that is not an ASCII letter, a digit or
         whitespace;
      2. collapse each run of whitespace into a single space;
      3. lowercase the result.

    Special characters are removed *before* whitespace is collapsed. Doing it
    the other way round leaves a double space wherever a stand-alone symbol
    sat between two words (e.g. "good , food" -> "good  food").

    Leading/trailing whitespace is reduced to a single space, not stripped.

    Args:
        text (str): Raw input text.

    Returns:
        str: The normalized text.
    """
    # Remove special characters (anything outside [a-zA-Z0-9] and whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove extra spaces, including the gaps left by the removal above
    text = re.sub(r'\s+', ' ', text)
    # Convert text to lowercase
    return text.lower()

NER Module: 

In [3]:
def train_and_load_ner_model(training_data_path="training_NER.json", model_output_path="model-best"):
    """Convert JSON annotations to spaCy's binary format, train an NER pipeline and load it.

    The JSON file must hold an "annotations" list whose items unpack as
    ``(text, {"entities": [(start, end, label), ...]})``; that is how the
    loop below reads it. Entities whose character offsets cannot be aligned
    to token boundaries are skipped.

    Training runs ``spacy train config.cfg`` in a subprocess, so ``config.cfg``
    must exist in the current working directory.

    Args:
        training_data_path (str): Path to the JSON annotation file.
        model_output_path (str): Directory of the trained pipeline to load.
            Training always writes under the current directory
            (``--output ./``), so this should name a pipeline directory
            created there (e.g. the ``model-best`` directory that
            ``spacy train`` produces).

    Returns:
        The spaCy pipeline loaded from ``model_output_path``.

    Raises:
        RuntimeError: If the ``spacy train`` subprocess exits with a non-zero
            status. Raising here prevents silently loading a stale model left
            over from a previous run.
    """
    import subprocess  # local import: only this function needs it

    spacy_data_path = "./training_data.spacy"

    # Blank English pipeline, used only for tokenization while building the corpus
    nlp = spacy.blank("en")
    db = DocBin()

    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(training_data_path, encoding="utf-8") as f:
        train_data = json.load(f)

    for sample_text, annot in tqdm(train_data['annotations']):
        doc = nlp.make_doc(sample_text)

        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # char_span returns None when the offsets cannot be mapped to tokens
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

    db.to_disk(spacy_data_path)

    # NOTE(review): the dev set is the training set itself, so the scores
    # reported during training are not a held-out evaluation.
    train_cmd = [
        sys.executable, "-u", "-m", "spacy", "train", "config.cfg",
        "--output", "./",
        "--paths.train", spacy_data_path,
        "--paths.dev", spacy_data_path,
    ]
    # sys.executable keeps training in the same environment as the kernel
    # (a bare `python` may resolve to another interpreter). "-u" disables
    # output buffering so progress is echoed as it happens.
    with subprocess.Popen(
        train_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        errors="replace",
    ) as proc:
        for line in proc.stdout:
            print(line, end="")

    # Leaving the `with` block waits for the process, so returncode is set here.
    if proc.returncode != 0:
        raise RuntimeError(
            "spacy train failed with exit code {}".format(proc.returncode)
        )

    return spacy.load(model_output_path)


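# Example usage:
# Train the model and load the resulting pipeline in one call
#ner_model = train_and_load_ner_model("your_training_data.json", "model-best")

# Extract the entities of a text with the trained model
#entities = extract_entities_as_dict("your input text", ner_model)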

def extract_entities_as_dict(input_text, nlp_model):
    """Run ``nlp_model`` on ``input_text`` and map each entity's text to its label.

    When the same entity text occurs more than once, the label of the last
    occurrence is the one kept.

    Args:
        input_text: Text to analyse.
        nlp_model: Callable returning an object whose ``ents`` items expose
            ``text`` and ``label_`` (e.g. a loaded spaCy pipeline).

    Returns:
        dict: ``{entity text: entity label}``.
    """
    parsed = nlp_model(input_text)
    return {entity.text: entity.label_ for entity in parsed.ents}

Sentiment Analysis Module

In [9]:
def train_sentiment_model(data_path="Restaurant_Reviews.csv"):
    """Train a BERT + LSTM binary sentiment classifier on a review CSV.

    The CSV is read as ISO-8859-1 and must contain a 'Review' column (text)
    and a 'Liked' column (used as the binary target). Reviews are normalized
    with ``preprocess`` and split 80/20 into train and test sets with a fixed
    ``random_state``. The model is trained for 10 epochs and then evaluated
    on the test split; Keras prints the evaluation metrics.

    Both TF Hub layers are downloaded from tfhub.dev on first use.

    Args:
        data_path (str): Path to the review CSV file.

    Returns:
        tf.keras.Model: The trained model. It takes a batch of raw strings
        and outputs one sigmoid score per input.
    """
    reviews_df = pd.read_csv(data_path, encoding='ISO-8859-1')
    reviews_df['Review'] = reviews_df['Review'].apply(preprocess)

    X_train, X_test, y_train, y_test = train_test_split(
        reviews_df['Review'], reviews_df['Liked'], test_size=0.20, random_state=42
    )

    bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

    # Define the model: raw string -> BERT preprocessing -> BERT encoder
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = bert_preprocess(text_input)
    outputs = bert_encoder(preprocessed_text)

    # The LSTM needs a (batch, timesteps, features) input, so the pooled output
    # is reshaped into a sequence. Per the TF Hub model docs pooled_output is
    # one vector per input, which makes this a single-timestep sequence.
    pooled = outputs['pooled_output']
    reshaped_output = tf.keras.layers.Reshape((-1, pooled.shape[-1]))(pooled)
    lstm_output = tf.keras.layers.LSTM(64, name='lstm')(reshaped_output)
    dropout = tf.keras.layers.Dropout(0.1, name="dropout")(lstm_output)
    output = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(dropout)

    model = tf.keras.Model(inputs=[text_input], outputs=[output])

    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='binary_crossentropy',
                  metrics=metrics)

    model.fit(X_train, y_train, epochs=10)
    # Held-out evaluation. The previous extra predict() pass over X_test was
    # removed because its thresholded result was never used or returned.
    model.evaluate(X_test, y_test)

    return model




In [12]:
def predict_sentiment(reviews, trained_model):
    """Predict a binary sentiment label for each review.

    Anything written to ``sys.stdout`` while ``trained_model.predict`` runs
    is discarded. The original stream is restored even if ``predict`` raises,
    so a failed prediction cannot leave stdout silenced.

    Args:
        reviews: Batch of reviews, passed unchanged to ``trained_model.predict``.
        trained_model: Object exposing ``predict(reviews)`` that returns
            numeric scores (e.g. the model built by ``train_sentiment_model``).

    Returns:
        numpy.ndarray: Array of the same shape as the scores, holding 0 where
        the score is <= 0.5 and 1 otherwise.
    """
    # Suppress output during prediction
    original_stdout = sys.stdout
    sys.stdout = io.StringIO()
    try:
        predictions = trained_model.predict(reviews)
    finally:
        # Always reset standard output, including on error
        sys.stdout = original_stdout

    # asarray lets list-returning predictors work with the comparison below
    return np.where(np.asarray(predictions) <= 0.5, 0, 1)
