In [47]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np

import re, string

# Set GPU memory growth
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


from nltk.corpus import stopwords
stoplist = set(stopwords.words("english"))

In [22]:

# Load and preprocess the custom dataset
df = pd.read_csv("balanced_train_df.csv",  delimiter = " ")  # Replace with the path to your dataset file

# Split the DataFrame into training and validation sets
train_df, val_test_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.375, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text data
train_texts = train_df["Text"].tolist()
val_texts = val_df["Text"].tolist()
test_texts = test_df["Text"].tolist()

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=18)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=18)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=18)

In [2]:
text_lengths = df["Text"].apply(lambda x: len(x.split()))
print(text_lengths.describe())

count    46840.000000
mean        11.815158
std          4.855269
min          2.000000
25%         10.000000
50%         12.000000
75%         14.000000
max        595.000000
Name: Text, dtype: float64


In [3]:
# Create TensorFlow datasets from the encoded data
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_df["Sentiment"].tolist()
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_df["Sentiment"].tolist()
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_df["Sentiment"].tolist()
))

In [4]:
# Define the BERT model
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Compile the model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy("accuracy")]
)

In [6]:
# Train the model
model.fit(
    train_dataset.shuffle(1000).batch(64),
    validation_data=val_dataset.batch(64),
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26aa0d7afa0>

In [334]:
# #Save model
# model.save_pretrained(r"D:\Coding\Twitter_Sentimental_Analysis\My_trained_bert_model")

#Save tokenizer
tokenizer.save_pretrained(r"D:\Coding\Twitter_Sentimental_Analysis\My_trained_bert_model")

# #Save model for DockerAPI
# model.save(r"D:\Coding\Twitter_Sentimental_Analysis\Docker_sentiment_analysis")

('D:\\Coding\\Twitter_Sentimental_Analysis\\My_trained_bert_model\\tokenizer_config.json',
 'D:\\Coding\\Twitter_Sentimental_Analysis\\My_trained_bert_model\\special_tokens_map.json',
 'D:\\Coding\\Twitter_Sentimental_Analysis\\My_trained_bert_model\\vocab.txt',
 'D:\\Coding\\Twitter_Sentimental_Analysis\\My_trained_bert_model\\added_tokens.json')

In [343]:
tokenizer = BertTokenizer.from_pretrained(r"D:/Coding/Twitter_Sentimental_Analysis/token")

In [345]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

### TEST model

In [8]:
# Test the model on the test dataset
test_loss, test_accuracy = model.evaluate(test_dataset.batch(64))

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Test Loss: 0.27277088165283203
Test Accuracy: 0.9375177621841431


### Test model on a single input

In [323]:
def preprocess_text(text):
    # Remove http / https links
    text = re.sub(r'http\S+|https\S+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stoplist)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [28]:
encoded_input

{'input_ids': [101, 2428, 2066, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [324]:
# Define the custom string input
input_text = "i don't think i REALLY LIKE YOU"

# Preprocess the input text
cleaned_text = preprocess_text(input_text)

# Tokenize the cleaned text
encoded_input = tokenizer(
    cleaned_text,
    max_length=18,
    padding="max_length",
    truncation=True,
    return_tensors='tf'
)


# Get the input tensor values
input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]
token_type_ids = encoded_input["token_type_ids"]

# Predict the sentiment label for the input
predictions = model.predict([input_ids, attention_mask, token_type_ids])
logits = predictions.logits[0]
probabilities = tf.nn.softmax(logits)
predicted_label = tf.argmax(probabilities).numpy()
confidence_level = np.max(probabilities)

# Map the predicted label to its corresponding sentiment category
sentiment_categories = ["Negative", "Positive"]
predicted_sentiment = sentiment_categories[predicted_label]

# Print the predicted sentiment and confidence level
print("Predicted sentiment:", predicted_sentiment)
print("Confidence level:", confidence_level)

Predicted sentiment: Negative
Confidence level: 0.9985921


In [327]:
tf.shape(
    encoded_input,
    out_type=tf.dtypes.int32,
    name=None
)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([3])>

In [310]:
type(encoded_input)

transformers.tokenization_utils_base.BatchEncoding

In [150]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Setting up RESTFul API

In [366]:
@tf.function
def preprocess_text(inputs):
    stoplist = ["where","yourselves","our","which","all","other","didn't","on","hasn't","under","in","re","shan't","you've","before","some","haven't","weren't","yours","be","own","don't","above","do","it","your","while","few","y","should've","am","does","he","there","but","her","ma","yourself","these","nor","hadn't","you","will","ain","by","doesn't","itself","if","mustn't","once","those","a","as","been","hadn","shouldn","that'll","each","or","most","again","d","just","any","is","into","you'll","ours","during","were","ll","won","herself","out","isn","she's","their","to","we","below","s","you'd","more","mightn","at","both","now","wasn't","doesn","against","then","are","the","they","from","of","themselves","should","them","t","myself","my","weren","no","can","was","theirs","needn't","further","had","himself","have","mustn","such","for","m","whom","it's","over","so","its","shouldn't","about","through","down","aren't","having","when","i","very","me","did","ourselves","who","you're","only","between","that","him","won't","and","didn","too","not","up","ve","why","aren","hasn","until","after","o","hers","because","being","an","needn","this","wouldn","has","how","don","wouldn't","here","with","off","couldn't","mightn't","shan","same","she","than","isn't","his","wasn","doing","haven","what","couldn"]
    stoplist = tf.constant(stoplist, dtype=tf.string)
    text = inputs

    # Remove http / https links
    text = tf.strings.regex_replace(text, r"http\S+|https\S+", "")
    # Convert to lowercase
    text = tf.strings.lower(text)
    print(text)
    # Remove punctuation
    text = tf.strings.regex_replace(text, "[%s]" % re.escape(string.punctuation), "")
    # Remove numbers
    text = tf.strings.regex_replace(text, r"\d+", "")
    # Remove extra whitespaces
    text = tf.strings.strip(text)
    text = tf.strings.regex_replace(text, rb"\s+", b" ")

    # Create a list of words from the text
    words = tf.strings.split(text)

    # Check if each word is not in the stoplist
    filtered_words = tf.ragged.boolean_mask(words, ~tf.reduce_any(tf.equal(words[:, tf.newaxis], stoplist), axis=-1))

    # Join the filtered words back into a single text
    filtered_text = tf.strings.reduce_join(filtered_words, separator=" ")

    return {"input_text": filtered_text}


@tf.function
def model_predict(input_text):
    exclaim = input_text
    print(exclaim)
    preprocessed_text = preprocess_text(exclaim)

    return_text = preprocessed_text["input_text"] #Has to be string for the next part

    encoded_input = tokenizer(
        tf.compat.as_str_any(return_text),
        max_length=18,
        padding="max_length",
        truncation=True,
        return_tensors='tf'
    )
    # Convert input_ids to a tensor
    # encoded_input = tf.convert_to_tensor(encoded_input)
    # # Reshape input_ids to match the expected shape
    # encoded_input = tf.reshape(encoded_input, (1, -1))

    # Get the input tensor values
    input_ids = encoded_input["input_ids"]
    attention_mask = encoded_input["attention_mask"]
    token_type_ids = encoded_input["token_type_ids"]
    
    # Predict the sentiment label for the input
    predictions = model([input_ids, attention_mask, token_type_ids])
    logits = predictions.logits[0]
    probabilities = tf.nn.softmax(logits)
    predicted_label = tf.argmax(probabilities)
    confidence_level = tf.reduce_max(probabilities)

    # Map the predicted label to its corresponding sentiment category
    sentiment_categories = tf.constant(["Negative", "Positive"])
    predicted_sentiment = tf.gather(sentiment_categories, predicted_label)

    return {
        "predicted_sentiment": predicted_sentiment,
        "confidence_level": confidence_level
    }

In [367]:
#Save model
save_dir = r"D:/Coding/Twitter_Sentimental_Analysis/Docker_inbuilt_analysis"

tf.saved_model.save(
    model,
    save_dir,
    signatures={
        "serving_default": model_predict.get_concrete_function(
           tf.TensorSpec(shape=[None], dtype=tf.string, name="input_text")
        )
    }
)

Tensor("input_text:0", shape=(None,), dtype=string)
Tensor("StringLower:0", shape=(None,), dtype=string)


KeyboardInterrupt: 

# Contacting API

In [322]:
import requests
import json

# Define the URL for the API endpoint
url = 'http://localhost:8501/v1/models/sentiment_analysis:predict'

# Prepare the input data
input_data = {
    'instances': [{'input_text': 'This is the first text'}]
}

response = requests.post(url, json=input_data)

# Parse the response
if response.status_code == 200:
    predictions = response.json()['predictions']
    print(predictions)
else:
    print('Error:', response.status_code, response.text)


Error: 400 {
    "error": "assertion failed: [Condition x == y did not hold element-wise:] [x (cond/RowPartitionFromUniformRowLength/control_dependency:0) = ] [0 179] [y (cond/RowPartitionFromRowLengths/concat:0) = ] [0 5]\n\t [[{{function_node cond_assert_equal_1_Assert_AssertGuard_false_438382}}{{node cond/assert_equal_1/Assert/AssertGuard/Assert}}]]"
}


In [250]:
def make_prediction(instances):
   data = json.dumps({"signature_name": "serving_default", "instances": instances})
   headers = {"content-type": "application/json"}
   json_response = requests.post(url, data=data, headers=headers)
   predictions = json.loads(json_response.text)
   return predictions

In [252]:
make_prediction(['I love you'])

{'error': 'instances is a plain list, but expecting list of objects as multiple input tensors required as per tensorinfo_map'}

In [229]:
response.json()

{'error': 'instances is a plain list, but expecting list of objects as multiple input tensors required as per tensorinfo_map'}