### Import libraries

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np

import re, string

# Set GPU memory growth
# Memory growth is switched on for every GPU that TensorFlow reports; with no
# GPU present the loop body simply never runs.
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

#Import stopwords library
import nltk
from nltk.corpus import stopwords

# NLTK raises LookupError when the "stopwords" corpus has not been downloaded
# yet (e.g. on a fresh environment). Fetch it once and retry so this cell also
# works after a clean install instead of failing on first run.
try:
    stoplist = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stoplist = set(stopwords.words("english"))

# Load the BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Libraries for api
import requests
import json
import re
import string

### Load the dataset for processing

In [4]:
# Read the dataset from disk.
# NOTE(review): the file is parsed as space-delimited despite its .csv
# extension -- confirm this matches how the file was written.
df = pd.read_csv("balanced_train_df.csv", delimiter=" ")

# Three-way split with a fixed seed: 40% is held out first, then 37.5% of that
# hold-out becomes the test set. Net result: 60% train / 25% validation / 15% test.
train_df, val_test_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.375, random_state=42)

# Pull the 'Text' column of each split out as a plain Python list.
train_texts, val_texts, test_texts = (
    split["Text"].tolist() for split in (train_df, val_df, test_df)
)

# Tokenize every split with identical settings (truncate to at most 18 tokens).
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=18)
    for texts in (train_texts, val_texts, test_texts)
)

In [5]:
# Summary statistics of the number of whitespace-separated words per text
text_lengths = df["Text"].map(lambda text: len(text.split()))
print(text_lengths.describe())

count    46840.000000
mean        11.815158
std          4.855269
min          2.000000
25%         10.000000
50%         12.000000
75%         14.000000
max        595.000000
Name: Text, dtype: float64


### Create TensorFlow datasets from the encoded data and their sentiment labels

In [18]:
def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output (as a plain dict) with its labels in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


# One dataset per split: encoded features together with the 'Sentiment' column
train_dataset = _as_tf_dataset(train_encodings, train_df["Sentiment"].tolist())
val_dataset = _as_tf_dataset(val_encodings, val_df["Sentiment"].tolist())
test_dataset = _as_tf_dataset(test_encodings, test_df["Sentiment"].tolist())

### Define the model, compile it, and train it

In [None]:
# # Download model online from huggingface
# bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# bert_model.compile(
#     optimizer=keras.optimizers.Adam(learning_rate=2e-5),
#     loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     metrics=[keras.metrics.SparseCategoricalAccuracy("accuracy")]
# )

# bert_model.fit(
#     train_dataset.shuffle(1000).batch(64),
#     validation_data=val_dataset.batch(64),
#     epochs=5
# )

In [25]:
# #Evaluate trained model on test dataset
# test_loss, test_accuracy = bert_model.evaluate(test_dataset.batch(64))

# print(f"Test Loss: {test_loss}")
# print(f"Test Accuracy: {test_accuracy}")

In [None]:
# #Save model
# bert_model.save_pretrained(r"D:\Coding\Twitter_Sentimental_Analysis\My_trained_bert_model")

In [2]:
# Restore the model from the local directory it was saved to.
# NOTE(review): this is an absolute, machine-specific path -- consider making
# it configurable so the notebook runs on other machines.
MODEL_DIR = r"D:\Coding\Twitter_Sentimental_Analysis\My_trained_bert_model"
bert_model = TFBertForSequenceClassification.from_pretrained(MODEL_DIR)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at D:\Coding\Twitter_Sentimental_Analysis\My_trained_bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


### Test model on a single text

In [26]:
#Preprocess text before predicting on it

def preprocess_text(text, stop_words=None):
    """Clean a raw text string before it is tokenized for prediction.

    The steps run in this order:
      1. drop http/https links (matched case-insensitively),
      2. lowercase,
      3. strip ASCII punctuation,
      4. strip digits,
      5. drop stopwords and collapse whitespace.

    Args:
        text: The raw input string.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``stoplist`` is used.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace. May be empty.
    """
    if stop_words is None:
        stop_words = stoplist

    # `http\S+` already covers https links. IGNORECASE is needed because this
    # runs before lowercasing: without it "HTTP://..." survives and turns into
    # a junk token once punctuation is stripped.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    text = text.lower()
    # Punctuation is deleted rather than replaced, so "don't" becomes "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)

    # split() + join() also collapses whitespace runs and trims both ends, so
    # no separate whitespace-normalisation pass is required.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [32]:
# Sentence to classify
input_text = "I hate you"

# Clean the sentence with the notebook's preprocessing helper
cleaned_text = preprocess_text(input_text)

# Encode as TensorFlow tensors, padded/truncated to exactly 18 tokens
encoded_input = tokenizer(
    cleaned_text,
    truncation=True,
    padding="max_length",
    max_length=18,
    return_tensors='tf'
)

# Pull out the three tensors that are passed to the model
input_ids, attention_mask, token_type_ids = (
    encoded_input[key] for key in ("input_ids", "attention_mask", "token_type_ids")
)

# Run the model and keep the logits of the first (only) example
predictions = bert_model.predict([input_ids, attention_mask, token_type_ids])
logits = predictions.logits[0]

# Softmax turns the logits into probabilities; the largest one is reported
# as the confidence and its index selects the label
probabilities = tf.nn.softmax(logits)
confidence_level = np.max(probabilities)
predicted_label = tf.argmax(probabilities).numpy()

# Label index -> readable name (0 = Negative, 1 = Positive)
sentiment_categories = ["Negative", "Positive"]
predicted_sentiment = sentiment_categories[predicted_label]

# Report the result
print("Predicted sentiment:", predicted_sentiment)
print("Confidence level:", confidence_level)

Predicted sentiment: Negative
Confidence level: 0.9916289


### Creating API call for hosted model

In [3]:
# Define URL for the API endpoint
url = 'http://localhost:8501/v1/models/sentiment_analysis:predict'

def predict_sentiment_api(text, api_url=None, timeout=30):
    """Clean and tokenize ``text``, then request a prediction from the hosted model.

    The text is cleaned with ``preprocess_text`` (so that cell must have been
    run first), tokenized to a fixed length of 18 tokens, and POSTed as a
    single instance to the prediction endpoint.

    Args:
        text: Raw input string to classify.
        api_url: Endpoint to call. Defaults to the notebook-level ``url``.
        timeout: Seconds to wait for the server before giving up, so that an
            unreachable or stalled server cannot hang the notebook forever.

    Returns:
        The decoded JSON body of the server's response.

    Raises:
        RuntimeError: If the server answers with an HTTP error status. The
            response body is included in the message.
        requests.RequestException: On connection problems or timeout.
    """
    def make_prediction_api(instances):
        """POST ``instances`` to the endpoint and return the decoded JSON reply."""
        payload = json.dumps({"signature_name": "serving_default", "instances": instances})
        response = requests.post(
            url if api_url is None else api_url,
            data=payload,
            headers={"content-type": "application/json"},
            timeout=timeout,
        )
        # Fail here, with the server's own message, rather than letting an
        # error payload be mistaken for a prediction further down the line.
        if not response.ok:
            raise RuntimeError(
                f"Prediction request failed with HTTP {response.status_code}: {response.text}"
            )
        return response.json()

    # Use the shared cleaning helper so local and API predictions are always
    # preprocessed identically (previously a hand-copied duplicate).
    cleaned_text = preprocess_text(text)

    # Tokenize the cleaned text
    encoded_input_api = tokenizer(
        cleaned_text,
        max_length=18,
        padding="max_length",
        truncation=True,
        return_tensors='tf'
    )

    # Prepare instance for API call: one JSON-serialisable list per model
    # input, taken from the first (only) row of each tensor.
    instances_api = [{
        name: encoded_input_api[name].numpy().tolist()[0]
        for name in ("input_ids", "attention_mask", "token_type_ids")
    }]

    # Make api call
    return make_prediction_api(instances_api)


In [4]:
# Sentence to send to the hosted model
input_text_api = 'this is here, that is there, but i dont like it'

# Call the API and show the raw response
result_api = predict_sentiment_api(input_text_api)
print("Result from call: ",result_api)

# Label index -> readable name (0 = Negative, 1 = Positive)
sentiment_categories = ["Negative", "Positive"]

# The first entry under 'predictions' holds the logits for the single input
logits = result_api['predictions'][0]

# Softmax turns the logits into probabilities; the largest one is reported
# as the confidence and its index selects the label
probabilities = tf.nn.softmax(logits)
confidence_level = np.max(probabilities)
predicted_label = tf.argmax(probabilities).numpy()
predicted_sentiment = sentiment_categories[predicted_label]

# Report the result
print("Predicted sentiment:", predicted_sentiment)
print("Confidence level:", confidence_level)

Result from call:  {'predictions': [[3.4687202, -3.55189681]]}
Predicted sentiment: Negative
Confidence level: 0.99910754
