In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install emoji

In [None]:
!pip install ekphrasis

In [None]:
pip install plotly==4.5.4

In [None]:
!pip install transformers==4.2.1

In [5]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import re
import os

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

In [7]:
def print_text(texts,i,j):
    for u in range(i,j):
        print(texts[u])
        print()

In [None]:
df = pd.read_csv('/content/drive/My Drive/offenseval/olid-training-v1.0.tsv',delimiter='\t',encoding='utf-8')
print(list(df.columns.values)) #file header
print(df.head(5)) #last N rows

In [11]:
df.replace(np.NaN, 'NA', inplace=True)

In [12]:
df.head(5)

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [None]:
text_array = df["tweet"]
labels = df["subtask_a"]
labels_target = df["subtask_b"]
print(len(text_array))
print_text(text_array,0,10)

In [14]:
original = text_array

In [15]:
from collections import Counter

In [None]:
df_test_labels_b = pd.read_csv('/content/drive/My Drive/offenseval/labels-levelb.csv', header=None)
print(len(df_test_labels_b))
lol = df_test_labels_b[1]
print(Counter(lol))
df_test_labels_b.head(5)

In [21]:
labels_target_test = []

In [None]:
df_test_text = pd.read_csv('/content/drive/My Drive/offenseval/testset-levela.tsv',delimiter='\t',encoding='utf-8')
print(list(df_test_text.columns.values)) #file header
print(df_test_text.head(5)) #first N rows

df_test_labels = pd.read_csv('/content/drive/My Drive/offenseval/labels-levela.csv', header=None)
print(list(df_test_labels.columns.values))
print(df_test_labels.head(5))

count = 0
j = 0
for i in range(0,len(df_test_text["id"])):
    if df_test_labels[1][i] == "OFF":
        if df_test_labels[0][i] == df_test_labels_b[0][j]:
            labels_target_test.append(df_test_labels_b[1][j])
            j = j + 1
    else:
        labels_target_test.append("NA")

print(len(df_test_text["id"]))        
print(count)

text_array_test = df_test_text["tweet"]
labels_test = df_test_labels[1]
print("Checking length of validation set")
print(len(text_array_test),len(labels_test))

In [None]:
original_test = text_array_test

In [None]:
Counter(labels_target_test)

In [None]:
#removing website names
def remove_website(text):
    return " ".join([word if re.search("r'https?://\S+|www\.\S+'|((?i).com$|.co|.net)",word)==None else "" for word in text.split(" ") ])

# Training set 
text_array = text_array.apply(lambda text: remove_website(text))
print_text(text_array,0,10)

print("**************************************************************************")

# Validation set 
text_array_test = text_array_test.apply(lambda text: remove_website(text))
print_text(text_array_test,0,10)

In [None]:
# Functions for chat word conversion
f = open("/content/drive/My Drive/offenseval/slang.txt", "r")
chat_words_str = f.read()
chat_words_map_dict = {}
chat_words_list = []

for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [None]:
# Chat word conversion
# Training set
text_array = text_array.apply(lambda text: chat_words_conversion(text))
print_text(text_array,0,10)
print_text(original,0,10)

print("********************************************************************************")

# Validation set
text_array_test = text_array_test.apply(lambda text: chat_words_conversion(text))
print_text(text_array_test,0,10)

In [None]:
os.chdir("/content/drive/My Drive/offenseval")
print(os.getcwd())

In [None]:
#Function for emoticon conversion
from emoticons import EMOTICONS

def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', " ".join(EMOTICONS[emot].replace(",","").split()), text)
    return text


#testing the emoticon function
text = "Hello :-) :-)"
text = convert_emoticons(text)
print(text + "\n")

In [None]:
# Emoticon conversion
# Training set
text_array = text_array.apply(lambda text: convert_emoticons(text))
print_text(text_array,0,10)

print("**********************************************************************************")

# Validation set
text_array_test = text_array_test.apply(lambda text: convert_emoticons(text))
print_text(text_array_test,0,10)

In [None]:
# FUnction for removal of emoji
import emoji

def convert_emojis(text):
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub("_|-"," ",text)
    return text

# Training set
text_array = text_array.apply(lambda text: convert_emojis(text))
print_text(text_array,0,10)

print("**************************************************************************")

# Validation set
text_array_test = text_array_test.apply(lambda text: convert_emojis(text))
print_text(text_array_test,0,10)

In [None]:
os.chdir("/content")
print(os.getcwd())

In [None]:
# Ekphrasis pipe for text pre-processing
def ekphrasis_pipe(sentence):
    cleaned_sentence = " ".join(text_processor.pre_process_doc(sentence))
    return cleaned_sentence

# Training set
text_array = text_array.apply(lambda text: ekphrasis_pipe(text))
print("Training set completed.......")
#Validation set
text_array_test = text_array_test.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
print_text(text_array,0,10)
print("************************************************************************")
print_text(text_array_test,0,10)

In [None]:
# Removing unnecessary punctuations
PUNCT_TO_REMOVE = "\"$%&'()+,-./;=[\]^_`{|}~"
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

# Training set
text_array = text_array.apply(lambda text: remove_punctuation(text))
print_text(text_array,0,10)

print("********************************************************************")

# Validation set
text_array_test = text_array_test.apply(lambda text: remove_punctuation(text))
print_text(text_array_test,0,10)

In [None]:
# print_text(text_array,3550,3555)
print_text(original,9540,9555)

In [None]:
# Finding length of longest array
maxLen = len(max(text_array,key = lambda text: len(text.split(" "))).split(" "))
print(maxLen)

In [None]:
u = lambda text: len(text.split(" "))
sentence_lengths = []
for x in text_array:
    sentence_lengths.append(u(x))
print(sorted(sentence_lengths)[-100:])
print(len(sentence_lengths))

<h2>Text pre-processing complete</h2>

In [None]:
# Count of each label in dataset
from collections import Counter

# Printing training set counts for analysis
print("Elements: ",set(labels))
print("Length: ",len(labels))
print(Counter(labels))

print("**************************************************************************")

# Printing validation set counts for analysis
print("Elements: ",set(labels_test))
print("Length: ",len(labels_test))
print(Counter(labels_test))

In [None]:
Y = []
Y_test = []

# Training set    
for i in range(0,len(labels)):
    if(labels[i] == "OFF"):
        Y.append(0)
    if(labels[i] == "NOT"):
        Y.append(1)

# Validation set
for i in range(0,len(labels_test)):
    if(labels_test[i] == "OFF"):
        Y_test.append(0)
    if(labels_test[i] == "NOT"):
        Y_test.append(1)

In [None]:
Y_target = []
Y_target_test = []

# Training set    
for i in range(0,len(labels_target)):
    if(labels_target[i] == "NA"):
        Y_target.append(0)
    if(labels_target[i] == "TIN"):
        Y_target.append(1)
    if(labels_target[i] == "UNT"):
        Y_target.append(2)

# Validation set
for i in range(0,len(labels_target_test)):
    if(labels_target_test[i] == "NA"):
        Y_target_test.append(0)
    if(labels_target_test[i] == "TIN"):
        Y_target_test.append(1)
    if(labels_target_test[i] == "UNT"):
        Y_target_test.append(2)

In [None]:
# Testing the conversion into integers
for i in range(200,210):
    print(text_array_test[i])
    print(labels_test[i],Y_test[i])
    print(labels_target_test[i],Y_target_test[i])
    print()

In [None]:
# Verifying train set 
X = np.asarray(list(text_array))
Y = np.asarray(list(Y))
Y_target = np.asarray(list(Y_target))
print(type(X))
print(type(Y))
print(type(Y_target))
print(np.shape(X),np.shape(Y),np.shape(Y_target))

# Verifying validation set
X_test = np.asarray(list(text_array_test))
Y_test = np.asarray(list(Y_test))
Y_target_test = np.asarray(list(Y_target_test))
print(type(X_test))
print(type(Y_test))
print(type(Y_target_test))
print(np.shape(X_test),np.shape(Y_test),np.shape(Y_target_test))

In [None]:
print(Counter(Y))
print(Counter(Y_test))

In [None]:
print(X_test[0])
print(Y_test[0])
print(labels_test[0])
print(Y_target_test[0])
print(labels_target_test[0])

<h2>Shuffling training and validation data</h2>

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
print(Counter(labels))
print(Counter(labels_test))
print(Counter(labels_target))
print(Counter(labels_target_test))

In [None]:
# Converting to one hot vectors
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)] #u[Y] helps to index each element of Y index at u. U here is a class array
    return Y

In [None]:
Y_oh_train = convert_to_one_hot(np.array(Y), C = 2)
Y_oh_test = convert_to_one_hot(np.array(Y_test), C = 2)

Y_oh_target_train = convert_to_one_hot(np.array(Y_target), C = 3)
Y_oh_target_test = convert_to_one_hot(np.array(Y_target_test), C = 3)
print(np.shape(Y_oh_train))
print(np.shape(Y_oh_target_test))
index = 0
print(labels[index], Y[index], "is converted into one hot", Y_oh_train[index])
print(labels_target[index], Y_target[index], "is converted into one hot", Y_oh_target_train[index])

<h2>Model using BERT</h2>

In [None]:
import tensorflow as tf
import os
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import re
import os
from collections import Counter

In [None]:
from transformers import RobertaTokenizerFast, TFRobertaModel, TFBertModel, BertTokenizerFast, ElectraTokenizerFast, TFElectraModel, AlbertTokenizerFast, TFAlbertModel, XLNetTokenizerFast, TFXLNetModel, MPNetTokenizerFast, TFMPNetModel
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import RepeatVector, Concatenate, Dense, Activation, Dot, BatchNormalization, Dropout

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [None]:
print(tf.__version__)

In [None]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
X = list(X)
X_test = list(X_test)

In [None]:
model_train_x, model_val_x, Y_train, Y_val = train_test_split(X, Y, test_size=0.05, random_state=44)

In [None]:
train_encodings = tokenizer(model_train_x, max_length=100, truncation=True, padding="max_length", return_tensors='tf')
val_encodings = tokenizer(model_val_x, max_length=100, truncation=True, padding="max_length", return_tensors='tf')
test_encodings = tokenizer(X_test, max_length=100, truncation=True, padding="max_length", return_tensors='tf')

In [None]:
cluster_encodings = tokenizer(X, max_length=100, truncation=True, padding="max_length", return_tensors='tf')

In [None]:
print(np.shape(train_encodings["input_ids"]))
print(np.shape(val_encodings["input_ids"]))
print(np.shape(test_encodings["input_ids"]))
print(np.shape(cluster_encodings["input_ids"]))

In [None]:
print(train_encodings["input_ids"][0])
print("***************************************************************************")
print(val_encodings["input_ids"][0])
print("***************************************************************************")
print(test_encodings["input_ids"][0])

<h3> Subtask A</h3>

In [None]:
def Offense_classifier(input_shape):
    """
    Function creating the model's graph.
    
    Arguments:
    input_shape -- shape of the input,(max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 100-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (13 million words)

    Returns:
    model -- a model instance in Keras
    """
    model = TFBertModel.from_pretrained('bert-base-uncased')
    layer = model.layers[0]

    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    inputs = keras.Input(shape=input_shape, dtype='int32')
    input_masks = keras.Input(shape=input_shape, dtype='int32')
    
    embeddings = layer([inputs, input_masks])[1]

    X = BatchNormalization(momentum=0.99, epsilon=0.001, center=True, scale=True)(embeddings)
    
    # Add dropout with a probability of 0.1
    X = Dropout(0.1)(X)
    
    X = Dense(128,activation='elu',kernel_regularizer=keras.regularizers.l2(0.001))(X)

    X = Dense(32,activation='elu',kernel_regularizer=keras.regularizers.l2(0.001))(X)

    X = Dense(3,activation='elu',kernel_regularizer=keras.regularizers.l2(0.01))(X)

    X = Dense(32,activation='elu',kernel_regularizer=keras.regularizers.l2(0.001))(X)

    X = BatchNormalization(momentum=0.99, epsilon=0.001, center=True, scale=True)(X)

    X = Dense(128,activation='elu',kernel_regularizer=keras.regularizers.l2(0.001))(X)

    X = Dense(1,activation='linear',kernel_regularizer=keras.regularizers.l2(0.01))(X)
    
    # Add a sigmoid activation
    X = Activation('sigmoid')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = keras.Model(inputs=[inputs,input_masks], outputs=[X])
    
    
    return model

In [None]:
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
class EvaluationMetric(keras.callbacks.Callback):   
    
    def __init__(self, trial_encodings, trial_masks, Y_test):
        super(EvaluationMetric, self).__init__()
        self.trial_encodings = trial_encodings
        self.trial_masks = trial_masks
        self.Y_test = Y_test
    
    def on_epoch_begin(self, epoch, logs={}):
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs={}):
        print("\nEvaluating...")
        trial_prediction = self.model.predict([self.trial_encodings,self.trial_masks])
        
        pred = []
        for i in range(0,len(self.Y_test)):
            num = trial_prediction[i]
            if(num > 0.5):
              num = 1
            else:
              num = 0
            pred.append(num)
        
        from sklearn.metrics import classification_report
        print(classification_report(Y_test, pred, digits=3))
        
evaluation_metric = EvaluationMetric(test_encodings["input_ids"], test_encodings["attention_mask"], Y_test)

In [None]:
with strategy.scope():
    model = Offense_classifier((100,))
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    loss_fun = [
          tf.keras.losses.BinaryCrossentropy()
    ]
    metric = ['acc']
    model.compile(optimizer=optimizer, loss=loss_fun, metrics=metric)

In [None]:
model.summary()

In [None]:
neg, pos = np.bincount(Y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

In [None]:
class_weight = {}
maxi = max(neg, pos)
weight_for_0 = (maxi / (maxi + neg)) 
weight_for_1 = (maxi / (maxi + pos))

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(filepath='/content/olid_bert.{epoch:03d}.h5',
                                 monitor='val_acc',
                                 verbose=1,
                                 save_weights_only=True,
                                 period=1)

In [None]:
print(Counter(Y))
print(Counter(Y_test))

In [None]:
print(Counter(Y_train))
print(Counter(Y_val))

In [None]:
print(len(train_encodings["input_ids"]),len(val_encodings["input_ids"]))

In [None]:
# val 0.05
history = model.fit(
    x = [train_encodings["input_ids"], train_encodings["attention_mask"]],
    y = Y_train,
    validation_data = ([val_encodings["input_ids"],val_encodings["attention_mask"]],Y_val),
    callbacks = [evaluation_metric, checkpoint],
    batch_size = 64,
    shuffle=True,
    epochs=6,
    class_weight = class_weight
)

<h4>Training Curves</h4>

In [None]:
history = history
import matplotlib.pyplot as plt
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.show()

In [None]:
# model.load_weights("/content/drive/MyDrive/OLID Transformer weights/olid_bert(0.05).003.h5")

In [None]:
# model.save_weights("/content/drive/MyDrive/OLID Transformer weights/olid_bert(0.05).003.h5")

<h4>Test Set Statistics</h4>

In [None]:
answer = model.predict([test_encodings["input_ids"], test_encodings["attention_mask"]])

In [None]:
pred = []
sample = df_test_text["tweet"]
count = 0
for i in range(0,len(X_test)):

    num = answer[i]
    if(num > 0.5):
      num = 1
    else:
      num = 0
    pred.append(num)

print(count)

In [None]:
con_mat = tf.math.confusion_matrix(labels=Y_test, predictions=pred)
print(con_mat)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
figure = plt.figure(figsize=(8, 8))
sns.set(font_scale=1.75)
sns.heatmap(con_mat, annot=True,cmap=plt.cm.viridis,fmt='d', xticklabels=["Offensive","Not Offensive"], yticklabels=["Offensive","Not Offensive"],annot_kws={"size": 15})
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
from sklearn.metrics import f1_score, classification_report

In [None]:
f1_score(Y_test, pred, average='macro')

In [None]:
print(classification_report(Y_test, pred, target_names=["offensive", "not offensive"], digits=3))

<h3>Train set analysis</h3>

In [None]:
answer_train = model.predict([cluster_encodings["input_ids"], cluster_encodings["attention_mask"]])

In [None]:
pred = []
sample = original
count = 0
for i in range(0,len(Y)):

    num = answer_train[i]
    lol = num
    if(num > 0.5):
      num = 1
    else:
      num = 0
    pred.append(num)
    if(num != Y[i] and Y[i] == 0 and lol >=0.8):
        print(answer_train[i])
        print("Original label: ",labels[i])
        print("Without pre-processing: ",sample[i])
        print("With pre-processing: ",X[i])
        lol = ""
        count += 1

        if(num == 0):
            lol = "Offensive"
        if(num == 1):
            lol = "Not Offensive"
        print("Predicted: " + lol)
        print()

print(count)

<h3>Training examination</h3>

In [None]:
import plotly
import plotly.graph_objs as go
import plotly.express as px

In [None]:
# 3 neuron output
model.layers[-6].name

In [None]:
cluster_dense_3 = keras.Model(inputs=model.input, outputs=model.layers[-6].output)
with strategy.scope():
    cluster_3 = cluster_dense_3.predict([cluster_encodings["input_ids"], cluster_encodings["attention_mask"]])

In [None]:
pred_train = []
temp = 0

for i in range(0,len(Y)):

    num = answer_train[i]

    if(num >= 0.5):
      num = 1
    else:
      num = 0
    pred_train.append(num)

print(temp)

0


In [None]:
flag = []
count = 0

x_ = []
y_ = []
z_ = []


for i in range(0,len(Y)):
    count = count + 1
    x_.append(cluster_3[i][2])
    y_.append(cluster_3[i][0])
    z_.append(cluster_3[i][1])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag.append(1)


In [None]:
Counter(flag)

In [None]:
con_mat = tf.math.confusion_matrix(labels=Y, predictions=pred_train)
print(con_mat)

In [None]:
pred_colour = []
for i in range(0,len(flag)):
    if flag[i] == 2:
      pred_colour.append("Neutral")
    if flag[i] == 1:
      pred_colour.append("Not Offensive")
    if flag[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'x':x_, 'y':y_, 'z':z_, 'Labels':pred_colour})

fig = px.scatter_3d(test_df, x='x', y='y', z='z', color='Labels')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 0.7,
        'colorscale' : 'Oryel',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant', 'font_size':18}, font_size=15, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 500)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

In [None]:
pred_colour = []
for i in range(0,len(flag)):
    if pred_train[i] == 1:
      pred_colour.append("Not Offensive")
    if pred_train[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'X':x_, 'Y':y_, 'Z':z_, 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='X', y='Y', z='Z', color='Labels:')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 1,
        'colorscale' : 'rainbow',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

<h3>Traning examination end</h3>

<h1>CLUSTERING</h1>

<h3>BERT PLM layer</h3>

<h5>k-means PLM</h5>

In [None]:
model.layers[-8].name

In [None]:
cluster_bert = keras.Model(inputs=model.input, outputs=model.layers[-8].output)
with strategy.scope():
    cl_bert = cluster_bert.predict([test_encodings["input_ids"], test_encodings["attention_mask"]])

In [None]:
len(cl_bert)

In [None]:
flag_bert = []
count = 0

x_bert = []
y_bert = []
z_bert = []


for i in range(0,len(Y)):
    count = count + 1
    x_bert.append(cl_bert[i][0])
    y_bert.append(cl_bert[i][1])
    z_bert.append(cl_bert[i][2])

    if( answer_train[i] > 0.28 and answer_train[i] < 0.8 ):
        flag_bert.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.28 ):
        flag_bert.append(0)

    if( answer_train[i] >= 0.8 and answer_train[i] < 1 ):
        flag_bert.append(1)

print(count)

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans_bert = KMeans(n_clusters=3, random_state=44).fit(cl_bert)
y_kmeans_bert = kmeans_bert.predict(cl_bert)

In [None]:
Counter(y_kmeans_bert)

In [None]:
Counter(flag_bert)

Counter({0: 3320, 1: 7367, 2: 2553})

In [None]:
# 1 index values are offensive
# 0 index values are not offensive
# 2 index values are neutral

count = 0
for i in range(0,len(flag_bert)):
      if flag_bert[i] == 1 and y_kmeans_bert[i] == 0:
        count = count + 1
print(count)

In [None]:
for i in range(0,len(flag_bert)):
    if(y_kmeans_bert[i] == 0):
      y_kmeans_bert[i] = 1
    elif(y_kmeans_bert[i] == 1):
      y_kmeans_bert[i] = 0
    else:
      y_kmeans_bert[i] = 2

In [None]:
flag_bert = []
count = 0

x_bert = []
y_bert = []
z_bert = []


for i in range(0,len(Y)):
    count = count + 1
    x_bert.append(cl_bert[i][0])
    y_bert.append(cl_bert[i][1])
    z_bert.append(cl_bert[i][2])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag_bert.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag_bert.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag_bert.append(1)
        

print(count)

In [None]:
Counter(flag_bert)

In [None]:
con_mat = tf.math.confusion_matrix(labels=flag_bert, predictions=y_kmeans_bert)
print(con_mat)

In [None]:
import sklearn
print(sklearn.metrics.classification_report(flag_bert, y_kmeans_bert, output_dict=False, digits=3))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [None]:
centers_bert = kmeans_bert.cluster_centers_

In [None]:
svns_off = []
for i in range(0,len(Y_test)):
    off = cosine(cl_bert[i], centers_bert[1])/2
    svns_off.append(1-off)
print(len(svns_off))

In [None]:
svns_noff = []
for i in range(0,len(Y_test)):
    noff = cosine(cl_bert[i], centers_bert[0])/2
    svns_noff.append(1-noff)
print(len(svns_noff))

In [None]:
svns_neu = []
for i in range(0,len(Y_test)):
    neu = cosine(cl_bert[i], centers_bert[2])/2
    svns_neu.append(1-neu)
print(len(svns_neu))

In [None]:
import plotly
import plotly.graph_objs as go
import plotly.express as px

<p>k-means PLM Plot</p>

In [None]:
pred_colour = []
for i in range(0,len(y_kmeans_bert)):
    if y_kmeans_bert[i] == 2:
      pred_colour.append("Neutral")
    if y_kmeans_bert[i] == 1:
      pred_colour.append("Not Offensive")
    if y_kmeans_bert[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'SVNS Offensive':svns_off, 'SVNS Not Offensive':svns_noff, 'SVNS Neutral':svns_neu, 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='SVNS Offensive', y='SVNS Not Offensive', z='SVNS Neutral', color='Labels:')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 1,
        'colorscale' : 'viridis',
    },
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

In [None]:
pred_kbert = []
for i in range(0,len(Y_test)):
    if(svns_off[i] > svns_noff[i]):
        pred_kbert.append(0)
    else:
        pred_kbert.append(1)
print(classification_report(Y_test, pred_kbert, output_dict=False, digits=3))

In [None]:
con_mat = tf.math.confusion_matrix(labels=Y_test, predictions=pred_kbert)
print(con_mat)

<p> GMM model PLM </p>

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
gmm_bert = GaussianMixture(n_components=3, random_state = 44).fit(cl_bert)

In [None]:
mean_bert = gmm_bert.means_
cov_bert = gmm_bert.covariances_
print(np.shape(mean_bert))
print(np.shape(cov_bert))

In [None]:
labels_bert = gmm_bert.predict(cl_bert)

In [None]:
flag_bert = []
count = 0

x_bert = []
y_bert = []
z_bert = []


for i in range(0,len(X)):
    count = count + 1
    x_bert.append(cl_bert[i][0])
    y_bert.append(cl_bert[i][1])
    z_bert.append(cl_bert[i][2])

    if( answer_train[i] > 0.28 and answer_train[i] < 0.8 ):
        flag_bert.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.28 ):
        flag_bert.append(0)

    if( answer_train[i] >= 0.8 and answer_train[i] < 1 ):
        flag_bert.append(1)

print(count)

In [None]:
Counter(flag_bert)

In [None]:
# 1 index values are offensive
# 0 index values are not offensive
# 2 index values are neutral

count = 0
for i in range(0,len(flag_bert)):
      if flag_bert[i] == 2 and labels_bert[i] == 0:
        count = count + 1
print(count)

In [None]:
for i in range(0,len(flag_bert)):
    if(labels_bert[i] == 0):
      labels_bert[i] = 2
    elif(labels_bert[i] == 1):
      labels_bert[i] = 1
    else:
      labels_bert[i] = 0

In [None]:
flag_bert = []
count = 0

x_bert = []
y_bert = []
z_bert = []


for i in range(0,len(X)):
    count = count + 1
    x_bert.append(cl_bert[i][0])
    y_bert.append(cl_bert[i][1])
    z_bert.append(cl_bert[i][2])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag_bert.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag_bert.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag_bert.append(1)

print(count)

In [None]:
con_mat = tf.math.confusion_matrix(labels=flag_bert, predictions=labels_bert)
print(con_mat)

In [None]:
import sklearn
print(sklearn.metrics.classification_report(flag_bert, labels_bert, output_dict=False, digits=3))

In [None]:
prob_bert = gmm_bert.predict_proba(cl_bert)
prob_bert = prob_bert.T

<p>GMM PLM  Plot</p>

In [None]:
pred_colour = []
for i in range(0,len(y_kmeans_bert)):
    if labels_bert[i] == 2:
      pred_colour.append("Neutral")
    if labels_bert[i] == 1:
      pred_colour.append("Not Offensive")
    if labels_bert[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'SVNS Offensive':prob_bert[2], 'SVNS Non Offensive':prob_bert[1], 'SVNS Neutral':prob_bert[0], 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='SVNS Offensive', y='SVNS Non Offensive', z='SVNS Neutral', color='Labels:')
fig.update_traces(
    marker={
        'size': 1.8,
        'opacity': 1,
        'colorscale' : 'viridis',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

<h3>Dense 3 layer</h3>

In [None]:
from sklearn.preprocessing import normalize

In [None]:
cl_norm = normalize(cluster_3, norm='l2', axis=1)

In [None]:
flag_3 = []
count = 0

x_ = []
y_ = []
z_ = []


for i in range(0,len(X)):
    count = count + 1
    x_.append(cl_norm[i][0])
    y_.append(cl_norm[i][1])
    z_.append(cl_norm[i][2])

    if( answer_train[i] > 0.28 and answer_train[i] < 0.8 ):
        flag_3.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.28 ):
        flag_3.append(0)

    if( answer_train[i] >= 0.8 and answer_train[i] < 1 ):
        flag_3.append(1)

print(count)

<p>k-means Dense 3</p>

In [None]:
kmeans_3 = KMeans(n_clusters=3, random_state=44).fit(cl_norm)
y_kmeans_3 = kmeans_3.predict(cl_norm)

In [None]:
Counter(y_kmeans_3)

In [None]:
Counter(flag_3)

In [None]:
# 1 index values are offensive
# 0 index values are not offensive
# 2 index values are neutral

count = 0
for i in range(0,len(flag_3)):
      if flag_3[i] == 2 and y_kmeans_3[i] == 0:
        count = count + 1
print(count)

In [None]:
for i in range(0,len(flag_3)):
    if(y_kmeans_3[i] == 0):
      y_kmeans_3[i] = 0
    elif(y_kmeans_3[i] == 1):
      y_kmeans_3[i] = 1
    else:
      y_kmeans_3[i] = 2

In [None]:
flag_3 = []
count = 0

x_ = []
y_ = []
z_ = []


for i in range(0,len(X)):
    count = count + 1
    x_.append(cl_norm[i][2])
    y_.append(cl_norm[i][1])
    z_.append(cl_norm[i][0])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag_3.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag_3.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag_3.append(1)

print(count)

In [None]:
Counter(flag_3)

In [None]:
con_mat = tf.math.confusion_matrix(labels=flag_3, predictions=y_kmeans_3)
print(con_mat)

In [None]:
import sklearn
print(sklearn.metrics.classification_report(flag_3, y_kmeans_3, output_dict=False, digits=3))

<p>Transition phase</p>

In [None]:
pred_colour = []
for i in range(0,len(y_kmeans_bert)):
    if y_kmeans_3[i] == 2:
      pred_colour.append("Neutral")
    if y_kmeans_3[i] == 1:
      pred_colour.append("Not Offensive")
    if y_kmeans_3[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'X':x_, 'Y':y_, 'Z':z_, 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='X', y='Y', z='Z', color='Labels:')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 1,
        'colorscale' : 'viridis',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

<p>Original predictions</p>

In [None]:
pred_colour = []
for i in range(0,len(y_kmeans_bert)):
    if pred_train[i] == 1:
      pred_colour.append("Not Offensive")
    if pred_train[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'X':x_, 'Y':y_, 'Z':z_, 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='X', y='Y', z='Z', color='Labels:')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 1,
        'colorscale' : 'rainbow',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

<h4>End of transition capture</h4> 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [None]:
centers_3 = kmeans_3.cluster_centers_
print(centers_3)

In [None]:
svns_off = []
for i in range(0,len(Y)):
    off = cosine(cl_norm[i], centers_3[0])/2
    svns_off.append(1-off)
print(len(svns_off))

In [None]:
svns_noff = []
for i in range(0,len(Y)):
    noff = cosine(cl_norm[i], centers_3[1])/2
    svns_noff.append(1-noff)
print(len(svns_noff))

In [None]:
svns_neu = []
for i in range(0,len(Y)):
    neu = cosine(cl_norm[i], centers_3[2])/2
    svns_neu.append(1-neu)
print(len(svns_neu))

<h5>k-means Dense 3 plot</h5>

In [None]:
pred_colour = []
for i in range(0,len(Y)):
    if y_kmeans_3[i] == 2:
      pred_colour.append("Neutral")
    if y_kmeans_3[i] == 1:
      pred_colour.append("Not Offensive")
    if y_kmeans_3[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'SVNS Offensive':svns_off, 'SVNS Not Offensive':svns_noff, 'SVNS Neutral':svns_neu, 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='SVNS Offensive', y='SVNS Not Offensive', z='SVNS Neutral', color='Labels:')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 1,
        'colorscale' : 'viridis',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant', 'font_size':18}, font_size=15, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

In [None]:
pred_dbert = []
for i in range(0,len(Y_test)):
    if(svns_off[i] > svns_noff[i]):
        pred_dbert.append(0)
    else:
        pred_dbert.append(1)
print(classification_report(Y_test, pred_dbert, output_dict=False, digits=3))

In [None]:
con_mat = tf.math.confusion_matrix(labels=Y_test, predictions=pred_dbert)
print(con_mat)

<p> GMM model Dense 3</p>

In [None]:
gmm_3 = GaussianMixture(n_components=3, random_state = 44).fit(cl_norm)

In [None]:
mean_norm = gmm_3.means_
cov_norm = gmm_3.covariances_
print(np.shape(mean_norm))
print(np.shape(cov_norm))

In [None]:
labels_norm = gmm_3.predict(cl_norm)

In [None]:
flag_3 = []
count = 0

x_ = []
y_ = []
z_ = []


for i in range(0,len(X)):
    count = count + 1
    x_.append(cl_norm[i][2])
    y_.append(cl_norm[i][1])
    z_.append(cl_norm[i][0])

    if( answer_train[i] > 0.28 and answer_train[i] < 0.8 ):
        flag_3.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.28 ):
        flag_3.append(0)

    if( answer_train[i] >= 0.8 and answer_train[i] < 1 ):
        flag_3.append(1)

print(count)

In [None]:
Counter(labels_norm)

In [None]:
Counter(flag_3)

In [None]:
# 1 index values are offensive
# 0 index values are not offensive
# 2 index values are neutral

count = 0
for i in range(0,len(flag_3)):
      if flag_3[i] == 2 and labels_norm[i] == 0:
        count = count + 1
print(count)

In [None]:
for i in range(0,len(flag_3)):
    if(labels_norm[i] == 0):
      labels_norm[i] = 2
    elif(labels_norm[i] == 1):
      labels_norm[i] = 1
    else:
      labels_norm[i] = 0

In [None]:
flag_3 = []
count = 0

x_ = []
y_ = []
z_ = []


for i in range(0,len(X)):
    count = count + 1
    x_.append(cl_norm[i][2])
    y_.append(cl_norm[i][1])
    z_.append(cl_norm[i][0])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag_3.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag_3.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag_3.append(1)

print(count)

In [None]:
con_mat = tf.math.confusion_matrix(labels=flag_3, predictions=labels_norm)
print(con_mat)

In [None]:
import sklearn
print(sklearn.metrics.classification_report(flag_3, labels_norm, output_dict=False, digits=3))

In [None]:
prob_norm = gmm_3.predict_proba(cl_norm)
prob_norm = prob_norm.T

<h5> GMM dense 3 plot</h5>

In [None]:
pred_colour = []
for i in range(0,len(Y)):
    if labels_norm[i] == 2:
      pred_colour.append("Neutral")
    if labels_norm[i] == 1:
      pred_colour.append("Not Offensive")
    if labels_norm[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'SVNS Offensive':prob_norm[2], 'SVNS Not Offensive':prob_norm[1], 'SVNS Neutral':prob_norm[0], 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='SVNS Offensive', y='SVNS Not Offensive', z='SVNS Neutral', color='Labels:')
fig.update_traces(
    marker={
        'size': 1.5,
        'opacity': 1,
        'colorscale' : 'viridis',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant', 'font_size':18}, font_size=15, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

<h3>Dense 3 layer end</h3>

<h3>Batch Norm layer</h3>

In [None]:
model.layers[-4].name

In [None]:
cluster_32 = keras.Model(inputs=model.input, outputs=model.layers[-4].output)
with strategy.scope():
    cl_32 = cluster_32.predict([test_encodings["input_ids"], test_encodings["attention_mask"]])

In [None]:
flag_32 = []
count = 0

x_32 = []
y_32 = []
z_32 = []


for i in range(0,len(X)):
    count = count + 1
    x_32.append(cl_32[i][0])
    y_32.append(cl_32[i][1])
    z_32.append(cl_32[i][2])

    if( answer_train[i] > 0.28 and answer_train[i] < 0.8 ):
        flag_32.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.28 ):
        flag_32.append(0)

    if( answer_train[i] >= 0.8 and answer_train[i] < 1 ):
        flag_32.append(1)

print(count)

<p>k-means BatchNorm</p>

In [None]:
kmeans_32 = KMeans(n_clusters=3, random_state=44).fit(cl_32)
y_kmeans_32 = kmeans_32.predict(cl_32)

In [None]:
Counter(y_kmeans_32)

In [None]:
Counter(flag_32)

In [None]:
# 2 index values are offensive
# 0 index values are not offensive
# 0 index values are neutral

count = 0
for i in range(0,len(y_kmeans_32)):
      if flag_32[i] == 0 and y_kmeans_32[i] == 0:
        count = count + 1
print(count)

In [None]:
for i in range(0,len(y_kmeans_32)):
    if(y_kmeans_32[i] == 0):
      y_kmeans_32[i] = 0
    elif(y_kmeans_32[i] == 1):
      y_kmeans_32[i] = 1
    else:
      y_kmeans_32[i] = 2

In [None]:
flag_32 = []
count = 0

x_32 = []
y_32 = []
z_32 = []


for i in range(0,len(X)):
    count = count + 1
    x_32.append(cl_32[i][0])
    y_32.append(cl_32[i][1])
    z_32.append(cl_32[i][2])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag_32.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag_32.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag_32.append(1)

print(count)

In [None]:
con_mat = tf.math.confusion_matrix(labels=flag_32, predictions=y_kmeans_32)
print(con_mat)

In [None]:
import sklearn
print(sklearn.metrics.classification_report(flag_32, y_kmeans_32, output_dict=False, digits=3))

In [None]:
centers_32 = kmeans_32.cluster_centers_

In [None]:
svns_off = []
for i in range(0,len(Y_test)):
    off = cosine(cl_32[i], centers_32[0])/2
    svns_off.append(1-off)
print(len(svns_off))

860


In [None]:
svns_noff = []
for i in range(0,len(Y_test)):
    noff = cosine(cl_32[i], centers_32[1])/2
    svns_noff.append(1-noff)
print(len(svns_noff))

860


In [None]:
svns_neu = []
for i in range(0,len(Y_test)):
    neu = cosine(cl_32[i], centers_32[2])/2
    svns_neu.append(1-neu)
print(len(svns_neu))

860


<p>k-means BatchNorm Plot</p>

In [None]:
pred_colour = []
for i in range(0,len(y_kmeans_bert)):
    if y_kmeans_32[i] == 2:
      pred_colour.append("Neutral")
    if y_kmeans_32[i] == 1:
      pred_colour.append("Not Offensive")
    if y_kmeans_32[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'SVNS Offensive':svns_off, 'SVNS Not Offensive':svns_noff, 'SVNS Neutral':svns_neu, 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='SVNS Offensive', y='SVNS Not Offensive', z='SVNS Neutral', color='Labels:')
fig.update_traces(
    marker={
        'size': 1,
        'opacity': 1,
        'colorscale' : 'viridis',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

In [None]:
pred_BNBert = []
for i in range(0,len(Y_test)):
    if(svns_off[i] > svns_noff[i]):
        pred_BNBert.append(0)
    else:
        pred_BNBert.append(1)
print(classification_report(Y_test, pred_BNBert, output_dict=False, digits=3))

In [None]:
con_mat = tf.math.confusion_matrix(labels=Y_test, predictions=pred_BNBert)
print(con_mat)

<p> GMM Model BatchNorm</p>

In [None]:
gmm_32 = GaussianMixture(n_components=3, random_state = 44).fit(cl_32)

In [None]:
mean_32 = gmm_32.means_
cov_32 = gmm_32.covariances_
print(np.shape(mean_32))
print(np.shape(cov_32))

In [None]:
labels_32 = gmm_32.predict(cl_32)

In [None]:
flag_32 = []
count = 0

x_32 = []
y_32 = []
z_32 = []


for i in range(0,len(X)):
    count = count + 1
    x_32.append(cl_32[i][0])
    y_32.append(cl_32[i][1])
    z_32.append(cl_32[i][2])

    if( answer_train[i] > 0.28 and answer_train[i] < 0.8 ):
        flag_32.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.28 ):
        flag_32.append(0)

    if( answer_train[i] >= 0.8 and answer_train[i] < 1 ):
        flag_32.append(1)

print(count)

In [None]:
Counter(flag_32)

In [None]:
# 1 index values are offensive
# 0 index values are not offensive
# 2 index values are neutral

count = 0
for i in range(0,len(flag_32)):
      if flag_32[i] == 1 and labels_32[i] == 1:
        count = count + 1
print(count)

In [None]:
for i in range(0,len(flag_32)):
    if(labels_32[i] == 0):
      labels_32[i] = 2
    elif(labels_32[i] == 1):
      labels_32[i] = 1
    else:
      labels_32[i] = 0

In [None]:
flag_32 = []
count = 0

x_32 = []
y_32 = []
z_32 = []


for i in range(0,len(X)):
    count = count + 1
    x_32.append(cl_32[i][0])
    y_32.append(cl_32[i][1])
    z_32.append(cl_32[i][2])

    if( answer_train[i] > 0.3 and answer_train[i] < 0.7 ):
        flag_32.append(2)
    
    if( answer_train[i] > 0 and answer_train[i] <= 0.3 ):
        flag_32.append(0)

    if( answer_train[i] >= 0.7 and answer_train[i] < 1 ):
        flag_32.append(1)

print(count)

In [None]:
con_mat = tf.math.confusion_matrix(labels=flag_32, predictions=labels_32)
print(con_mat)

In [None]:
import sklearn
print(sklearn.metrics.classification_report(flag_32, labels_32, output_dict=False, digits=3))

In [None]:
prob_32 = gmm_32.predict_proba(cl_32)
prob_32 = prob_32.T

<p>GMM BatchNorm Plot</p>

In [None]:
pred_colour = []
for i in range(0,len(y_kmeans_bert)):
    if labels_32[i] == 2:
      pred_colour.append("Neutral")
    if labels_32[i] == 1:
      pred_colour.append("Not Offensive")
    if labels_32[i] == 0:
      pred_colour.append("Offensive")

test_df = pd.DataFrame({'SVNS Offensive':prob_32[2], 'SVNS Not Offensive':prob_32[1], 'SVNS Neutral':prob_32[0], 'Labels:':pred_colour})

fig = px.scatter_3d(test_df, x='SVNS Offensive', y='SVNS Not Offensive', z='SVNS Neutral', color='Labels:')
fig.update_traces(
    marker={
        'size': 1.5,
        'opacity': 1,
        'colorscale' : 'viridis',
    }
)
fig.update_layout(legend= {'itemsizing': 'constant'}, font_size=14, scene_aspectmode='cube')
fig.update_layout(width = 850, height = 750)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))