# In [None]:

# LSTM Model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
import nltk
import string
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import tensorflow as tf
from tensorflow import keras
from keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fetch the NLTK resources used further down: the stopword corpus read by
# stopwords.words('english') and the WordNet data (presumably what
# WordNetLemmatizer needs; 'omw-1.4' is its multilingual companion).
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
# Suppress every Python warning for the remainder of the run.
warnings.filterwarnings('ignore')

####################
# Load the labelled tweets; the code below reads the 'tweet' and 'class' columns.
df = pd.read_csv('labeled_data.csv')
df.head()  # preview only; has no effect outside an interactive/notebook session
####################

# Lower case all the words of the tweet before any preprocessing
df['tweet'] = df['tweet'].str.lower()
# Removing punctuations present in the text
punctuations_list = string.punctuation
# Built once: the table only depends on punctuations_list, so there is no
# reason to rebuild it for every tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return *text* with every ASCII punctuation character removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without any character from ``string.punctuation``.
        All other characters, including whitespace, are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet, then preview the cleaned frame
df['tweet'] = df['tweet'].apply(remove_punctuations)
df.head()
#########################################
# Built once at definition time: a frozenset gives constant-time membership
# tests and avoids reloading the stopword corpus for every tweet.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
# A single shared lemmatizer instead of one new instance per word.
_LEMMATIZER = WordNetLemmatizer()


def remove_stopwords(text):
    """Drop English stopwords from *text* and lemmatize the remaining words.

    Args:
        text: The text to clean. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        The kept words, lemmatized, joined by single spaces.
    """
    # The lemmatized form is what gets stored; previously the result of
    # lemmatize() was thrown away and the raw word was appended instead.
    imp_words = [
        _LEMMATIZER.lemmatize(word)
        for word in str(text).split()
        if word not in _ENGLISH_STOP_WORDS
    ]
    return " ".join(imp_words)


# Run the stopword filter over every tweet, then preview the result
df['tweet'] = df['tweet'].apply(remove_stopwords)
df.head()

##################################################

def plot_word_cloud(data, typ):
    """Display a word cloud built from the tweets in *data*.

    Args:
        data: DataFrame whose 'tweet' column holds the (string) tweets.
        typ: Name of the subset being plotted; only used in the plot title.
    """
    # Joining all the tweets to get the corpus
    tweet_corpus = " ".join(data['tweet'])

    plt.figure(figsize=(10, 10))

    # Forming the word cloud
    wc = WordCloud(max_words=100,
                   width=200,
                   height=100,
                   collocations=False).generate(tweet_corpus)

    # Plotting the wordcloud obtained above.
    # The title now says "tweets": the corpus is the 'tweet' column, not emails.
    plt.title(f'WordCloud for {typ} tweets.', fontsize=15)
    plt.axis('off')
    plt.imshow(wc)
    plt.show()
    print()

plot_word_cloud(df[df['class']==2], typ='Neither')

###########################################

# Rebalance the classes: keep class 2 as is, down-sample class 1 to 3500 rows
# and repeat class 0 three times.
class_2 = df[df['class'] == 2]
# Seeded so that reruns draw the same rows; the train/test split further down
# is seeded with 22 as well.
class_1 = df[df['class'] == 1].sample(n=3500, random_state=22)
class_0 = df[df['class'] == 0]

# NOTE(review): class 0 is tripled by duplicating rows *before* the
# train/test split, so identical tweets can end up in both splits.
# Consider oversampling only the training portion.
balanced_df = pd.concat([class_0, class_0, class_0, class_1, class_2], axis=0)

# Count once and reuse for both the wedge sizes and their labels.
class_counts = balanced_df['class'].value_counts()
plt.pie(class_counts.values,
        labels=class_counts.index,
        autopct='%1.1f%%')
plt.show()

##############################################

features = balanced_df['tweet']
target = balanced_df['class']

X_train, X_test, Y_train, Y_test = train_test_split(features,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=22)
X_train.shape, X_test.shape


# One-hot encode the labels.
# - dtype is set explicitly so the targets are numeric regardless of the
#   pandas default (recent pandas versions produce boolean dummies).
# - both frames are reindexed to the same ordered class list, so train and
#   test columns line up even if one split happens to miss a class.
class_ids = sorted(target.unique())
Y_train = pd.get_dummies(Y_train, dtype='float32').reindex(columns=class_ids,
                                                          fill_value=0.0)
Y_test = pd.get_dummies(Y_test, dtype='float32').reindex(columns=class_ids,
                                                        fill_value=0.0)
Y_train.shape, Y_test.shape


max_words = 5000
max_len = 100

# training the tokenizer (on the training text only; it was previously built
# and fit twice with identical arguments, the first result being discarded)
token = Tokenizer(num_words=max_words,
                  lower=True,
                  split=' ')
token.fit_on_texts(X_train)

# Turn each tweet into a sequence of word indices, padded/truncated at the
# end to exactly max_len entries.
Training_seq = token.texts_to_sequences(X_train)
Training_pad = pad_sequences(Training_seq,
                             maxlen=max_len,
                             padding='post',
                             truncating='post')

Testing_seq = token.texts_to_sequences(X_test)
Testing_pad = pad_sequences(Testing_seq,
                            maxlen=max_len,
                            padding='post',
                            truncating='post')
##############################

# Text classifier: 32-dimensional embeddings over a max_words-sized
# vocabulary, a bidirectional LSTM (16 units per direction), a 512-unit ReLU
# layer with L1 kernel regularization, batch normalization, 30% dropout and a
# 3-way softmax output.
model = keras.models.Sequential([
	layers.Embedding(max_words, 32, input_length=max_len),
	layers.Bidirectional(layers.LSTM(16)),
	layers.Dense(512, activation='relu', kernel_regularizer='l1'),
	layers.BatchNormalization(),
	layers.Dropout(0.3),
	layers.Dense(3, activation='softmax')
])

# categorical_crossentropy matches the softmax output and the one-hot encoded
# targets produced earlier with pd.get_dummies.
model.compile(loss='categorical_crossentropy',
			optimizer='adam',
			metrics=['accuracy'])

model.summary()


######################

# Draw the layer graph annotated with shapes, dtypes and activations.
keras.utils.plot_model(
	model,
	show_shapes=True,
	show_dtype=True,
	show_layer_activations=True
)

########################################

from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop when validation accuracy has not improved for 3 epochs and roll the
# model back to its best weights.
es = EarlyStopping(patience=3,
                   monitor='val_accuracy',
                   restore_best_weights=True)

# Halve the learning rate when validation loss has stalled for 2 epochs.
lr = ReduceLROnPlateau(patience=2,
                       monitor='val_loss',
                       factor=0.5,
                       verbose=0)

# Plain float array for the targets, so fit() does not depend on the dtype
# pd.get_dummies happened to produce.
train_targets = np.asarray(Y_train, dtype='float32')

# Validate on a held-out slice of the training data instead of the test set.
# Both callbacks act on the validation metrics, so validating on the test set
# would leak it into model selection and make the reported test score
# optimistic.
history = model.fit(Training_pad, train_targets,
                    validation_split=0.1,
                    epochs=10,
                    verbose=1,
                    batch_size=32,
                    callbacks=[lr, es])


# Learning curves: loss and accuracy for the training and validation data.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()
plt.show()
# Evaluate LSTM model accuracy
loss, accuracy = model.evaluate(Testing_pad, Y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
from sklearn.metrics import classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Predict the test set once and reuse the result; the forward pass and the
# confusion matrix were previously computed twice with identical inputs.
y_pred = np.argmax(model.predict(Testing_pad), axis=-1)
# Convert the one-hot frame to a plain array before taking the argmax, so the
# result is an ordinary 1-D array of class indices.
y_true = np.asarray(Y_test).argmax(axis=-1)

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(cm)
###################
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

# Define class labels
class_labels = ["Hate", "Offensive", "Neither"]

# Function to plot confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    The caller is responsible for calling ``plt.show()`` afterwards.

    Args:
        cm: Square confusion matrix (rows = true labels, columns = predicted).
        classes: Tick labels, one per class, in matrix order.
        normalize: If True, divide every row by its sum so that cells show
            the fraction of each true class; otherwise show raw counts.
        title: Plot title.
        cmap: Matplotlib colormap used for the cells.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has an all-zero row. Dividing by 1
        # keeps that row at 0 instead of producing NaN cells (which would
        # also turn the colour threshold below into NaN).
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else '.0f'  # Format as decimal fraction if normalize=True
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Plot confusion matrix
plot_confusion_matrix(cm, classes=class_labels, normalize=True)  # Set normalize=True to display decimal fractions
plt.show()
print(classification_report(np.asarray(Y_test).argmax(axis=-1), y_pred))
##############################
# Training Results
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Get predictions for training set. This is done once and reused for the
# accuracy, the report and the confusion matrix; the forward pass over the
# whole training set used to be run twice.
y_train_pred = np.argmax(model.predict(Training_pad), axis=-1)
# True class indices, taken from the one-hot training targets.
y_train_true = np.asarray(Y_train).argmax(axis=-1)

# Calculate training set accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print("Training Accuracy:", train_accuracy)
# Generate classification report for training set
train_report = classification_report(y_train_true, y_train_pred)
print("Training Classification Report:")
print(train_report)
###############################
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

# Define class labels
class_labels = ["Hate", "Offensive", "Neither"]

# Compute confusion matrix
train_cm = confusion_matrix(y_train_true, y_train_pred)

# Function to plot confusion matrix
def plot_confusion_matrix(train_cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    This redefinition shadows the earlier function of the same name in this
    file. The caller is responsible for calling ``plt.show()`` afterwards.

    Args:
        train_cm: Square confusion matrix (rows = true labels, columns =
            predicted labels).
        classes: Tick labels, one per class, in matrix order.
        normalize: If True, divide every row by its sum so that cells show
            the fraction of each true class; otherwise show raw counts.
        title: Plot title.
        cmap: Matplotlib colormap used for the cells.
    """
    train_cm = np.asarray(train_cm)
    if normalize:
        row_totals = train_cm.sum(axis=1, keepdims=True)
        # A class with no true samples has an all-zero row. Dividing by 1
        # keeps that row at 0 instead of producing NaN cells (which would
        # also turn the colour threshold below into NaN).
        row_totals[row_totals == 0] = 1
        train_cm = train_cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(train_cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else '.0f'  # Format as decimal fraction if normalize=True
    # Cells darker than half the maximum get white text for contrast.
    thresh = train_cm.max() / 2.
    for i, j in itertools.product(range(train_cm.shape[0]), range(train_cm.shape[1])):
        plt.text(j, i, format(train_cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if train_cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Show the training confusion matrix with each row scaled to fractions
plot_confusion_matrix(train_cm, class_labels, normalize=True)
plt.show()