# In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences  
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Flatten, Embedding, Dropout
from tensorflow.keras.layers import Conv1D, MaxPool1D, GlobalMaxPooling1D, MaxPooling1D
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import preprocess_kgptalkie as ps
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from keras.utils import np_utils
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

##########################
# Load the labelled tweets; the first CSV column is used as the row index.
df = pd.read_csv('labeled_data.csv',index_col=0)
df
##########################
# Balance the dataset by down-sampling every class to the size of the
# smallest one.
vc = df['class'].value_counts()
index = list(vc.index)
count = min(vc.values)
# Draw all per-class samples first and concatenate once, rather than growing
# a DataFrame inside the loop (each incremental concat copies everything
# accumulated so far).
df_bal = pd.concat(
    [df[df['class'] == i].sample(count) for i in index],
    ignore_index=True,
)

df = df_bal.copy()
df
##########################
# Pie chart of the class distribution after balancing.
class_counts = df['class'].value_counts()  # computed once, reused for ids and sizes
labels = class_counts.index.tolist()
values = class_counts.values.tolist()

# Display names for class ids 0 and 1; every other id is shown as "Neither".
label_names = {0: "Hate speech", 1: "Offensive language"}
new_labels = [label_names.get(label, "Neither") for label in labels]

plt.pie(values, labels=new_labels, autopct='%1.1f%%')
plt.show()
##########################
from bs4 import BeautifulSoup
def remove_html_tags(x):
    """Return the text content of *x* as parsed by BeautifulSoup's
    ``html.parser``, with leading and trailing whitespace stripped."""
    soup = BeautifulSoup(x, 'html.parser')
    plain_text = soup.get_text()
    return plain_text.strip()
def get_clean(x):
    """Normalise one tweet for tokenisation.

    The input is converted to a lower-case string, backslashes are removed and
    underscores become spaces. The text is then passed through the
    ``preprocess_kgptalkie`` helpers ``cont_exp``, ``remove_emails``,
    ``remove_urls``, ``remove_rt``, ``remove_accented_chars`` and
    ``remove_special_chars`` in that order. Finally, any character repeated
    three or more times in a row is collapsed to a single occurrence.
    """
    text = str(x).lower()
    text = text.replace('\\', '').replace('_', ' ')

    pipeline = (
        ps.cont_exp,
        ps.remove_emails,
        ps.remove_urls,
        ps.remove_rt,
        ps.remove_accented_chars,
        ps.remove_special_chars,
    )
    for step in pipeline:
        text = step(text)

    # "sooooo" -> "so": runs of 3+ identical characters shrink to one.
    return re.sub(r"(.)\1{2,}", r"\1", text)
# Strip HTML from every tweet first, then run the text normaliser over the result.
df['tweet'] = df['tweet'].apply(remove_html_tags).apply(get_clean)
##########################
# Collect the cleaned tweets as a plain Python list
text = df['tweet'].tolist()
# Turn every tweet into a sequence of integer word ids
token = Tokenizer()
token.fit_on_texts(text)
# One slot more than the number of distinct words seen by the tokenizer.
vocab_size = len(token.word_counts) + 1
encoded_text = token.texts_to_sequences(text)
print(encoded_text)

# Pad every sequence at the end to a fixed length of max_length ids.
max_length = 120
X = pad_sequences(encoded_text, maxlen=max_length, padding = 'post')
# One-hot encode the class ids. The previous intermediate `y = df['class']`
# was overwritten immediately and has been dropped.
y = np_utils.to_categorical(df['class'])
# 80/20 split, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)
X_train.shape, X_test.shape
# Width of each learned word-embedding vector.
vec_size = 300

# Same layer stack as before, declared in one list instead of repeated add() calls.
model = Sequential([
    Embedding(vocab_size, vec_size, input_length=max_length),

    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(2),
    Dropout(0.2),

    Dense(32, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(3, activation='softmax'),  # one output per class
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): the test split is also passed as validation data here.
history = model.fit(
    X_train,
    y_train,
    epochs=2,
    validation_data=(X_test, y_test),
    shuffle=True,
)
# Score the trained CNN on the test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Reduce the predicted score rows and the one-hot targets to class ids.
y_pred = np.argmax(model.predict(X_test), axis=-1)
y_test_ids = np.argmax(y_test, axis=-1)

# At this point plot_confusion_matrix is still the function imported from
# mlxtend.plotting; the same name is redefined further down in the file.
test_confusion = confusion_matrix(y_test_ids, y_pred)
plot_confusion_matrix(test_confusion)
print(classification_report(y_test_ids, y_pred))

import itertools

# Tick labels for the confusion-matrix axes, in class-id order
class_labels = ["Hate", "Offensive", "Neither"]

# Predict on the test split, then build the confusion matrix from class ids
test_scores = model.predict(X_test)
y_pred = np.argmax(test_scores, axis=-1)
test_true_ids = np.argmax(y_test, axis=-1)
cm = confusion_matrix(test_true_ids, y_pred)

# Function to plot confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    NOTE: this definition rebinds the name ``plot_confusion_matrix`` that the
    file imports from ``mlxtend.plotting``; calls made after this point use
    this version.

    Args:
        cm: Square confusion matrix (array-like), rows = true class,
            columns = predicted class.
        classes: Tick labels, one per class, in matrix order.
        normalize: If True, each row is divided by its sum so cells show the
            fraction of that true class; otherwise raw counts are shown.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.

    Returns:
        None. The plot is drawn through ``plt``; the caller is responsible
        for calling ``plt.show()``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A true class with no samples has a zero row sum; leave that row at
        # 0.0 instead of dividing by zero and filling the plot with NaN.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else '.0f'  # Format as decimal fraction if normalize=True
    # Cells above half the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Plot confusion matrix
plot_confusion_matrix(cm, classes=class_labels, normalize=True)  # Set normalize=True to display decimal fractions
plt.show()

# Training Results

# Class ids for the training targets and for the model's predictions on the
# training set. The model is run once and the result is reused for the
# accuracy, the report and the confusion matrix (previously predict() was
# called a second time to produce the same y_train_pred).
y_train_true = np.argmax(y_train, axis=-1)
y_train_pred = np.argmax(model.predict(X_train), axis=-1)

# Calculate training set accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Generate classification report for training set
train_report = classification_report(y_train_true, y_train_pred)
print("Training Classification Report:")
print(train_report)


# Generate confusion matrix for training set

# Define class labels
class_labels = ["Hate", "Offensive", "Neither"]

# Compute confusion matrix
train_cm = confusion_matrix(y_train_true, y_train_pred)

# Function to plot confusion matrix
def plot_confusion_matrix(train_cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    This redefines the function of the same name declared earlier in this
    file; the body is equivalent and only the first parameter is named
    ``train_cm``.

    Args:
        train_cm: Square confusion matrix (array-like), rows = true class,
            columns = predicted class.
        classes: Tick labels, one per class, in matrix order.
        normalize: If True, show each cell as a fraction of its row total;
            otherwise show raw counts.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.

    Returns:
        None. The caller is responsible for calling ``plt.show()``.
    """
    matrix = np.asarray(train_cm)
    if normalize:
        totals = matrix.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0.0 rather than becoming NaN through a
        # division by zero.
        matrix = np.divide(
            matrix.astype('float'),
            totals,
            out=np.zeros(matrix.shape, dtype=float),
            where=totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else '.0f'  # Format as decimal fraction if normalize=True
    # Switch the annotation colour at half the maximum cell value.
    thresh = matrix.max() / 2.
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        text_colour = "white" if matrix[i, j] > thresh else "black"
        plt.text(j, i, format(matrix[i, j], fmt),
                 horizontalalignment="center",
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Draw the row-normalised training confusion matrix, then display the figure.
plot_confusion_matrix(train_cm, class_labels, normalize=True)
plt.show()

# Keyword lexicons used to grade hate / offensive tweets into three levels.
# Within each dict the levels are listed from most to least severe, and the
# classifier functions walk them in this order.

# Offensive-language keywords, keyed by offensive level
offensive_words = {
    'very_offensive': [
        'abortion', 'ass', 'bastard', 'bitch', 'cunt',
        'damn', 'faggot', 'nigger', 'rape', 'slut',
        'whore', 'white power', 'you suck', 'shut up', 'suck my dick',
        'suck it', 'fuck you', 'fuck off', 'go to hell', 'die',
    ],
    'moderately_offensive': [
        'idiot', 'moron', 'retard', 'stupid', 'dumb',
        'loser', 'gay', 'ugly', 'fat', 'douchebag',
        'creep', 'pussy', 'dick', 'piss off', 'leave me alone',
        'get lost', 'not interested', 'no thanks', 'whatever', 'who cares',
    ],
    'little_offensive': [
        'jerk', 'fool', 'wimp', 'weenie', 'dork',
        'nerd', 'geek', 'awkward', 'weird', 'annoying',
        'bothersome', 'irritating', 'hate', 'dislike', 'boring',
        'tedious', 'not fun', 'blah', 'whatever', 'meh',
    ],
}

# Hate-speech keywords, keyed by hate-speech level
hate_speech_words = {
    'danger_hate_speech': [
        'kill', 'hate', 'racist', 'terrorist', 'genocide',
        'niggah', 'nigger', 'exterminate', 'eliminate', 'eradicate',
        'annihilate', 'lynch', 'hang', 'burn', 'shoot',
        'murder', 'rape',
    ],
    'moderately_hate_speech': [
        'bigot', 'prejudice', 'homophobia', 'islamophobia', 'antisemitism',
        'discrimination', 'intolerance', 'hatred', 'hostility', 'oppression',
        'supremacist', 'nationalist', 'segregation', 'separatism',
    ],
    'poor_hate_speech': [
        'offend', 'disrespect', 'insult', 'belittle', 'mock',
        'ridicule', 'scorn', 'taunt', 'tease', 'joke',
        'laugh', 'smear', 'troll', 'bully', 'harass',
        'threaten',
    ],
}

# Moderation action attached to each hate-speech level.
_HATE_ACTIONS = {
    'danger_hate_speech': "Block account completely",
    'moderately_hate_speech': "Block account for a while",
    'poor_hate_speech': "Send a reminder",
}

# Moderation action attached to each offensive-language level.
_OFFENSIVE_ACTIONS = {
    'very_offensive': "Remove account",
    'moderately_offensive': "Block the ability to edit posts",
    'little_offensive': "Block account for a while",
}


def _first_flagged_level(tweet, lexicon, actions):
    """Return ``(level, action)`` for the first lexicon entry found in *tweet*.

    Levels and their words are tried in the lexicon's own order. Matching is
    case-insensitive and on whole words or phrases, with an optional plural
    ``s``/``es``, so a keyword no longer fires when it merely occurs inside a
    longer word (e.g. "ass" inside "class"). Levels that have no entry in
    *actions* are skipped. Returns None when nothing matches.
    """
    text = tweet.lower()  # lower-case once instead of once per keyword
    for level, words in lexicon.items():
        action = actions.get(level)
        if action is None:
            continue
        for word in words:
            pattern = r"\b" + re.escape(word) + r"(?:e?s)?\b"
            if re.search(pattern, text):
                return level, action
    return None


def classify_Hate_tweet(tweet, label, hate_speech_words):
    """Grade a tweet that the model labelled as hate speech (class 0).

    Args:
        tweet: Raw tweet text.
        label: Predicted class id; only 0 is graded here.
        hate_speech_words: Mapping of hate-speech level -> list of keywords.

    Returns:
        None when *label* is not 0. Otherwise ``(level, action)`` for the
        first matching level, or ``('Not a hate speech tweet', None)`` when
        no keyword matches.
    """
    if label != 0:
        return None
    match = _first_flagged_level(tweet, hate_speech_words, _HATE_ACTIONS)
    if match is None:
        return 'Not a hate speech tweet', None  # Return None for action if no match is found
    return match


def classify_Off_tweet(tweet, label, offensive_words):
    """Grade a tweet that the model labelled as offensive language (class 1).

    Args:
        tweet: Raw tweet text.
        label: Predicted class id; only 1 is graded here.
        offensive_words: Mapping of offensive level -> list of keywords.

    Returns:
        None when *label* is not 1. Otherwise ``(level, action)`` for the
        first matching level, or ``('Not an offensive tweet', None)`` when no
        keyword matches.
    """
    if label != 1:
        return None
    match = _first_flagged_level(tweet, offensive_words, _OFFENSIVE_ACTIONS)
    if match is None:
        return 'Not an offensive tweet', None  # Return None for action if no match is found
    return match
    
##########################

# Testing with custom data

def get_encoded(x):
    """Convert one raw text into the padded id sequence fed to the model.

    The text goes through ``remove_html_tags`` and ``get_clean``, is mapped to
    word ids by the fitted module-level ``token``, and is padded at the end to
    ``max_length``. The result is a batch holding that single sequence.
    """
    cleaned = get_clean(remove_html_tags(x))
    sequences = token.texts_to_sequences([cleaned])
    return pad_sequences(sequences, maxlen=max_length, padding='post')


# Text to classify. `x` was previously used here without ever being assigned,
# which raised NameError; replace this sample with the input to test.
x = "Have a wonderful day everyone"

# Predicted class id for the input. The model is run once; the earlier
# duplicate predict call, whose result was discarded, has been dropped.
result = np.argmax(model.predict(get_encoded(x)), axis=-1)[0]
print(result)


# Each classifier returns None unless `result` is the class it handles.
hate_speech_result = classify_Hate_tweet(x, result, hate_speech_words)
offensive_result = classify_Off_tweet(x, result, offensive_words)

# A result counts as flagged only if a keyword level was actually matched.
# The "no match" sentinel is now filtered for hate speech as well, mirroring
# the offensive branch, so such tweets fall through to "No action needed"
# instead of printing a level with action None.
hate_flagged = (
    hate_speech_result is not None
    and hate_speech_result[0] != 'Not a hate speech tweet'
)
offensive_flagged = (
    offensive_result is not None
    and offensive_result[0] != 'Not an offensive tweet'
)

if hate_flagged:
    hate_speech_level, hate_speech_action = hate_speech_result
    print("Hate Speech Level:", hate_speech_level)
    print("Action to Take:", hate_speech_action)

if offensive_flagged:
    offensive_level, offensive_action = offensive_result
    print("Offensive Level:", offensive_level)
    print("Action to Take:", offensive_action)

if not (hate_flagged or offensive_flagged):
    print("No action needed")