In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


# Load the tweet dataset; the 'Label' column is kept separately as the
# classification target.
tweets = pd.read_csv(filepath_or_buffer="Data_With_Profiles.csv")
labels = tweets.loc[:, 'Label']

# Text normalisation applied to every tweet before it is tokenised
def preprocess_text(text):
    """Normalise one tweet for tokenisation by lower-casing it.

    Args:
        text: Raw tweet text. Missing values (``None`` or a float NaN, which
            is what pandas produces for empty CSV cells) are treated as empty
            text; any other non-string value is converted with ``str`` first.

    Returns:
        str: The lower-cased text, or ``""`` when the value is missing.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()

def f1_score(y_true, y_pred):
    """F1 metric computed with Keras backend ops over the tensors passed in.

    Each tensor is clipped to [0, 1] and rounded before being summed, so the
    counts below are totals over every element of the inputs.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor, multiplied element-wise with ``y_true``.

    Returns:
        Scalar tensor: ``2 * P * R / (P + R + epsilon)``, where precision and
        recall each have ``K.epsilon()`` added to their denominator to avoid
        division by zero.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round each entry, then add everything up.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    actual_total = _rounded_total(y_true)
    predicted_total = _rounded_total(y_pred)

    precision = hits / (predicted_total + K.epsilon())
    recall = hits / (actual_total + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())


# Run every tweet through preprocess_text before building the vocabulary
preprocessed_tweets = list(map(preprocess_text, tweets['Tweet_Text']))

# Vocabulary cap and padded sequence length, reused by the model and by every
# later tokenisation step
max_vocab_size = 10000
max_sequence_length = 200

# Fit the vocabulary on the training corpus, encode each tweet as a list of
# word indices, then pad/truncate to a fixed length
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_vocab_size)
tokenizer.fit_on_texts(preprocessed_tweets)
sequences = tokenizer.texts_to_sequences(preprocessed_tweets)
padded_sequences = pad_sequences(sequences=sequences, maxlen=max_sequence_length)

# Hold out 20% of the padded sequences for testing (fixed seed for a
# repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

# One-hot encode both label splits; the width is the number of distinct labels
num_classes = len(np.unique(labels))
y_train_one_hot, y_test_one_hot = (
    to_categorical(split_labels, num_classes=num_classes)
    for split_labels in (y_train, y_test)
)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs (same seed as the train/test split).
tf.keras.utils.set_random_seed(42)

# Create a GRU model using Keras: embedding -> two stacked GRU layers ->
# softmax over the classes
embedding_dim = 32

model = Sequential([
    Embedding(max_vocab_size, embedding_dim, input_length=max_sequence_length),
    # return_sequences=True hands the full sequence of states to the next GRU
    GRU(64, return_sequences=True),
    GRU(32),
    Dense(num_classes, activation='softmax')
])

# The metric objects get their own names so that the scalar results later
# unpacked from model.evaluate() into `precision` / `recall` do not replace
# them.
recall_metric = Recall()
precision_metric = Precision()

# Compile the model; the metric order here fixes the order of the values
# returned by model.evaluate()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', precision_metric, recall_metric, f1_score],
)


# Fit for 10 epochs, scoring the held-out split at the end of each epoch
history = model.fit(
    x=X_train,
    y=y_train_one_hot,
    epochs=10,
    validation_data=(X_test, y_test_one_hot),
)

# Per-epoch curves, kept for plotting
loss_values, val_loss_values, accuracy_values, val_accuracy_values = (
    history.history[key]
    for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy')
)

# Final scores on the held-out split; the values are unpacked in the order
# loss, then the metrics as listed in compile()
loss, accuracy, precision, recall, f1 = model.evaluate(x=X_test, y=y_test_one_hot)
print("Loss:", loss)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

In [None]:
# Visualise the per-epoch training and validation loss / accuracy curves

epochs = range(1, len(loss_values)+1)


def _plot_history_curves(epochs, train_values, val_values, metric_name):
    """Draw one metric's training and validation curves on a new figure.

    Args:
        epochs: X-axis values, one per recorded epoch.
        train_values: Per-epoch values measured on the training data.
        val_values: Per-epoch values measured on the validation data.
        metric_name: Name used in the title and as the y-axis label.
    """
    fig, ax = plt.subplots()
    # The colour is given only through the keyword: combining a 'b' format
    # string with color= makes Matplotlib warn about a redundant colour.
    ax.plot(epochs, train_values, color='red', label='Train')
    ax.plot(epochs, val_values, color='blue', label='Test')
    ax.set_title(f'Training and Validation {metric_name}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_name)
    ax.legend()
    plt.show()


_plot_history_curves(epochs, loss_values, val_loss_values, 'Loss')
_plot_history_curves(epochs, accuracy_values, val_accuracy_values, 'Accuracy')

In [None]:
# Try the trained model on hand-written tweets that are not in the dataset

new_tweets = [
    "Tum achi lg rhi ho",
    "Tum Pathan ho tumhen kya pta aqal kya hoti ha",
    "yahoodi saazish aurat march",
    "Tum hindu ho, tm log hote he haraami ho",
    "lo ji qadiani yahoodi agent any aik dafa phir baazi maarli musalmaan apni apni siyaasat kartay reeh gaey",
    "hijra ha ye",
    "randdi ki olaad",
    "chal tu mat bhaunk",
    "han theek ha",
    "bakwas band kr tu bhi chali ja",
    "ye chutiyaapa myth hi faila raha ha",
    "oye bahanchod yih kya chutiyaapa h",
    "you people just used shehnaz in whole season fck off bc",
]

# Same preprocessing, tokenizer and padding length as the training data
preprocessed_new_tweets = list(map(preprocess_text, new_tweets))
new_sequences = tokenizer.texts_to_sequences(preprocessed_new_tweets)
padded_new_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)

# One row of softmax outputs per tweet; the index of the largest entry is the
# predicted class id
predictions = model.predict(padded_new_sequences)
predicted_labels = predictions.argmax(axis=1)
print(predicted_labels)

# Display name for each class id
classes = {
    0: "Neutral",
    1: "Racism",
    2: "Abusive/Offensive",
    3: "Sexism",
    4: "Relegious Hate",
}

# Report each tweet next to the name of its predicted class
for tweet, label_id in zip(new_tweets, predicted_labels):
    print(f"Tweet: {tweet}")
    print(f"Predicted label: {classes[label_id]}")

In [None]:
# Abuser profile identification: score every user by the share of their
# tweets that the model classifies as bullying, then bucket the users.


csv_file = 'Data_With_Profiles.csv'

# Predicted class ids used for the per-user counts.
# NOTE(review): only classes 2 and 3 count as bullying; tweets predicted as
# class 1 or 4 are counted in neither bucket -- confirm this is intended.
NEUTRAL_CLASS = 0
BULLYING_CLASSES = (2, 3)

# Percentage of bullying tweets at or above which a user is labelled
# "bullying" / "suspected"; anything lower is "normal".
BULLYING_THRESHOLD = 60
SUSPECTED_THRESHOLD = 50

# user -> list of that user's tweet texts
user_tweets = {}

# user -> percentage of bullying tweets among the counted tweets
abuser_profile = {}

# newline='' lets the csv module handle line breaks inside quoted fields.
with open(csv_file, 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)

    # Skip header row if present
    next(csv_reader, None)

    for row in csv_reader:
        # Blank or truncated lines have no user column; skip them instead of
        # failing with IndexError.
        if len(row) < 3:
            continue
        # The tweet text is read from column 0 and the user from column 2.
        # NOTE(review): positional access -- confirm against the CSV header.
        user_tweets.setdefault(row[2], []).append(row[0])


# Classify each user's tweets and record the bullying percentage.
# The loop variable is deliberately not called `tweets`, so the DataFrame
# loaded at the top of the notebook is not overwritten.
for user, user_tweet_texts in user_tweets.items():
    # Tokenise the preprocessed text, i.e. the same pipeline as training.
    preprocessed_new_tweets = [preprocess_text(tweet) for tweet in user_tweet_texts]
    new_sequences = tokenizer.texts_to_sequences(preprocessed_new_tweets)
    padded_new_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)

    # verbose=0 keeps one progress bar per user out of the cell output.
    predictions = model.predict(padded_new_sequences, verbose=0)

    # Get the predicted class labels
    predicted_labels = np.argmax(predictions, axis=1)
    print(predicted_labels)

    nb_tweets = int(np.sum(predicted_labels == NEUTRAL_CLASS))
    b_tweets = int(np.isin(predicted_labels, BULLYING_CLASSES).sum())
    print(f"User: {user}, No of bullying tweets: {b_tweets}, No of Non Bullying: {nb_tweets}")

    # NOTE(review): users with at most one non-bullying tweet are left out of
    # the profile, even if they have many bullying tweets -- confirm that this
    # guard is meant to test nb_tweets rather than the total.
    if nb_tweets > 1:
        abuser_profile[user] = round((b_tweets/(nb_tweets + b_tweets)) * 100, 2)
print(abuser_profile)

# Bucket the profiled users by their bullying percentage.
busers = []
susers = []
nusers = []
for user, bullying_share in abuser_profile.items():
    if bullying_share >= BULLYING_THRESHOLD:
        busers.append(user)
    elif bullying_share >= SUSPECTED_THRESHOLD:
        susers.append(user)
    else:
        nusers.append(user)

# Totals are taken from the bucket sizes, so they cover every profiled user
# and are defined even when no user was profiled.
count = len(nusers)
bcount = len(busers)
scount = len(susers)

print(f"No of Normal Users: {count}, No of Bullying Users: {bcount} , No of suspected Users:{scount}")