In [23]:
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Load the three source corpora and normalise each one to `text` / `label` columns.

# Dataset1: hate speech (0: non-hate, 1: hate), read from a multi-language file.
# Only rows with language code 2 are kept (presumably English -- confirm against
# the dataset's documentation); the index and language columns are then dropped.
dataset1 = (
    pd.read_csv('MultiLanguageTrainDataset.csv')
    .loc[lambda frame: frame['language'] == 2]
    .drop(columns=['Unnamed: 0', 'language'])
)
print(dataset1)

# Dataset2: hate speech (0: non-hate, 1: hate); only the column names need aligning.
dataset2 = pd.read_csv('HateSpeechDatasetBalanced.csv').rename(
    columns={'Content': 'text', 'Label': 'label'}
)
print(dataset2)

# Dataset3: sarcasm (0: non-sarcasm, 1: sarcasm), stored as JSON lines.
# The article link is discarded and the remaining columns are renamed and ordered.
dataset3 = (
    pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
    .drop(columns=['article_link'])
    .rename(columns={'headline': 'text', 'is_sarcastic': 'label'})
)[['text', 'label']]
print(dataset3)

                                                    text  label
4161   South Africa Charges 4 Suspected of Plotting t...    0.0
4162                                   terrorist leftist    1.0
4163   RT HanzalaOfficial: India your link with ISIS ...    0.0
4164   Seize Pope & Rothschild who owns Fed Reserve  ...    0.0
4165   Saudi-led bombing in Yemen; ISIS abuses; Turke...    0.0
...                                                  ...    ...
57302  Russian Forces Continue March into Syrian Isla...    0.0
57303  EL #Califato es la reaparici  n de una ideolog...    0.0
57304  @SilentSecretMan reporte toi    la s  rie "The...    1.0
57305  @Dabiq_Warrior @Totenleserin Countryside will ...    1.0
57306  @A_Moon_Banana @Mai_svg see I could say the sa...    0.0

[53146 rows x 2 columns]
                                                     text  label
0       denial of normal the con be asked to comment o...      1
1       just by being able to tweet this insufferable ...      1
2       tha

In [24]:


# Combine datasets into a single 3-class problem.
# Class ids: 0 (Normal Speech), 1 (Hate Speech), 2 (Sarcasm).
class_names = ['Normal Speech', 'Hate Speech', 'Sarcasm']
num_classes = len(class_names)  # single source of truth for the output width

# Per-corpus mapping from the original binary label to the shared class id.
hate_to_class = {0: 0, 1: 1}      # Non-hate -> Normal, Hate -> Hate
sarcasm_to_class = {0: 0, 1: 2}   # Non-sarcasm -> Normal, Sarcasm -> Sarcasm

# `assign` returns new frames, so dataset1/2/3 are left untouched and this cell
# can be re-run safely. Remapping in place would turn dataset3's already-remapped
# 2s into NaN on a second run.
data = pd.concat(
    [
        dataset1.assign(label=dataset1['label'].map(hate_to_class)),
        dataset2.assign(label=dataset2['label'].map(hate_to_class)),
        dataset3.assign(label=dataset3['label'].map(sarcasm_to_class)),
    ],
    ignore_index=True,
)

# Rows without text, or whose label was not covered by the mappings above
# (`map` yields NaN for those), cannot be tokenized or encoded.
unusable = data['text'].isna() | data['label'].isna()
if unusable.any():
    print(f"Dropping {int(unusable.sum())} rows with missing text or unmapped label")
    data = data.loc[~unusable]
data = data.assign(text=data['text'].astype(str), label=data['label'].astype(int))

# Shuffle the data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Parameters
max_words = 10000  # Maximum number of words to consider
max_sequence_length = 100  # Maximum length of sequences
embedding_dim = 100  # Dimension of word embeddings

# Encode labels
label_encoder = LabelEncoder()
label_ids = label_encoder.fit_transform(data['label'])
# LabelEncoder renumbers whatever values it sees; if a class were missing the ids
# would silently stop lining up with `class_names`.
assert list(label_encoder.classes_) == list(range(num_classes)), (
    f"expected label ids {list(range(num_classes))}, got {list(label_encoder.classes_)}"
)
labels = to_categorical(label_ids, num_classes=num_classes)  # One-hot encode

# Split row positions first so the tokenizer vocabulary can be learned from the
# training rows only; stratify keeps the class mix equal in both splits.
train_idx, test_idx = train_test_split(
    np.arange(len(data)),
    test_size=0.2,
    random_state=42,
    stratify=label_ids,
)

# Tokenization (vocabulary fit on the training rows only, to avoid test leakage)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['text'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(data['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]
assert len(X_train) + len(X_test) == len(data)

# Build model
# `input_length` is not passed to Embedding: it is deprecated in Keras 3 and the
# layer accepts the padded batches without it.
model = Sequential([
    Embedding(max_words, embedding_dim),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')  # one unit per entry in class_names
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate model
test_predictions = model.predict(X_test)
predictions_classes = np.argmax(test_predictions, axis=1)
test_labels = np.argmax(y_test, axis=1)

# Classification report
# Explicit `labels` keeps the rows aligned with `class_names` even if a class is
# never predicted.
print("\nClassification Report:")
print(classification_report(
    test_labels,
    predictions_classes,
    labels=list(range(num_classes)),
    target_names=class_names,
))

# Modified prediction function to show all class probabilities
def predict_input_with_percentage(text, tokenizer, model, max_sequence_length=100):
    """Classify one text and report every class probability as a percentage.

    Args:
        text: The raw string to classify.
        tokenizer: Object whose ``texts_to_sequences`` turns a list of strings
            into integer sequences.
        model: Object whose ``predict`` returns one row of class scores per
            input row.
        max_sequence_length: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        A ``(predicted_class, probabilities)`` tuple: the index of the highest
        score, and the model's scores for the text multiplied by 100.
    """
    # Wrap the text in a list: the tokenizer and the model both work on batches.
    encoded_batch = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_sequence_length,
    )

    # The batch holds a single row, so keep only the first row of scores.
    class_scores = model.predict(encoded_batch)[0]

    return np.argmax(class_scores), class_scores * 100

# Hand-written sample sentences used as a quick sanity check of the trained model
input_texts = [
    "I am so happy with the results!",
    "I hate you!",
    "I hate Chinese people!",
    "That movie was just amazing, I totally get the sarcasm!"
]

# For every sample, show the winning class followed by the full probability breakdown
print("\nPredictions for Input Texts:")
for sample in input_texts:
    top_class, class_percentages = predict_input_with_percentage(
        sample, tokenizer, model, max_sequence_length
    )
    print(f"\nText: '{sample}'")
    print(f"Predicted: {class_names[top_class]} (Class {top_class})")
    for label_name, percentage in zip(class_names, class_percentages):
        print(f"{label_name}: {percentage:.2f}%")



Epoch 1/5
[1m16158/16158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 9ms/step - accuracy: 0.7953 - loss: 0.4482 - val_accuracy: 0.8435 - val_loss: 0.3484
Epoch 2/5
[1m16158/16158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 8ms/step - accuracy: 0.8614 - loss: 0.3166 - val_accuracy: 0.8577 - val_loss: 0.3255
Epoch 3/5
[1m16158/16158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 8ms/step - accuracy: 0.8851 - loss: 0.2690 - val_accuracy: 0.8635 - val_loss: 0.3176
Epoch 4/5
[1m16158/16158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 9ms/step - accuracy: 0.9032 - loss: 0.2304 - val_accuracy: 0.8633 - val_loss: 0.3290
Epoch 5/5
[1m16158/16158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 8ms/step - accuracy: 0.9207 - loss: 0.1939 - val_accuracy: 0.8632 - val_loss: 0.3412
[1m5050/5050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step

Classification Report:
               precision    recall  f1-score   support

Normal 

In [25]:
# Persist the trained model to 'english_model_collab.pkl' using joblib.
# NOTE(review): `joblib` has to be imported before this cell runs on a fresh kernel.
# NOTE(review): joblib files are pickle-based -- only load them from trusted sources.
# Keras also offers a native `model.save(...)` format; consider whether consumers
# of this file could switch to it.
joblib.dump(model, 'english_model_collab.pkl')

['english_model_collab.pkl']

In [26]:
# Save the fitted tokenizer next to the model so inference code can rebuild the
# exact same word -> index mapping that was used for training.
# The path is held in one variable so the confirmation message always names the
# file that was actually written.
# NOTE: pickle files must only be loaded from trusted sources.
tokenizer_path = 'tokenizer_english_collab.pkl'

with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)

print(f"Tokenizer saved to {tokenizer_path}")

Tokenizer saved to tokenizer.pkl
