<a href="https://colab.research.google.com/github/07423314796/NLPCW2/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#importing libraries
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, GRU, Bidirectional, Attention, Input, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

# Load the tab-separated OLID training file
Data = pd.read_csv(
    "/content/olid-training-v1.0.tsv",
    sep="\t",
    index_col=False,
)
New_data = pd.DataFrame(Data)

# Preprocess data
def preprocess_text(text):
    """Return ``text`` reduced to lowercase ASCII letters and whitespace.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, symbols, ...) is deleted; whitespace is left untouched, so
    runs of spaces are not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower()

# Clean every tweet and store the result in a new column
New_data['clean_text'] = New_data['tweet'].map(preprocess_text)

# Encode labels: one encoder per subtask. Missing labels are replaced by a
# placeholder class first, then each string column is overwritten in place
# with its integer class ids.
label_encoder_a, label_encoder_b, label_encoder_c = LabelEncoder(), LabelEncoder(), LabelEncoder()

New_data['subtask_a'] = label_encoder_a.fit_transform(
    New_data['subtask_a'].fillna('NOT')
)
New_data['subtask_b'] = label_encoder_b.fit_transform(
    New_data['subtask_b'].fillna('NONE')
)
New_data['subtask_c'] = label_encoder_c.fit_transform(
    New_data['subtask_c'].fillna('NONE')
)

# Features (cleaned text) and one label vector per subtask
X = New_data['clean_text']
y_a, y_b, y_c = (New_data[column] for column in ('subtask_a', 'subtask_b', 'subtask_c'))

# Tokenize (vocabulary capped at 5000 words) and pad/truncate to 100 tokens
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(X)
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=100)

# 80/20 train/test split per subtask, each using the same seed
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X, y_a, test_size=0.2, random_state=42
)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X, y_b, test_size=0.2, random_state=42
)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_c, test_size=0.2, random_state=42
)

# Define model architecture: bidirectional LSTM with an attention layer
def build_model(output_units, output_activation, vocab_size=5000, sequence_length=100,
                embedding_dim=128, lstm_units=64):
    """Build and compile a bidirectional-LSTM classifier with an attention layer.

    Pipeline: token ids -> embedding -> bidirectional LSTM (full sequence) ->
    Keras ``Attention`` with the LSTM output as both query and value -> sum
    over the time axis -> dense output layer.

    Args:
        output_units: Width of the final dense layer (callers pass the number
            of classes).
        output_activation: Activation of the final dense layer, e.g. 'softmax'.
        vocab_size: Embedding ``input_dim``. Defaults to 5000, the value that
            was previously hard-coded.
        sequence_length: Length of each padded input sequence. Defaults to 100.
        embedding_dim: Size of each embedding vector. Defaults to 128.
        lstm_units: Units per LSTM direction. Defaults to 64.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 1e-3), sparse
        categorical cross-entropy loss and an accuracy metric.
    """
    input_layer = Input(shape=(sequence_length,))
    # `input_length` is omitted: the Input layer already fixes the sequence
    # length, and the argument is deprecated in Keras 3.
    # NOTE(review): mask_zero is not set, so padded positions presumably take
    # part in the attention and the sum below -- confirm this is intended.
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    lstm_layer = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedding_layer)
    attention_layer = Attention()([lstm_layer, lstm_layer])
    # Wrap the reduction in a Lambda layer: Keras 3 rejects raw tf.* ops on
    # symbolic tensors, while a Lambda layer works on both Keras 2 and Keras 3.
    context_vector = tf.keras.layers.Lambda(
        lambda sequence: tf.reduce_sum(sequence, axis=1)
    )(attention_layer)
    output_layer = Dense(output_units, activation=output_activation)(context_vector)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# One classifier per subtask; the output width is the number of encoded classes
model_a = build_model(len(label_encoder_a.classes_), 'softmax')
model_b = build_model(len(label_encoder_b.classes_), 'softmax')
model_c = build_model(len(label_encoder_c.classes_), 'softmax')

# Train and evaluate models
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` and score it on the held-out split.

    Trains for 5 epochs (batch size 32) with 10% of the training data held
    out for validation, predicts the test set, and prints accuracy and
    weighted F1.

    Args:
        model: Compiled model exposing ``fit`` and ``predict``.
        X_train: Training inputs.
        X_test: Test inputs.
        y_train: Training labels (integer class ids).
        y_test: Test labels (integer class ids).

    Returns:
        Tuple ``(accuracy, f1)``.
    """
    model.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        verbose=1,
    )
    # The highest-scoring output unit is taken as the predicted class id
    class_scores = model.predict(X_test)
    y_pred = np.argmax(class_scores, axis=-1)

    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f}")
    return accuracy, f1

print("Subtask A Results:")
results_a = train_and_evaluate_model(
    model=model_a, X_train=X_train_a, X_test=X_test_a, y_train=y_train_a, y_test=y_test_a
)
print("\nSubtask B Results:")
results_b = train_and_evaluate_model(
    model=model_b, X_train=X_train_b, X_test=X_test_b, y_train=y_train_b, y_test=y_test_b
)
print("\nSubtask C Results:")
results_c = train_and_evaluate_model(
    model=model_c, X_train=X_train_c, X_test=X_test_c, y_train=y_train_c, y_test=y_test_c
)

# Function to detect offensive language
def detect_offensive_language(text):
    """Classify ``text`` with the three subtask models.

    The text is cleaned with ``preprocess_text``, tokenized with the fitted
    module-level ``tokenizer`` and padded to 100 tokens before being passed to
    ``model_a``; ``model_b`` and ``model_c`` are only consulted when model A
    does not predict 'NOT'.

    Args:
        text: Raw input string.

    Returns:
        ``None`` when subtask A predicts 'NOT'. Otherwise a dict with keys
        'offensive' (always ``True``), 'insult_type' (decoded subtask B label)
        and 'target' (decoded subtask C label). The B and C labels can be the
        'NONE' placeholder, since that class is part of what their encoders
        were fitted on.
    """
    clean_text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([clean_text])
    # maxlen must match the sequence length the models were built with
    padded_sequence = pad_sequences(sequence, maxlen=100)

    subtask_a_prediction = np.argmax(model_a.predict(padded_sequence), axis=-1)[0]
    # Decode the class id instead of comparing against a hard-coded index, so
    # the check does not depend on the encoder's class ordering.
    subtask_a_label = label_encoder_a.inverse_transform([subtask_a_prediction])[0]
    if subtask_a_label == 'NOT':
        return None

    subtask_b_prediction = np.argmax(model_b.predict(padded_sequence), axis=-1)[0]
    subtask_c_prediction = np.argmax(model_c.predict(padded_sequence), axis=-1)[0]

    return {
        'offensive': True,
        'insult_type': label_encoder_b.inverse_transform([subtask_b_prediction])[0],
        'target': label_encoder_c.inverse_transform([subtask_c_prediction])[0]
    }

# Example usage
text = input("Enter a text: ")
result = detect_offensive_language(text)

# detect_offensive_language returns None for non-offensive text and a dict
# otherwise, so both "not offensive" cases share a single branch.
if result is not None and result['offensive']:
    print("The text is offensive.")
    print(f"Insult type: {result['insult_type']}")
    print(f"Target: {result['target']}")
else:
    print("The text is not offensive.")


Subtask A Results:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.72, F1 Score: 0.72

Subtask B Results:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.72, F1 Score: 0.70

Subtask C Results:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.71, F1 Score: 0.69
Enter a text: I'm waiting for gun control advocates to add this to their list of school shootings.
The text is not offensive.


In [13]:
# CNN Model
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load the tab-separated OLID training file
Data = pd.read_csv(
    "/content/olid-training-v1.0.tsv",
    sep="\t",
    index_col=False,
)
New_data = pd.DataFrame(Data)

# Preprocess data
def preprocess_text(text):
    """Delete everything except ASCII letters and whitespace, then lowercase.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; whitespace is kept exactly as it was.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text).lower()

New_data['clean_text'] = New_data['tweet'].apply(preprocess_text)

X = New_data['clean_text']
# Fill missing labels with the same placeholders the LSTM cell uses before
# encoding. Without this, NaN labels either become a class of their own or
# make LabelEncoder raise on mixed str/float input, depending on the
# scikit-learn version.
y_a = New_data['subtask_a'].fillna('NOT')
y_b = New_data['subtask_b'].fillna('NONE')
y_c = New_data['subtask_c'].fillna('NONE')


# Tokenize and pad sequences
max_vocab_size = 5000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size)
# Fit the tokenizer on the text column (X) only
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)


# Encode the labels (one encoder per subtask)
label_encoder_a = LabelEncoder()
label_encoder_b = LabelEncoder()
label_encoder_c = LabelEncoder()

y_a_encoded = label_encoder_a.fit_transform(y_a)
y_b_encoded = label_encoder_b.fit_transform(y_b)
y_c_encoded = label_encoder_c.fit_transform(y_c)

# Split the data: 80/20 per subtask, each using the same seed
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_padded, y_a_encoded, test_size=0.2, random_state=42)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_padded, y_b_encoded, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_padded, y_c_encoded, test_size=0.2, random_state=42)

# Define model architecture for CNN
def build_cnn_model(output_units, output_activation):
    """Build and compile a 1-D convolutional text classifier.

    Layers: embedding (128-d) -> Conv1D (128 filters, kernel size 5, ReLU) ->
    MaxPooling1D (pool size 2) -> Flatten -> Dense output layer. The input
    length and vocabulary size are read from the module-level
    ``max_sequence_length`` and ``max_vocab_size``.

    Args:
        output_units: Width of the final dense layer (callers pass the number
            of classes).
        output_activation: Activation of the final dense layer, e.g. 'softmax'.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 1e-3), sparse
        categorical cross-entropy loss and an accuracy metric.
    """
    input_layer = Input(shape=(max_sequence_length,))
    # `input_length` is omitted: the Input layer already fixes the sequence
    # length, and the argument is deprecated in Keras 3.
    embedding_layer = Embedding(input_dim=max_vocab_size, output_dim=128)(input_layer)
    conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
    pooling_layer = MaxPooling1D(pool_size=2)(conv_layer)
    flatten_layer = Flatten()(pooling_layer)
    output_layer = Dense(output_units, activation=output_activation)(flatten_layer)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define a function to train and evaluate the model
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` with early stopping, then evaluate it on the test split.

    Training runs for at most 10 epochs (batch size 32) with 20% of the
    training data held out for validation. It stops once ``val_loss`` has not
    improved for 3 epochs, and the best weights are restored.

    Args:
        model: Compiled Keras model.
        X_train: Training inputs.
        X_test: Test inputs.
        y_train: Training labels (integer class ids).
        y_test: Test labels (integer class ids).

    Returns:
        Tuple ``(history, loss, accuracy)``: the object returned by
        ``model.fit`` plus the two values returned by ``model.evaluate``.
    """
    stopper = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=10,
        batch_size=32,
        callbacks=[stopper],
    )
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Accuracy: {accuracy:.2f}')
    return history, loss, accuracy

# One CNN per subtask; the output width is the number of encoded classes
cnn_model_a = build_cnn_model(len(label_encoder_a.classes_), 'softmax')
cnn_model_b = build_cnn_model(len(label_encoder_b.classes_), 'softmax')
cnn_model_c = build_cnn_model(len(label_encoder_c.classes_), 'softmax')

# Fit each model on its own split and report test accuracy
print("CNN Subtask A Results:")
cnn_results_a = train_and_evaluate_model(
    model=cnn_model_a, X_train=X_train_a, X_test=X_test_a, y_train=y_train_a, y_test=y_test_a
)

print("\nCNN Subtask B Results:")
cnn_results_b = train_and_evaluate_model(
    model=cnn_model_b, X_train=X_train_b, X_test=X_test_b, y_train=y_train_b, y_test=y_test_b
)

print("\nCNN Subtask C Results:")
cnn_results_c = train_and_evaluate_model(
    model=cnn_model_c, X_train=X_train_c, X_test=X_test_c, y_train=y_train_c, y_test=y_test_c
)


CNN Subtask A Results:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Accuracy: 0.73

CNN Subtask B Results:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Accuracy: 0.72

CNN Subtask C Results:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Accuracy: 0.70


In [12]:
# RNN (LSTM) model
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load the tab-separated OLID training file
Data = pd.read_csv(
    "/content/olid-training-v1.0.tsv",
    sep="\t",
    index_col=False,
)
New_data = pd.DataFrame(Data)

def preprocess_text(text):
    """Keep only ASCII letters and whitespace from ``text`` and lowercase it.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with all other characters removed.
    """
    stripped = re.sub(r'[^a-zA-Z\s]', '', text)
    lowered = stripped.lower()
    return lowered

New_data['clean_text'] = New_data['tweet'].apply(preprocess_text)

X = New_data['clean_text']
# Fill missing labels with the same placeholders the LSTM cell uses before
# encoding. Without this, NaN labels either become a class of their own or
# make LabelEncoder raise on mixed str/float input, depending on the
# scikit-learn version.
y_a = New_data['subtask_a'].fillna('NOT')
y_b = New_data['subtask_b'].fillna('NONE')
y_c = New_data['subtask_c'].fillna('NONE')

# Tokenize and pad sequences
max_vocab_size = 5000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

# Encode the labels (one encoder per subtask)
label_encoder_a = LabelEncoder()
label_encoder_b = LabelEncoder()
label_encoder_c = LabelEncoder()

y_a_encoded = label_encoder_a.fit_transform(y_a)
y_b_encoded = label_encoder_b.fit_transform(y_b)
y_c_encoded = label_encoder_c.fit_transform(y_c)

# Split the data: 80/20 per subtask, each using the same seed
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_padded, y_a_encoded, test_size=0.2, random_state=42)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_padded, y_b_encoded, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_padded, y_c_encoded, test_size=0.2, random_state=42)

# Define model architecture for RNN
def build_rnn_model(output_units, output_activation):
    """Build and compile a single-layer LSTM text classifier.

    Layers: embedding (128-d) -> LSTM (128 units, final state only) -> Dense
    output layer. The input length and vocabulary size are read from the
    module-level ``max_sequence_length`` and ``max_vocab_size``.

    Args:
        output_units: Width of the final dense layer (callers pass the number
            of classes).
        output_activation: Activation of the final dense layer, e.g. 'softmax'.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 1e-3), sparse
        categorical cross-entropy loss and an accuracy metric.
    """
    input_layer = Input(shape=(max_sequence_length,))
    # `input_length` is omitted: the Input layer already fixes the sequence
    # length, and the argument is deprecated in Keras 3.
    embedding_layer = Embedding(input_dim=max_vocab_size, output_dim=128)(input_layer)
    lstm_layer = LSTM(128, return_sequences=False)(embedding_layer)
    output_layer = Dense(output_units, activation=output_activation)(lstm_layer)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define a function to train and evaluate the model
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Train ``model`` with early stopping and report its test accuracy.

    Up to 10 epochs (batch size 32) are run with a 20% validation split.
    Training halts after 3 epochs without ``val_loss`` improvement, and the
    best weights are restored before evaluation.

    Args:
        model: Compiled Keras model.
        X_train: Training inputs.
        X_test: Test inputs.
        y_train: Training labels (integer class ids).
        y_test: Test labels (integer class ids).

    Returns:
        Tuple ``(history, loss, accuracy)``.
    """
    fit_options = dict(
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    )
    history = model.fit(X_train, y_train, **fit_options)
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Accuracy: {accuracy:.2f}')
    return history, loss, accuracy

# One LSTM model per subtask; the output width is the number of encoded classes
rnn_model_a = build_rnn_model(len(label_encoder_a.classes_), 'softmax')
rnn_model_b = build_rnn_model(len(label_encoder_b.classes_), 'softmax')
rnn_model_c = build_rnn_model(len(label_encoder_c.classes_), 'softmax')

# Fit each model on its own split and report test accuracy
print("RNN Subtask A Results:")
rnn_results_a = train_and_evaluate_model(
    model=rnn_model_a, X_train=X_train_a, X_test=X_test_a, y_train=y_train_a, y_test=y_test_a
)

print("\nRNN Subtask B Results:")
rnn_results_b = train_and_evaluate_model(
    model=rnn_model_b, X_train=X_train_b, X_test=X_test_b, y_train=y_train_b, y_test=y_test_b
)

print("\nRNN Subtask C Results:")
rnn_results_c = train_and_evaluate_model(
    model=rnn_model_c, X_train=X_train_c, X_test=X_test_c, y_train=y_train_c, y_test=y_test_c
)


RNN Subtask A Results:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Accuracy: 0.73

RNN Subtask B Results:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Accuracy: 0.71

RNN Subtask C Results:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Accuracy: 0.72
