# 1.2 GloVe


<a target="_blank" href="https://colab.research.google.com/github/G1-ABID-23-24/offensive-language-detection-2024/blob/main/1.1_GloVe.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Inicialmente se va a realizar un preprocesado de los datos, eliminando las palabras sin significado útil (stopwords), las URL y los signos de puntuación.

In [None]:
#Import libraries and upload the dataframe
import re
import shutil
import string

import keras
import numpy as np
import pandas as pd
import spacy
from keras.callbacks import ReduceLROnPlateau,CSVLogger
from keras.initializers import Constant
from keras.layers import BatchNormalization
# Layer classes used by the model builders further down.
from keras.layers import (
    Bidirectional,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GlobalMaxPool1D,
    GlobalMaxPooling1D,
    LSTM,
)
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
# Order matters: the notebook progress bar is imported last so that it is the
# `tqdm` actually used.
from tqdm import tqdm
from tqdm.notebook import tqdm

# NOTE(review): `plt` (matplotlib.pyplot) and `SpellChecker` (pyspellchecker)
# are referenced below but are not imported anywhere in this file.


# The spaCy model used below has to be downloaded once before first use:
#   !python -m spacy download en_core_web_lg

# spaCy pipeline used for stop-word detection; its default stop-word set is
# also kept separately for plain membership checks.
nlp = spacy.load('en_core_web_lg')
en_stopwords = nlp.Defaults.stop_words

# Training data; later cells read its 'text' and 'label' columns.
df = pd.read_csv('./data/train.csv')

In [None]:
def _default_spell_checker():
    """Return one shared SpellChecker, creating it on first use."""
    checker = getattr(_default_spell_checker, "_instance", None)
    if checker is None:
        # NOTE(review): `SpellChecker` is not imported in the visible file;
        # presumably `from spellchecker import SpellChecker` (pyspellchecker)
        # is required -- confirm.
        checker = SpellChecker()
        _default_spell_checker._instance = checker
    return checker


def correct_spellings(text, spell=None):
    """Return *text* with words the spell checker does not know replaced.

    Args:
        text: Whitespace-separated input string.
        spell: Optional object exposing ``unknown(words)`` and
            ``correction(word)``. When omitted, a shared ``SpellChecker`` is
            used so the checker is not rebuilt for every row.

    Returns:
        The words of *text* joined by single spaces, each unknown word
        replaced by its suggested correction.
    """
    if spell is None:
        spell = _default_spell_checker()

    words = text.split()
    misspelled_words = spell.unknown(words)

    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # When the checker has no suggestion, keep the original word
            # instead of silently dropping it from the sentence.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
def remove_URL(text):
    """Return *text* with every ``http(s)://...`` or ``www....`` link removed.

    A link extends up to the next whitespace character; surrounding
    whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Characters treated as emoji / pictographic symbols. Every entry is a symbol
# block or a single symbol; no range crosses a block that contains letters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00002934-\U00002935"  # curved arrows
    "\U000024C2"             # circled M
    "\U00003030\U0000303D"   # wavy dash, part alternation mark
    "\U00003297\U00003299"   # circled ideographs used as emoji
    "\U0000FE0F"             # emoji variation selector
    "]+"
)


def remove_emoji(text):
    """Return *text* with emoji and pictographic symbols removed.

    The previous pattern contained the single range U+24C2..U+1F251, which
    also covers CJK ideographs, Hangul and many other scripts, so ordinary
    non-Latin text was deleted together with the emoji. The ranges above only
    cover symbol code points inside that span.
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_punct(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def remove_stop_words(text):
    """Return *text* without the tokens spaCy flags as stop words.

    The text is tokenised with the module-level ``nlp`` pipeline. Every kept
    token is prefixed with one space, so a non-empty result starts with a
    space. Punctuation and URL-like tokens are not filtered here.
    """
    kept_tokens = (token.text for token in nlp(text) if not token.is_stop)
    return ''.join(' ' + token_text for token_text in kept_tokens)

# Clean the 'text' column in place: strip URLs, then punctuation, then stop
# words. Two further steps exist but are currently disabled:
#   correct_spellings (would run before URL removal)
#   remove_emoji      (would run between URL and punctuation removal)
df['text'] = (
    df['text']
    .apply(remove_URL)
    .apply(remove_punct)
    .apply(remove_stop_words)
)

Después de esto, hacemos las transformaciones necesarias para usar los vectores de GloVe.

In [None]:
# Features and labels; both pairs of names refer to the same columns.
texts = x = df['text']
target = y = df['label']

# Fit the Keras tokenizer on the cleaned texts.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

# One extra slot on top of the number of words in the tokenizer's index.
vocab_length = 1 + len(word_tokenizer.word_index)


def embed(text_data):
    """Convert an iterable of texts into lists of integer word indices."""
    return word_tokenizer.texts_to_sequences(text_data)


def _nltk_token_count(sentence):
    """Number of tokens NLTK's word_tokenize finds in *sentence*."""
    return len(word_tokenize(sentence))


# Longest text and its length in NLTK tokens.
# NOTE(review): the length is measured with NLTK while the sequences come from
# the Keras tokenizer; the two may count tokens differently.
longest_train = max(texts, key=_nltk_token_count)
length_long_sentence = _nltk_token_count(longest_train)

### Vectorización con GloVe

In [None]:
def create_corpus(df):
    """Build a tokenised, lower-cased corpus from ``df['text']``.

    Args:
        df: DataFrame with a 'text' column of strings.

    Returns:
        A list with one entry per row; each entry is the list of lower-cased,
        purely alphabetic tokens of that row that are not in ``en_stopwords``.
    """
    corpus = []
    for tweet in tqdm(df['text']):
        lowered_tokens = (token.lower() for token in word_tokenize(tweet))
        # Stop words are tested on the lower-cased form so capitalised ones
        # such as "The" are filtered too (assumes en_stopwords holds
        # lower-case entries, as spaCy's English list does -- confirm).
        corpus.append([
            token for token in lowered_tokens
            if token.isalpha() and token not in en_stopwords
        ])
    return corpus


corpus = create_corpus(df)

A continuación, descargamos el modelo más básico de GloVe disponible, ya que no hará falta uno más complejo. Después, construimos la matriz de embeddings y preparamos los datos para nuestros modelos.

In [None]:
# The GloVe vectors must be downloaded and unpacked once:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

# Map every word of the GloVe file to its vector. Each line holds the word
# followed by its whitespace-separated coefficients.
embedding_dict = {}
with open('glove.6B/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        word, *coefficients = line.split()
        embedding_dict[word] = np.asarray(coefficients, 'float32')
# The `with` block closes the file, so no explicit close() is needed.

In [None]:
# Build the embedding matrix: row `index` holds the GloVe vector of the word
# the tokenizer maps to `index`; words missing from GloVe keep an all-zero row.

# Vector size is taken from the loaded embeddings rather than hard-coded.
embedding_dim = len(next(iter(embedding_dict.values())))
embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    # The dictionary loaded from the GloVe file is named `embedding_dict`.
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
# Turn every text into an integer sequence and pad it at the end up to the
# longest sentence length, so all model inputs share one shape.
train_padded_sentences = pad_sequences(
    embed(texts),
    maxlen=length_long_sentence,
    padding='post'
)

# Hold out 25% of the samples as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences,
    target,
    test_size=0.25
)

# ... and 10% of the remaining samples as the validation set.
X_train, x_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1
)


Comenzamos probando nuestros datos con el algoritmo BiLSTM.

In [None]:
def bilstm():
    """Build and compile the bidirectional LSTM classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and expects inputs of length
    ``length_long_sentence``. The network ends in a 3-way softmax and is
    compiled with sparse categorical cross-entropy, the 'adam' optimizer and
    accuracy as metric.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    vocabulary_size, vector_size = embedding_matrix.shape
    layers = [
        # embedding layer
        Embedding(
            input_dim=vocabulary_size,
            output_dim=vector_size,
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        # bidirectional LSTM returning the full sequence
        Bidirectional(LSTM(
            length_long_sentence,
            return_sequences=True,
            recurrent_dropout=0.2,
        )),
        GlobalMaxPool1D(),       # global max pooling
        BatchNormalization(),    # batch normalisation
        Dropout(0.5),            # dropout 1
        Dense(length_long_sentence, activation="relu"),  # dense 1
        Dropout(0.5),            # dropout 2
        Dense(length_long_sentence, activation="relu"),  # dense 2
        Dropout(0.5),            # dropout 3
        Dense(3, activation='softmax'),  # classification layer
    ]
    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

bilstm_model = bilstm()

# Number of samples per class, in label order 0, 1, 2 (the same order the
# class_weight dictionary below uses).
# Assumes `target` holds non-negative integer labels 0..2 -- TODO confirm.
hate, ofensive, neither = np.bincount(target, minlength=3)
total = hate + ofensive + neither

# Weight each class inversely to its frequency, scaled so that a perfectly
# balanced 3-class dataset would get a weight of 1.0 for every class.
weight_class1 = (1 / hate)*(total)/3.0
weight_class2 = (1 / ofensive)*(total)/3.0
weight_class3 = (1 / neither)*(total)/3.0
class_weight = {0: weight_class1, 1: weight_class2, 2: weight_class3}


# Multiply the learning rate by 0.2 after 5 epochs without val_loss
# improvement. min_lr must stay below the optimizer's starting rate (Adam's
# default is 0.001), otherwise the callback can never lower it.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    verbose = 1,
    patience = 5,
    min_lr = 0.00001
)

epoch_count=20
batch_size= 128


In [None]:
# Train the BiLSTM with class weighting and learning-rate reduction,
# validating on the held-out validation split after every epoch.
history = bilstm_model.fit(
    X_train,
    y_train,
    validation_data=(x_val, y_val),
    class_weight=class_weight,
    callbacks=[reduce_lr],
    batch_size=batch_size,
    epochs=epoch_count,
    verbose=1,
)

def plot_learning_curves(history, arr):
    """Plot two pairs of training curves side by side.

    Args:
        history: Object whose ``history`` attribute maps a metric name to its
            per-epoch values (as returned by ``model.fit``).
        arr: Sequence of metric-name pairs, one pair per subplot, e.g.
            ``[['loss', 'val_loss'], ['accuracy', 'val_accuracy']]``.

    NOTE(review): relies on ``plt`` (matplotlib.pyplot) being in scope; it is
    not imported in the visible file.
    """
    fig, axes = plt.subplots(1, 2, figsize=(20, 5))
    for axis, pair in zip(axes, arr):
        first, second = pair[0], pair[1]
        axis.plot(history.history[first])
        axis.plot(history.history[second])
        axis.legend([first, second], fontsize=17)
        # The x axis is the epoch index; the y axis is the plotted metric.
        axis.set_xlabel('Epoch', fontsize=14)
        axis.set_ylabel(first.capitalize(), fontsize=14)
        axis.set_title(first + ' X ' + second, fontsize=16)

plot_learning_curves(history, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])

# Predict the most probable class for every test sample. The trained network
# is bound to `bilstm_model`; no variable named `model` exists at this level.
preds = np.argmax(bilstm_model.predict(X_test), axis=-1)
# Per-class precision/recall/F1 and the confusion matrix on the test split.
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

Ahora, con redes convolucionales.

In [None]:

filters = 32
kernel_size = 2
# NOTE(review): hidden_dims is not used by CNN(); the dense layer there has
# always had 256 units. Confirm which value is intended.
hidden_dims = 128


def CNN(filters=filters, kernel_size=kernel_size, dense_units=256):
    """Build and compile the 1-D convolutional classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and expects inputs of length
    ``length_long_sentence``.

    Args:
        filters: Filters of the first convolution; the second one uses twice
            as many. Defaults to the module-level ``filters`` (32).
        kernel_size: Window size of both convolutions. Defaults to the
            module-level ``kernel_size`` (2).
        dense_units: Units of the hidden dense layer (256 by default).

    Returns:
        The compiled Keras ``Sequential`` model with a 3-way softmax output.
    """
    model = Sequential()
    # embedding layer
    model.add(Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=length_long_sentence))
    # two stacked convolutions
    model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu'))
    model.add(Conv1D(2 * filters, kernel_size, padding='valid', activation='relu'))
    model.add(GlobalMaxPooling1D())  # global max pooling
    model.add(Dense(dense_units, activation='relu'))  # hidden dense layer
    model.add(Dropout(0.1))  # dropout
    model.add(Dense(3, activation='softmax'))  # classification layer
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the CNN model.
model2 = CNN()

# Train it with the same settings as the BiLSTM. Only `reduce_lr` is passed
# as callback: no `checkpoint` callback is defined anywhere in this file.
history2 = model2.fit(
    X_train,
    y_train,
    epochs = epoch_count,
    batch_size = batch_size,
    validation_data = (x_val, y_val),
    verbose = 1,
    callbacks = [reduce_lr],
    class_weight=class_weight
)

# Learning curves.
plot_learning_curves(history2, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])
# Most probable class for every test sample.
pred2 = np.argmax(model2.predict(X_test), axis=-1)
# Per-class metrics and confusion matrix on the test split.
print(classification_report(y_test, pred2))
print(confusion_matrix(y_test, pred2))


¡Terminamos con MLP!

In [None]:
def MLP():
    """Build and compile the multilayer-perceptron classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and expects inputs of length
    ``length_long_sentence``; its output is flattened and fed to one hidden
    dense layer followed by a 3-way softmax.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    vocabulary_size, vector_size = embedding_matrix.shape
    layers = [
        # embedding layer
        Embedding(
            input_dim=vocabulary_size,
            output_dim=vector_size,
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        Flatten(),                       # flatten layer
        Dense(512, activation='relu'),   # dense layer
        Dropout(0.2),                    # dropout layer
        Dense(3, activation='softmax'),  # classification layer
    ]
    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Build the MLP model.
model3 = MLP()

# Train it with the same settings as the other models. Only `reduce_lr` is
# passed as callback: no `checkpoint` callback is defined anywhere in this
# file.
history3 = model3.fit(
    X_train,
    y_train,
    epochs = epoch_count,
    batch_size = batch_size,
    validation_data = (x_val, y_val),
    verbose = 1,
    callbacks = [reduce_lr],
    class_weight=class_weight
)

# Learning curves.
plot_learning_curves(history3, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])
# Most probable class for every test sample.
pred3 = np.argmax(model3.predict(X_test), axis=-1)

# Per-class metrics and confusion matrix on the test split.
print(classification_report(y_test, pred3))
print(confusion_matrix(y_test, pred3))