# 1.2 Glove


<a target="_blank" href="https://colab.research.google.com/github/G1-ABID-23-24/offensive-language-detection-2024/blob/main/1.1_GloVe.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [15]:
#Import libraries and upload the dataframe
import numpy as np
import pandas as pd
import spacy
import re
import string
from tqdm import tqdm
from nltk.tokenize import word_tokenize

#If you don't have en_core_web_lg downloaded (stopword list)
#!python -m spacy download en_core_web_lg

df = pd.read_csv('./data/train.csv')
nlp = spacy.load('en_core_web_lg')
en_stopwords = nlp.Defaults.stop_words

In [16]:
#Function to correct spelling errors
def correct_spellings(text):
    spell = SpellChecker()
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(filter(None, corrected_text))
        
#Function to remove URLs
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

#Function to remove emojis from the text
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

#Function to remove punctuation
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

#Function to remove stopwords from the text
def remove_stop_words(text):
    cleanText = ''
    phrase = nlp(text)
    for token in phrase:
        if not token.is_stop:
        #and not token.is_punct and not token.like_url:
            cleanText += ' ' + token.text
    return cleanText

#df['text']=df['text'].apply(lambda x : correct_spellings(x))
df['text']=df['text'].apply(lambda x : remove_URL(x))
#df['text']=df['text'].apply(lambda x : remove_emoji(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))
df['text']=df['text'].apply(remove_stop_words)

### Vectorización con GloVe

In [18]:
#Function to create a corpus for GloVe embedding
def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in en_stopwords))]
        corpus.append(words)
    return corpus

corpus=create_corpus(df)

100%|██████████| 8148/8148 [00:00<00:00, 13534.79it/s]


In [22]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

embedding_dict={}
with open('glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

"wget" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
"unzip" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.


FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.100d.txt'

In [None]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,50))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue

    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

In [None]:
X_train,X_val, y_train, y_val = train_test_split(tweet_pad,df.v1, test_size=.2, random_state=2)

print('Shape of train sequences: ',X_train.shape)
print('Shape of train labels: ',y_train.shape)
print("Shape of Validation sequences: ",X_val.shape)
print("Shape of Validation  labels: ",y_val.shape)

In [None]:
model=Sequential()

embedding_layer=Embedding(num_words,50,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding_layer)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2,return_sequences=True))
model.add(tf.keras.layers.LSTM(32,return_sequences=True))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(16, activation='relu'))

model.add(Dense(1, activation='sigmoid'))

In [None]:
optimzer=Adam(learning_rate=1e-4)
model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['acc'])
model.summary()

In [None]:
history=model.fit(X_train,y_train,batch_size=32,epochs=10,validation_data=(X_val,y_val),verbose=1)

In [None]:
model_loss = pd.DataFrame(model.history.history)
model_loss.head()

In [None]:
tsne = TSNE(n_components=2, random_state=0)
words =  list(embeddings_dict.keys())
vectors = [embeddings_dict[word] for word in words]
Y = tsne.fit_transform(vectors[:1000])
plt.scatter(Y[:, 0], Y[:, 1])

for label, x, y in zip(words, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()

In [None]:
# Función para convertir palabras en vectores
def word_to_vec(word, model):
    try:
        return model[word]
    except KeyError:
        # Si la palabra no está en el vocabulario, retorna un vector de ceros
        return [0] * model.vector_size

# Aplica la función a cada palabra en la columna 'text'
df['vector'] = df['text'].apply(lambda x: word_to_vec(x, glove_model))