# Static Word Embeddings for Text Representation  


This notebook explores static word embeddings for text representation in English and Spanish. It preprocesses tweet datasets by removing URLs, user mentions, and hashtags and lowercasing the text; the text is then tokenized, and stopwords and non-alphanumeric tokens are dropped.

Pre-trained embeddings (Word2Vec, FastText, GloVe) are used to compute sentence representations by averaging the vectors of the in-vocabulary tokens of each tweet.

The cosine similarity between tweet embeddings is calculated to identify, separately for sexist and non-sexist tweets, the pair of tweets that are most semantically similar.

In [1]:
from nltk.corpus import stopwords
!pip install -U gensim
!pip install -U nltk
!pip install -U fasttext



In [2]:
import numpy as np
import fasttext.util
import gensim
import gensim.downloader as api
from gensim.models.keyedvectors import KeyedVectors
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer data, presumably what word_tokenize needs at call time.
nltk.download("punkt_tab")
# Alternative resource name left disabled; NOTE(review): confirm whether any
# targeted NLTK version still requires it.
# nltk.download("punkt")
# Stopword lists read later through nltk.corpus.stopwords.words(...).
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load both corpora

In [3]:
# Colab-only: mount Google Drive at /content/drive so the corpus CSVs can be read.
# NOTE(review): the CSV paths used below are relative ("drive/MyDrive/..."), so
# they presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

path_english = "drive/MyDrive/EXIST2024_EN_examples.csv"
path_spanish = "drive/MyDrive/EXIST2024_ES_examples.csv"

# Both corpora are tab-separated files; `df` maps a language name to its DataFrame.
df = {
    language: pd.read_csv(csv_path, sep="\t")
    for language, csv_path in (("english", path_english), ("spanish", path_spanish))
}

# Convenience aliases used by the result cells further down.
english_data, spanish_data = df["english"], df["spanish"]

In [5]:
# Patterns for the Twitter artefacts that preprocess() strips out.
# They are compiled once here and reused for every tweet.
web_re = re.compile(
    r"https?:\/\/[^\s]+",  # http(s) URL, up to the next whitespace character
    flags=re.UNICODE,
)
user_re = re.compile(
    r"(@\w+\-?(?:\w+)?)",  # @mention, optionally followed by one hyphenated part
    flags=re.UNICODE,
)
hashtag_re = re.compile(
    r"(#\w+\-?(?:\w+)?)",  # #hashtag, optionally followed by one hyphenated part
    flags=re.UNICODE,
)

# NLTK stopword lists, keyed by the same language names that tokenize() receives.
stopw = {
    language: nltk.corpus.stopwords.words(language)
    for language in ("english", "spanish")
}

def preprocess(text):
    """Strip URLs, @mentions and #hashtags from ``text`` and lowercase the rest.

    The module-level patterns are applied in the order URL, mention, hashtag.
    Every match is replaced by the empty string, so the whitespace around a
    removed item is left as it was.
    """
    for pattern in (web_re, user_re, hashtag_re):
        text = pattern.sub("", text)
    return text.lower()

def tokenize(text_list, lang="english"):
    """Convert an iterable of raw tweets into lists of cleaned tokens.

    Each text is passed through ``preprocess`` and split with NLTK's
    ``word_tokenize`` for ``lang``. Tokens are kept only when they are purely
    alphanumeric (``str.isalnum``) and are not in the stopword list for
    ``lang``.

    Args:
        text_list: iterable of strings (e.g. a DataFrame text column).
        lang: language name; used both as the ``word_tokenize`` language and
            as the key into the module-level ``stopw`` mapping.

    Returns:
        A list with one token list per input text, in input order.

    Raises:
        KeyError: if ``lang`` has no entry in ``stopw``.
    """
    # Build the stopword set once per call: the original tested membership
    # against a list, a linear scan repeated for every token of every tweet.
    stopwords_for_lang = set(stopw[lang])

    token_list = []
    for text in text_list:
        tokens = word_tokenize(preprocess(text), language=lang)
        token_list.append(
            [word for word in tokens if word.isalnum() and word not in stopwords_for_lang]
        )
    return token_list

# Token lists for every tweet, keyed by language; row order follows df[language].
tokenized_text = {
    language: tokenize(df[language]["text"], language)
    for language in ("english", "spanish")
}

## Text representation using static embeddings

ENGLISH

- word2vec-google-news-300 (loaded with Gensim)
- fasttext-wiki-news-subwords-300 (loaded with Gensim)
- glove-wiki-gigaword-300 (loaded with Gensim)

SPANISH
- FastText (https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) (loaded with Gensim)

### Load the models

In [7]:
# Same alias as the import in the top import cell; repeated here so this cell runs on its own.
import gensim.downloader as api
# Result of the downloader's info() call.
# NOTE(review): `info` is not referenced by any other visible cell — confirm it is still needed.
info = api.info()

### Compute static word-embeddings representation of the tweets

In [8]:
def gensim_sentence_rep(tokens, model):
    """Average the vectors of the tokens that ``model`` knows.

    ``model`` must support ``token in model``, ``model.get_vector(token)`` and
    expose ``model.vector_size``. Tokens missing from the model are skipped.
    When no token is found, a zero vector of length ``model.vector_size`` is
    returned instead of dividing by zero.
    """
    known_vectors = [model.get_vector(tok) for tok in tokens if tok in model]

    # Accumulate into a NumPy zero vector, adding in token order.
    sentence_vec = np.zeros(model.vector_size)
    for vec in known_vectors:
        sentence_vec += vec

    n_known = len(known_vectors)
    return sentence_vec / n_known if n_known else sentence_vec

In [10]:
def get_embeddings(models, language):
    """Compute one sentence vector per tweet for every entry of ``models``.

    ``models`` is iterated as ``(model, name)`` pairs. The tweets are taken
    from the module-level ``tokenized_text[language]`` and each token list is
    passed to ``gensim_sentence_rep``. The returned dict maps every ``name``
    to its list of vectors, in the same order as the token lists.
    """
    embeds = {label: [] for _, label in models}

    for vector_model, label in models:
        embeds[label].extend(
            gensim_sentence_rep(tweet_tokens, vector_model)
            for tweet_tokens in tokenized_text[language]
        )

    return embeds

## Compute cosine similarities

In [11]:
def find_closest_similarity(model_embed, tweets, is_sexist):
  """Print the pair of tweets whose sentence embeddings are most similar.

  Pairwise cosine similarities are computed and rounded to 4 decimals. Only
  pairs above the diagonal (i < j) are considered, so a tweet is never
  compared with itself and each pair is examined once. On ties the first pair
  in upper-triangle order wins.

  Args:
    model_embed: sequence/array with one embedding per row, in the same
      positional order as the rows of ``tweets``.
    tweets: DataFrame with a "text" column; rows are read by position.
    is_sexist: chooses the label shown in the printed header.

  Returns:
    None. The result is printed. Note that the value printed after
    "distance:" is the cosine similarity itself (higher means closer).
  """
  label = "Yes" if is_sexist else "NO"

  # With fewer than two embeddings there is no pair above the diagonal and
  # the argmax below would raise; report it instead of crashing.
  if len(model_embed) < 2:
    print(f"label: {label}\n fewer than two tweets, no pair to compare\n")
    return

  similarity = np.round(cosine_similarity(model_embed, model_embed), 4)

  tri_upper_indices = np.triu_indices_from(similarity, k=1)
  max_index = np.argmax(similarity[tri_upper_indices])
  tweet_idx1, tweet_idx2 = tri_upper_indices[0][max_index], tri_upper_indices[1][max_index]

  print(f"label: {label}\n sentence1: {tweets.iloc[tweet_idx1]['text']} \n --------------------")
  print(f"sentence2: {tweets.iloc[tweet_idx2]['text']} \n distance: {similarity[tweet_idx1, tweet_idx2]}\n")


## Show results

In [12]:
def show_results(tweets, embeds):
  """Print, per model, the closest tweet pair among non-sexist and sexist tweets.

  Args:
    tweets: DataFrame with "label" ("YES"/"NO") and "text" columns.
    embeds: dict mapping a model name to a sequence of sentence embeddings,
      one per row of ``tweets`` and in the same positional order.

  For every model the header is printed first, then the "NO" group, then the
  "YES" group, each through ``find_closest_similarity``.
  """
  # The label split does not depend on the model, so compute it once.
  # Boolean masks select embeddings by row position, which stays correct even
  # when the DataFrame index is not 0..n-1 (the original used index labels as
  # list positions).
  label_groups = [
      (False, (tweets["label"] == "NO").to_numpy()),
      (True, (tweets["label"] == "YES").to_numpy()),
  ]

  for name, model_embed in embeds.items():
    all_embeddings = np.asarray(model_embed)

    print(f"{name}\n======\n")
    # The loop variables deliberately do not reuse the name `tweets`: the
    # original rebound the parameter here, so from the second model onwards
    # the filtering ran on an already-filtered frame.
    for is_sexist, mask in label_groups:
      group_tweets = tweets[mask].reset_index(drop=True)
      find_closest_similarity(all_embeddings[mask], group_tweets, is_sexist)

In [13]:
# Pre-trained Word2Vec vectors, fetched by name through gensim's downloader (`api`).
w2v300 = api.load("word2vec-google-news-300")




In [14]:
# Combined run over all three English models, left disabled — presumably
# because ftsub300 and glwiki300 are only loaded in later cells.
# models = [(w2v300, "w2v300"), (ftsub300, "ftsub300"), (glwiki300, "glwiki300")]
models = [(w2v300, "w2v300")]
# Embed the English tweets with this model, then print the closest pair per label.
embeds = get_embeddings(models, "english")
show_results(english_data, embeds)

w2v300

label: NO
 sentence1: @BLEEDTHISWAY replay free woman breebylon &gt;&gt;&gt; Flop this way 
 --------------------
sentence2: replay&gt;alice&gt;babylon&gt;free woman https://t.co/WCEqeUxdtC 
 distance: 0.9255

label: Yes
 sentence1: @WeaponizedRage Aerosmith in 1987: "Dude looks like a lady" 
 --------------------
sentence2: Dude does not look like a lady! https://t.co/C62JmKSzy0 
 distance: 0.9614



In [15]:
# Pre-trained FastText (subword) vectors, fetched by name through gensim's downloader (`api`).
ftsub300 = api.load("fasttext-wiki-news-subwords-300")



In [16]:
# Same procedure as for the previous model: embed the English tweets with
# ftsub300 only, then print the closest pair per label.
models = [(ftsub300, "ftsub300")]
embeds = get_embeddings(models, "english")
show_results(english_data, embeds)

ftsub300

label: NO
 sentence1: @ChileMATD @lotusmusica @lollapaloozacl Free woman #MARINAEnChile2022 #SideshowParaMARINA 
 --------------------
sentence2: You have the right to be a free woman @antonioguterres @mbachelet @EmmanuelMacron @UNESCO @amnesty @hrw https://t.co/ftQTwQ4izi 
 distance: 0.9541

label: Yes
 sentence1: didn’t have to do Ethan like that tho…i hate women 😒 https://t.co/Zf5y9fsVW2 
 --------------------
sentence2: @mehrospace @hieiishere @greenyankeemanc @RealBetyCardens @LAPDHQ I hate women like you🤣 
 distance: 0.9678



In [17]:
# Pre-trained GloVe vectors, fetched by name through gensim's downloader (`api`).
glwiki300 = api.load("glove-wiki-gigaword-300")



In [18]:
# Embed the English tweets with glwiki300 only, then print the closest pair per label.
models = [(glwiki300, "glwiki300")]
embeds = get_embeddings(models, "english")
show_results(english_data, embeds)

glwiki300

label: NO
 sentence1: @TarekFatah @RashidaTlaib @Ilhan Always playing the victim card. 
 --------------------
sentence2: Where dvmbses? I don't see anything, are you playing the victim card again? 🥱 https://t.co/UCk2kQuWda 
 distance: 0.939

label: Yes
 sentence1: @lkmeenha we can’t even have a day without women making it about themselves 🙄 
 --------------------
sentence2: @BigDILF01 Can’t go a day without women womening 
 distance: 0.9474



In [19]:
# Spanish FastText vectors in word2vec text format (binary=False), read from the public URL.
# NOTE(review): nothing here caches the file locally, so this cell presumably
# fetches the whole archive again on every run — confirm.
ftes300 = KeyedVectors.load_word2vec_format("https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz", binary=False)

In [20]:
# Spanish run: embed the Spanish tweets with ftes300, then print the closest pair per label.
models = [(ftes300, "ftes300")]
embeds = get_embeddings(models, "spanish")
show_results(spanish_data, embeds)

ftes300

label: NO
 sentence1: @rufinelix's account is temporarily unavailable because it violates the Twitter Media Policy. Learn more. 
 --------------------
sentence2: @Moreno19841's account is temporarily unavailable because it violates the Twitter Media Policy. Learn more. 
 distance: 1.0

label: Yes
 sentence1: ☀️ Los hombres también sufren depresión postparto... https://t.co/sAdzd9LUrc 
 --------------------
sentence2: 🍏 Los hombres también sufren depresión postparto... https://t.co/OVNEvgr0ZC 
 distance: 1.0

