Now we will tokenize the descriptions of the events. The complete list of events, with their descriptions and event codes, can be downloaded [here](https://www.gdeltproject.org/data/lookups/CAMEO.eventcodes.txt).

First, we need to import all the libraries that we will use for the tokenization:

In [None]:
import pandas as pd

# Language detection
# !pip install google-cloud-translate
!pip install langdetect
from langdetect import detect, LangDetectException

import re # RegEx
import string
from sklearn.feature_extraction.text import TfidfVectorizer # To vectorize

# For stopwords
import nltk
from nltk.corpus import stopwords
# For tokenization and lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# For punctuation
import unicodedata

from google.colab import files

from sentence_transformers import SentenceTransformer
!pip install -U sentence-transformers

from sklearn.metrics.pairwise import cosine_similarity

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 14.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... done
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=b2747646160c5912be8f11617a30445dc1a62420f81e74208fa9852c7723821e
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


After that, we define a dictionary with the events we selected earlier as the most relevant ones for our investigation:

In [None]:
# Events selected for the study, keyed by event code. The codes are split
# into families of related descriptions and merged top to bottom, so
# event_dict iterates in exactly the order written here.

# 14xx: demonstrations, strikes/boycotts and violent protests
_protest_events = {
    1413: 'Demonstrate for rights',
    1414: 'Demonstrate for change in institutions, regime',
    1431: 'Conduct strike or boycott for leadership change',
    1432: 'Conduct strike or boycott for policy change',
    1433: 'Conduct strike or boycott for rights',
    1434: 'Conduct strike or boycott for change in institutions, regime',
    1451: 'Engage in violent protest for leadership change',
    1452: 'Engage in violent protest for policy change',
    1453: 'Engage in violent protest for rights',
    1454: 'Engage in violent protest for change in institutions, regime',
}

# 16x: broken relations, embargoes and sanctions
_relations_events = {
    161: 'Reduce or break diplomatic relations',
    163: 'Impose embargo, boycott, or sanctions',
}

# 17x: expulsion, repression and cyber attacks
_coercion_events = {
    174: 'Expel or deport individuals',
    175: 'Use tactics of violent repression',
    176: 'Attack cybernetically',
}

# 19x: conventional military force
_armed_conflict_events = {
    190: 'Use conventional military force, not specified below',
    191: 'Impose blockade, restrict movement',
    192: 'Occupy territory',
    193: 'Fight with small arms and light weapons',
    194: 'Fight with artillery and tanks',
    196: 'Violate ceasefire',
}

# 20x: unconventional mass violence
_mass_violence_events = {
    200: 'Use unconventional mass violence, not specified below',
    201: 'Engage in mass expulsion',
    202: 'Engage in mass killings',
    203: 'Engage in ethnic cleansing',
    2041: 'Use chemical, biological, or radiological weapons',
    2042: 'Detonate nuclear weapons',
}

event_dict = {
    **_protest_events,
    **_relations_events,
    **_coercion_events,
    **_armed_conflict_events,
    **_mass_violence_events,
}

After that, we transform the dictionary into a pandas dataframe.

In [None]:
# Turn the dictionary into a two-column table: the keys become 'EventCode'
# and the values become 'Description'.
event_series = pd.Series(event_dict)
event_df = (
    event_series
    .rename_axis('EventCode')         # name the index so it becomes the first column
    .reset_index(name='Description')  # move the index into a column, name the values
)
event_df

Unnamed: 0,EventCode,Description
0,1413,Demonstrate for rights
1,1414,"Demonstrate for change in institutions, regime"
2,1431,Conduct strike or boycott for leadership change
3,1432,Conduct strike or boycott for policy change
4,1433,Conduct strike or boycott for rights
5,1434,Conduct strike or boycott for change in instit...
6,1451,Engage in violent protest for leadership change
7,1452,Engage in violent protest for policy change
8,1453,Engage in violent protest for rights
9,1454,Engage in violent protest for change in instit...


Next, we download the NLTK resources, define the cleaning helpers, and apply the tokenization to the dataframe:

In [None]:
# Fetch the NLTK data used by the cleaning helpers. Each resource is
# requested exactly once; nltk.download() reports packages that are already
# up to date instead of downloading them again.
nltk.download('stopwords')  # stop-word lists (English is used below)
nltk.download('punkt_tab')  # tokenizer data, presumably what word_tokenize loads
                            # (older NLTK releases shipped it as 'punkt')
nltk.download('wordnet')    # WordNet corpus for WordNetLemmatizer

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Lemmatizer shared by every call to the cleaning function
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def remove_punct(text):
    """Return *text* with every Unicode punctuation character removed.

    A character counts as punctuation when its Unicode general category
    starts with 'P'. Removed characters are dropped, not replaced by a
    space, so neighbouring characters end up adjacent.
    """
    def is_kept(char):
        return not unicodedata.category(char).startswith('P')

    return ''.join(filter(is_kept, text))

In [None]:
def clean_and_lemmatize(text):
    """Normalize *text* and return its lemmatized tokens joined by spaces.

    Steps, in order:
      1. drop bracketed annotations such as "[Verse 1]", "[Intro]" or "[Artist]";
      2. lowercase the text;
      3. strip punctuation with remove_punct;
      4. tokenize with word_tokenize;
      5. discard tokens found in stop_words;
      6. lemmatize the remaining tokens.
    """
    without_brackets = re.sub(r'\[.*?\]', '', text)
    normalized = remove_punct(without_brackets.lower())

    lemmas = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue  # stop words carry no content for the embedding step
        lemmas.append(lemmatizer.lemmatize(token))

    # Rebuild a single string so the result can be stored in a text column
    return " ".join(lemmas)

In [None]:
# Clean and lemmatize every event description; keep the result in 'Tokens'
event_df['Tokens'] = event_df['Description'].map(clean_and_lemmatize)

With this done, every event description now has its tokenized form in the `Tokens` column.

In [None]:
# Display the events table, which now also carries the 'Tokens' column
event_df

Unnamed: 0,EventCode,Description,Tokens
0,1413,Demonstrate for rights,demonstrate right
1,1414,"Demonstrate for change in institutions, regime",demonstrate change institution regime
2,1431,Conduct strike or boycott for leadership change,conduct strike boycott leadership change
3,1432,Conduct strike or boycott for policy change,conduct strike boycott policy change
4,1433,Conduct strike or boycott for rights,conduct strike boycott right
5,1434,Conduct strike or boycott for change in instit...,conduct strike boycott change institution regime
6,1451,Engage in violent protest for leadership change,engage violent protest leadership change
7,1452,Engage in violent protest for policy change,engage violent protest policy change
8,1453,Engage in violent protest for rights,engage violent protest right
9,1454,Engage in violent protest for change in instit...,engage violent protest change institution regime


In [None]:
# Sentence-BERT encoder used to embed the tokenized event descriptions
model = SentenceTransformer('all-MiniLM-L6-v2')

# event_df is expected to exist already with the columns
# 'EventCode', 'Description' and 'Tokens'.
# Encode every entry of 'Tokens'; the result holds one row per event.
event_embeddings = model.encode(list(event_df['Tokens']))

# One named column (dim_0, dim_1, ...) per embedding dimension
n_dims = event_embeddings.shape[1]
embedding_cols = [f'dim_{i}' for i in range(n_dims)]
embeddings_df = pd.DataFrame(data=event_embeddings, columns=embedding_cols)

# Place each event code next to its embedding vector
event_embeddings_df = pd.concat(
    [event_df[['EventCode']], embeddings_df],
    axis='columns',
)

# Display the resulting table
event_embeddings_df

Unnamed: 0,EventCode,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_374,dim_375,dim_376,dim_377,dim_378,dim_379,dim_380,dim_381,dim_382,dim_383
0,1413,-0.02164,0.092606,0.000398,-0.010073,-0.01941,0.073137,0.019721,0.018777,-0.044716,...,0.084632,0.086719,0.000282,0.004671,-0.062479,0.066642,0.126728,0.185229,0.001672,0.027591
1,1414,-0.01042,0.030111,0.033047,-0.031883,0.000189,0.04197,-0.07694,-0.07316,0.014532,...,0.02936,-0.013575,0.064565,0.043103,-0.046829,0.034354,0.049309,-0.042716,0.03105,-0.065633
2,1431,0.000552,0.0612,0.072482,0.00793,0.073794,0.081634,0.015473,-0.036991,0.006676,...,0.084282,-0.013112,-0.00236,-0.002201,-0.0643,-0.024541,0.133183,-0.075717,-0.035333,0.032761
3,1432,0.00694,0.068486,0.06569,-0.017111,0.081493,0.105526,0.063645,-0.029573,-0.025323,...,0.086302,-0.031998,-0.026142,-0.000712,-0.022631,0.006062,0.09412,-0.060563,-0.042275,0.01825
4,1433,0.006056,0.084824,0.035867,0.016662,0.054566,0.102494,0.079147,-0.054465,-0.006691,...,0.084619,0.013784,-0.002689,-0.003078,-0.018814,-0.01693,0.132929,0.005337,-0.040209,0.041834
5,1434,0.007744,0.042949,0.032382,-0.03487,0.021768,0.072331,0.029142,-0.048709,0.014116,...,0.062094,-0.043896,0.008525,0.012271,-0.049681,0.006671,0.121421,-0.088677,-0.032237,-0.019968
6,1451,0.067913,0.031424,0.00529,0.035436,0.072055,0.093044,-0.004432,-0.017399,0.005915,...,0.018401,0.006124,0.026362,0.051437,-0.032771,0.013837,0.075228,-0.06512,-0.016181,0.003674
7,1452,0.085912,0.025451,0.000506,0.000614,0.086107,0.117108,0.057488,0.011871,-0.022412,...,0.028132,-0.004207,0.000624,0.049598,0.003513,0.049081,0.041876,-0.054227,-0.008599,-0.014399
8,1453,0.09117,0.046158,-0.035479,0.049433,0.042945,0.134539,0.047756,-0.01267,-1.2e-05,...,0.038628,0.055938,0.042775,0.035912,0.021026,0.020556,0.107957,0.016643,-0.027849,0.006418
9,1454,0.046047,0.006502,-0.025134,-0.021724,0.030338,0.075708,0.011283,-0.028784,0.022648,...,0.022676,-0.013616,0.034456,0.077416,-0.000744,0.056243,0.056805,-0.076115,-0.014007,-0.060867


In [None]:
# Save as CSV
# Write the embeddings table to a CSV file (row index omitted), then hand
# that same file to google.colab's files.download.
csv_name = "event_embeddings.csv"
event_embeddings_df.to_csv(csv_name, index=False)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Embeddings test


In [None]:
# NOTE: embedding_df is only created in the example cell further down (the
# one that encodes `lyrics` and `event`); run that cell before this one.
embedding_df

Unnamed: 0,Lyrics_Embedding,Event_Embedding
0,0.038402,0.095709
1,-0.006545,0.020431
2,0.070753,-0.038726
3,0.021017,0.026122
4,0.132701,0.057468
...,...,...
379,0.052563,0.031954
380,0.138161,0.086054
381,-0.031101,-0.016860
382,-0.043779,-0.018039


In [None]:
# NOTE: lyrics_vec is only created in the example cell further down (the
# one that encodes `lyrics` and `event`); run that cell before this one.
lyrics_vec

array([ 3.84021774e-02, -6.54496299e-03,  7.07525909e-02,  2.10173931e-02,
        1.32700771e-01,  3.55593115e-02,  9.49487761e-02, -7.64409453e-02,
        4.63715568e-02, -3.71928848e-02,  4.17159032e-03, -6.98366482e-03,
        8.50878842e-03,  4.38331552e-02,  3.08998097e-02,  3.25566791e-02,
       -2.30069570e-02,  1.12047438e-02, -6.15467429e-02, -4.87102121e-02,
       -8.60594772e-03,  4.52064425e-02, -2.57961731e-03,  8.13151374e-02,
       -6.88630715e-02,  3.63689624e-02,  2.22282647e-03,  5.05578257e-02,
       -7.15991929e-02,  1.30130239e-02,  8.95130038e-02,  1.43196210e-02,
       -6.60066977e-02,  5.21461107e-02,  8.17506611e-02, -5.67109063e-02,
        5.47797140e-03,  1.11162197e-02, -3.83068845e-02,  2.44684350e-02,
        4.08621728e-02, -5.38341030e-02,  2.39306763e-02, -6.52450919e-02,
       -1.27832498e-02, -2.26569902e-02,  7.63467923e-02, -4.10908498e-02,
        8.46060738e-02, -5.57830147e-02,  7.75837377e-02,  2.41342988e-02,
       -5.82332211e-03, -

In [None]:
# Sentence-BERT encoder for this self-contained example
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example inputs: an already-cleaned lyric fragment and an event description
lyrics = "burn hollywood burn smell riot protest streets anger"
event = "engage violent protest"

# Encode each text separately into its own embedding vector
lyrics_vec, event_vec = (model.encode(text) for text in (lyrics, event))

# Put both vectors side by side and print the first five dimensions
embedding_df = pd.DataFrame(
    {
        'Lyrics_Embedding': lyrics_vec,
        'Event_Embedding': event_vec,
    }
)
print(embedding_df.head())

# cosine_similarity works on 2-D inputs, so each vector becomes a single row;
# the 1x1 result holds the similarity between the two texts
similarity = cosine_similarity(
    lyrics_vec.reshape(1, -1),
    event_vec.reshape(1, -1),
)[0, 0]
print(f"\n📏 Cosine similarity: {similarity:.3f}")

   Lyrics_Embedding  Event_Embedding
0          0.038402         0.095709
1         -0.006545         0.020431
2          0.070753        -0.038726
3          0.021017         0.026122
4          0.132701         0.057468

📏 Cosine similarity: 0.471
