In [17]:
# Primero vemos algunas filas del texto preprocesado
import pandas as pd

df = pd.read_parquet("../data/train_preprocessed.parquet")
df.head(10)[["messages", "text_clean", "tokens", "lemmas"]]

Unnamed: 0,messages,text_clean,tokens,lemmas
0,Germany!\n\nJust the person I want to speak wi...,germany just the person i want to speak with i...,"['germany', 'person', 'want', 'speak', 'somewh...","['germany', 'person', 'want', 'speak', 'somewh..."
1,"You've whet my appetite, Italy. What's the sug...",you ve whet my appetite italy what s the sugge...,"['ve', 'whet', 'appetite', 'italy', 's', 'sugg...","['ve', 'whet', 'appetite', 'italy', 's', 'sugg..."
2,👍,,[],[]
3,It seems like there are a lot of ways that cou...,it seems like there are a lot of ways that cou...,"['like', 'lot', 'ways', 'wrong', 'don', 't', '...","['like', 'lot', 'way', 'wrong', 'don', 't', 'f..."
4,"Yeah, I can’t say I’ve tried it and it works, ...",yeah i can t say i ve tried it and it works ca...,"['yeah', 't', 've', 'tried', 'works', 'cause',...","['yeah', 't', 've', 'try', 'work', 'cause', 'v..."
5,I am just sensing that you don’t like this ide...,i am just sensing that you don t like this ide...,"['sensing', 'don', 't', 'like', 'idea', 'shall...","['sense', 'don', 't', 'like', 'idea', 'shall',..."
6,Any thoughts?,any thoughts,['thoughts'],['thought']
7,"Sorry Italy I've been away doing, um, German t...",sorry italy i ve been away doing um german thi...,"['sorry', 'italy', 've', 'away', 'um', 'german...","['sorry', 'italy', 've', 'away', 'um', 'german..."
8,"I don't think I'm ready to go for that idea, h...",i don t think i m ready to go for that idea ho...,"['don', 't', 'think', 'm', 'ready', 'idea', 'd...","['don', 't', 'think', 'm', 'ready', 'idea', 'd..."
9,I am pretty conflicted about whether to guess ...,i am pretty conflicted about whether to guess ...,"['pretty', 'conflicted', 'guess', 'telling', '...","['pretty', 'conflicted', 'guess', 'tell', 'tru..."


In [18]:
# Ahora vemos el tamaño del vocabulario de los modelos BoW y TF-IDF
import joblib
cv = joblib.load("../diplomacy/models/representations/bow_vectorizer.joblib")
tfidf = joblib.load("../diplomacy/models/representations/tfidf_vectorizer.joblib")

print("Vocabulario BoW:", len(cv.vocabulary_))
print("Vocabulario TF-IDF:", len(tfidf.vocabulary_))

# Mostrar algunas palabras del vocabulario
list(cv.vocabulary_.keys())[:20]

Vocabulario BoW: 10434
Vocabulario TF-IDF: 10434


['germany',
 'just',
 'the',
 'person',
 'want',
 'to',
 'speak',
 'with',
 'have',
 'somewhat',
 'crazy',
 'idea',
 'that',
 've',
 'always',
 'wanted',
 'try',
 'but',
 'never',
 'actually']

In [20]:
# Vamos a ver ejemplos de embeddings Word2Vec
from gensim.models import Word2Vec
w2v = Word2Vec.load("../diplomacy/models/embeddings/word2vec.model")

# Número de palabras aprendidas
print("Tamaño del vocabulario Word2Vec:", len(w2v.wv.key_to_index))

# Ver los términos más similares a una palabra
w2v.wv.most_similar("support", topn=5)

Tamaño del vocabulario Word2Vec: 2261


[('cut', 0.8187794089317322),
 ('supported', 0.8015599846839905),
 ('supporting', 0.7948482632637024),
 ('bump', 0.7930293679237366),
 ('supports', 0.7880041003227234)]

In [26]:
# Ahora vemos ejemplos de embeddings FastText 
from gensim.models import FastText
ft = FastText.load("../diplomacy/models/embeddings/fasttext.model")
ft.wv.most_similar("attack", topn=5)

[('attacks', 0.9825449585914612),
 ('supply', 0.9691567420959473),
 ('need', 0.9537312388420105),
 ('supposed', 0.9504920840263367),
 ('sup', 0.9486726522445679)]

In [None]:
# Ahora vemos la forma de los embeddings BERT
import numpy as np

bert_train = np.load("../diplomacy/models/embeddings/bert_train.npy")
print("Shape BERT embeddings:", bert_train.shape)

Shape BERT embeddings: (13137, 768)


In [None]:
# Ahora calculamos la cobertura del vocabulario Word2Vec
from collections import Counter
import numpy as np

def compute_coverage(tokens_list, model):
    total, covered = 0, 0
    oov = Counter()
    for tokens in tokens_list:
        for t in tokens:
            total += 1
            if t in model.wv:
                covered += 1
            else:
                oov[t] += 1
    return covered / total, oov

df = pd.read_parquet("../data/train_preprocessed.parquet")
coverage, oov = compute_coverage(df["tokens"], w2v)
print("Cobertura Word2Vec:", round(coverage * 100, 2), "%")
print("Palabras OOV más frecuentes:", oov.most_common(10))


Cobertura Word2Vec: 39.62 %
Palabras OOV más frecuentes: [("'", 239414), (',', 106791), (' ', 106791), ('n', 50253), ('a', 49666), ('i', 43434), ('l', 35197), ('[', 13137), (']', 13137), ('x', 739)]


In [None]:
# Finalmente, vemos la distribución de la longitud de los textos en tokens
df["len_tokens"] = df["tokens"].apply(len)
df["len_tokens"].describe()

count    13137.000000
mean        83.171729
std         88.711164
min          2.000000
25%         28.000000
50%         56.000000
75%        106.000000
max       1127.000000
Name: len_tokens, dtype: float64

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import scipy.sparse as sp

# Cargar TF-IDF
X = sp.load_npz("../diplomacy/models/representations/X_tfidf_train.npz")
y = pd.read_parquet("../data/train_preprocessed.parquet")["sender_labels"]

# Convertir etiquetas a numéricas
y = (y == "True").astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       119
           1       0.95      1.00      0.98      2509

    accuracy                           0.95      2628
   macro avg       0.48      0.50      0.49      2628
weighted avg       0.91      0.95      0.93      2628



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
