In [1]:
import re
import os
import json
import time
import pickle
import itertools
import random
from typing import Callable, Optional

import numpy as np
import pandas as pd
from datasets import load_dataset

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS

import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

import mlflow
import mlflow.sklearn

In [2]:
SEED = 42
random.seed(SEED); np.random.seed(SEED)

MLFLOW_URI = "file:./mlruns"
EXPERIMENT_NAME = "sentiment140_ablation_preproc"
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# SpaCy model (English)
SPACY_MODEL = "en_core_web_lg"

In [3]:
URL_RE = re.compile(r"(https?://\S+|www\.\S+)")
MENTION_RE = re.compile(r"@\w+")
HASHTAG_RE = re.compile(r"#")
MULTI_SPACE_RE = re.compile(r"\s+")
PUNCT_RE = re.compile(r"[^\w\s]")  # opción simple para quitar puntuación

def normalize_elongation_token(token: str) -> str:
    # Reduce repeticiones >2 a 2 (goooood -> good)
    return re.sub(r'(.)\1{2,}', r'\1\1', token)

def demojize_text(text: str) -> str:
    # convertir emojis a texto :smile:
    return emoji.demojize(text, delimiters=(" ", " "))

def remove_emojis(text: str) -> str:
    return emoji.replace_emoji(text, replace=" ")

# Factory: devuelve función preprocess(text)
def preprocess_factory(
    nlp: Optional[English],
    lemmatize: bool = False,
    drop_stopwords: bool = False,
    handle_emojis: str = "keep",  # "keep" | "demojize" | "remove"
    drop_punct: bool = False,
    normalize_elong: bool = False,
):
    stopwords = set(SPACY_STOPWORDS)

    def preprocess(text: str) -> str:
        if text is None:
            return ""
        x = text.lower()
        # quitar urls y menciones
        x = URL_RE.sub(" ", x)
        x = MENTION_RE.sub(" ", x)
        # remover símbolo # pero conservar la palabra
        x = HASHTAG_RE.sub("", x)

        # emojis
        if handle_emojis == "demojize":
            x = demojize_text(x)
        elif handle_emojis == "remove":
            x = remove_emojis(x)
        # normalizar espacios
        x = MULTI_SPACE_RE.sub(" ", x).strip()

        # uso spaCy para tokenizar/lematizar/stopwords/punct
        if nlp is None:
            # fallback simple tokenization
            toks = x.split()
            toks_proc = []
            for t in toks:
                if normalize_elong:
                    t = normalize_elongation_token(t)
                if drop_punct:
                    t = PUNCT_RE.sub("", t)
                if drop_stopwords and t in stopwords:
                    continue
                toks_proc.append(t)
            return " ".join(toks_proc)

        doc = nlp(x)
        out_tokens = []
        for tok in doc:
            txt = tok.text
            if normalize_elong:
                txt = normalize_elongation_token(txt)
            if drop_punct and tok.is_punct:
                continue
            if drop_stopwords and tok.text in stopwords:
                continue
            if lemmatize:
                lemma = tok.lemma_.strip()
                if lemma == "-PRON-":
                    lemma = tok.text
                if lemma != "":
                    out_tokens.append(lemma)
            else:
                out_tokens.append(txt)
        return " ".join(out_tokens)

    return preprocess

In [4]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", 
                 encoding="latin-1", 
                 names=["target", "ids", "date", "flag", "user", "text"])

df.head()


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Undersample: 20000 samples per target class
df_sampled = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)

# Preprocesamiento: usando preprocess_factory con spaCy
nlp = spacy.load(SPACY_MODEL, disable=["parser", "ner"])
preprocess = preprocess_factory(
    nlp=nlp,
    lemmatize=True,
    drop_stopwords=True,
    handle_emojis="demojize",
    drop_punct=True,
    normalize_elong=True
)

df_sampled['text_preproc'] = df_sampled['text'].apply(preprocess)
df_sampled.head()

  df_sampled = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)


Unnamed: 0,target,ids,date,flag,user,text,text_preproc
0,0,1974671194,Sat May 30 13:36:31 PDT 2009,NO_QUERY,simba98,@xnausikaax oh no! where did u order from? tha...,oh u order horrible
1,0,1997882236,Mon Jun 01 17:37:11 PDT 2009,NO_QUERY,Seve76,A great hard training weekend is over. a coup...,great hard training weekend couple day rest le...
2,0,2177756662,Mon Jun 15 06:39:05 PDT 2009,NO_QUERY,x__claireyy__x,"Right, off to work Only 5 hours to go until I...",right work 5 hour free xd
3,0,2216838047,Wed Jun 17 20:02:12 PDT 2009,NO_QUERY,Balasi,I am craving for japanese food,crave japanese food
4,0,1880666283,Fri May 22 02:03:31 PDT 2009,NO_QUERY,djrickdawson,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow get to work...


In [7]:
# Undersample: 20000 samples per target class
df_sampled_2 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)

# Preprocesamiento: usando preprocess_factory con spaCy
nlp = spacy.load(SPACY_MODEL, disable=["parser", "ner"])
preprocess = preprocess_factory(
    nlp=nlp,
    lemmatize=False,
    drop_stopwords=True,
    handle_emojis="demojize",
    drop_punct=True,
    normalize_elong=True
)

df_sampled_2['text_preproc'] = df_sampled_2['text'].apply(preprocess)
df_sampled_2.head()

  df_sampled_2 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)


Unnamed: 0,target,ids,date,flag,user,text,text_preproc
0,0,1974671194,Sat May 30 13:36:31 PDT 2009,NO_QUERY,simba98,@xnausikaax oh no! where did u order from? tha...,oh u order horrible
1,0,1997882236,Mon Jun 01 17:37:11 PDT 2009,NO_QUERY,Seve76,A great hard training weekend is over. a coup...,great hard training weekend couple days rest l...
2,0,2177756662,Mon Jun 15 06:39:05 PDT 2009,NO_QUERY,x__claireyy__x,"Right, off to work Only 5 hours to go until I...",right work 5 hours free xd
3,0,2216838047,Wed Jun 17 20:02:12 PDT 2009,NO_QUERY,Balasi,I am craving for japanese food,craving japanese food
4,0,1880666283,Fri May 22 02:03:31 PDT 2009,NO_QUERY,djrickdawson,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow got ta work...


In [8]:
# Undersample: 20000 samples per target class
df_sampled_3 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)

# Preprocesamiento: usando preprocess_factory con spaCy
nlp = spacy.load(SPACY_MODEL, disable=["parser", "ner"])
preprocess = preprocess_factory(
    nlp=nlp,
    lemmatize=False,
    drop_stopwords=False,
    handle_emojis="demojize",
    drop_punct=True,
    normalize_elong=True
)

df_sampled_3['text_preproc'] = df_sampled_3['text'].apply(preprocess)
df_sampled_3.head()

  df_sampled_3 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)


Unnamed: 0,target,ids,date,flag,user,text,text_preproc
0,0,1974671194,Sat May 30 13:36:31 PDT 2009,NO_QUERY,simba98,@xnausikaax oh no! where did u order from? tha...,oh no where did u order from that 's horrible
1,0,1997882236,Mon Jun 01 17:37:11 PDT 2009,NO_QUERY,Seve76,A great hard training weekend is over. a coup...,a great hard training weekend is over a couple...
2,0,2177756662,Mon Jun 15 06:39:05 PDT 2009,NO_QUERY,x__claireyy__x,"Right, off to work Only 5 hours to go until I...",right off to work only 5 hours to go until i '...
3,0,2216838047,Wed Jun 17 20:02:12 PDT 2009,NO_QUERY,Balasi,I am craving for japanese food,i am craving for japanese food
4,0,1880666283,Fri May 22 02:03:31 PDT 2009,NO_QUERY,djrickdawson,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow got ta work...


In [9]:
# Undersample: 20000 samples per target class
df_sampled_4 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)

# Preprocesamiento: usando preprocess_factory con spaCy
nlp = spacy.load(SPACY_MODEL, disable=["parser", "ner"])
preprocess = preprocess_factory(
    nlp=nlp,
    lemmatize=True,
    drop_stopwords=False,
    handle_emojis="remove",
    drop_punct=False,
    normalize_elong=True
)

df_sampled_4['text_preproc'] = df_sampled_4['text'].apply(preprocess)
df_sampled_4.head()

  df_sampled_4 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)


Unnamed: 0,target,ids,date,flag,user,text,text_preproc
0,0,1974671194,Sat May 30 13:36:31 PDT 2009,NO_QUERY,simba98,@xnausikaax oh no! where did u order from? tha...,oh no ! where do u order from ? that be horrible
1,0,1997882236,Mon Jun 01 17:37:11 PDT 2009,NO_QUERY,Seve76,A great hard training weekend is over. a coup...,a great hard training weekend be over . a coup...
2,0,2177756662,Mon Jun 15 06:39:05 PDT 2009,NO_QUERY,x__claireyy__x,"Right, off to work Only 5 hours to go until I...","right , off to work only 5 hour to go until I ..."
3,0,2216838047,Wed Jun 17 20:02:12 PDT 2009,NO_QUERY,Balasi,I am craving for japanese food,I be crave for japanese food
4,0,1880666283,Fri May 22 02:03:31 PDT 2009,NO_QUERY,djrickdawson,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow get to work...


In [10]:
# Undersample: 20000 samples per target class
df_sampled_5 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)

# Preprocesamiento: usando preprocess_factory con spaCy
nlp = spacy.load(SPACY_MODEL, disable=["parser", "ner"])
preprocess = preprocess_factory(
    nlp=nlp,
    lemmatize=True,
    drop_stopwords=False,
    handle_emojis="demojize",
    drop_punct=False,
    normalize_elong=False
)

df_sampled_5['text_preproc'] = df_sampled_5['text'].apply(preprocess)
df_sampled_5.head()

  df_sampled_5 = df.groupby('target', group_keys=False).apply(lambda x: x.sample(n=20000, random_state=SEED)).reset_index(drop=True)


Unnamed: 0,target,ids,date,flag,user,text,text_preproc
0,0,1974671194,Sat May 30 13:36:31 PDT 2009,NO_QUERY,simba98,@xnausikaax oh no! where did u order from? tha...,oh no ! where do u order from ? that be horrible
1,0,1997882236,Mon Jun 01 17:37:11 PDT 2009,NO_QUERY,Seve76,A great hard training weekend is over. a coup...,a great hard training weekend be over . a coup...
2,0,2177756662,Mon Jun 15 06:39:05 PDT 2009,NO_QUERY,x__claireyy__x,"Right, off to work Only 5 hours to go until I...","right , off to work only 5 hour to go until I ..."
3,0,2216838047,Wed Jun 17 20:02:12 PDT 2009,NO_QUERY,Balasi,I am craving for japanese food,I be crave for japanese food
4,0,1880666283,Fri May 22 02:03:31 PDT 2009,NO_QUERY,djrickdawson,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow get to work...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF
tfidf_uni = TfidfVectorizer()
X_uni_1 = tfidf_uni.fit_transform(df_sampled['text_preproc'])
X_uni_2 = tfidf_uni.fit_transform(df_sampled_2['text_preproc'])
X_uni_3 = tfidf_uni.fit_transform(df_sampled_3['text_preproc'])
X_uni_4 = tfidf_uni.fit_transform(df_sampled_4['text_preproc'])
X_uni_5 = tfidf_uni.fit_transform(df_sampled_5['text_preproc'])

# Bigram TF-IDF
tfidf_bi = TfidfVectorizer(ngram_range=(2, 2))
X_bi_1 = tfidf_bi.fit_transform(df_sampled['text_preproc'])
X_bi_2 = tfidf_bi.fit_transform(df_sampled_2['text_preproc'])
X_bi_3 = tfidf_bi.fit_transform(df_sampled_3['text_preproc'])
X_bi_4 = tfidf_bi.fit_transform(df_sampled_4['text_preproc'])
X_bi_5 = tfidf_bi.fit_transform(df_sampled_5['text_preproc'])

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

def evaluar_datasets(datasets, ngram_range=(1,1), prueba="Unigrama"):
    resultados = []
    
    for i, df in enumerate(datasets, start=1):
        X = df["text_preproc"]
        y = df["target"]

        # Vectorización TF-IDF
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=ngram_range)
        X_tfidf = vectorizer.fit_transform(X)

        # División 80/10/10 estratificada
        X_train, X_temp, y_train, y_temp = train_test_split(
            X_tfidf, y, test_size=0.2, stratify=y, random_state=42
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
        )

        # Entrenamiento con regresión logística
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        # Evaluación en test
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, average="weighted")

        resultados.append((f"Modelo {i}", f1))
    
    print(f"\nResultados {prueba}:")
    for modelo, score in resultados:
        print(f"{modelo}: F1-score = {score:.4f}")


In [18]:
# Lista con los 5 datasets
datasets = [df_sampled, df_sampled_2, df_sampled_3, df_sampled_4, df_sampled_5]

# Prueba 1: TF-IDF Unigrama
evaluar_datasets(datasets, ngram_range=(1,1), prueba="TF-IDF Unigrama")

# Prueba 2: TF-IDF Bigrama
evaluar_datasets(datasets, ngram_range=(1,2), prueba="TF-IDF Bigrama")



Resultados TF-IDF Unigrama:
Modelo 1: F1-score = 0.7379
Modelo 2: F1-score = 0.7404
Modelo 3: F1-score = 0.7727
Modelo 4: F1-score = 0.7742
Modelo 5: F1-score = 0.7730

Resultados TF-IDF Bigrama:
Modelo 1: F1-score = 0.7399
Modelo 2: F1-score = 0.7411
Modelo 3: F1-score = 0.7722
Modelo 4: F1-score = 0.7765
Modelo 5: F1-score = 0.7770
