In [None]:
import pandas as pd
import re
import nltk
import pickle
from sklearn.model_selection import train_test_split

# Fetch the NLTK 'punkt' resource (a no-op when it is already installed).
nltk.download('punkt')

# Build the working DataFrame in one pipeline:
#   1. read the CSV, which has no header row,
#   2. name its four columns,
#   3. keep only rows labelled Positive / Negative / Neutral,
#   4. lower-case the sentiment labels.
df = (
    pd.read_csv('/content/twitter_training.csv', header=None)
    .set_axis(['ID', 'Entity', 'Sentiment', 'Content'], axis=1)
    .loc[lambda frame: frame['Sentiment'].isin(['Positive', 'Negative', 'Neutral'])]
    .assign(Sentiment=lambda frame: frame['Sentiment'].str.lower())
)

# Safe text-cleaning function
def clean_text(text) -> str:
    """Normalise a piece of text into lower-case words separated by one space.

    The steps run in this order: lower-case, drop URLs starting with
    ``http``, drop ``@mentions``, drop ``#hashtags`` (the tag word is removed
    too), drop punctuation including underscores, drop digits, and collapse
    runs of whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#\w+", "", text)     # remove hashtags
    # `\w` also matches "_", so `[^\w\s]` alone would leave underscores behind;
    # match them explicitly so every punctuation character is removed.
    text = re.sub(r"[^\w\s]|_", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)      # remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # collapse repeated whitespace
    return text

# Clean every tweet; missing values are replaced by empty strings first.
raw_content = df['Content'].fillna("")
df['Cleaned_Content'] = raw_content.apply(clean_text)

# 70/30 train/test split, stratified on the sentiment label and seeded so
# the split is reproducible.
model_columns = df[['Cleaned_Content', 'Sentiment']]
train_df, test_df = train_test_split(
    model_columns,
    test_size=0.3,
    stratify=df['Sentiment'],
    random_state=42,
)

# Write each split to its own pickle file.
for split_frame, pickle_path in (
    (train_df, '/content/cleaned_train.pkl'),
    (test_df, '/content/cleaned_test.pkl'),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(split_frame, f)

print("Dataset limpio y dividido guardado como .pkl")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset limpio y dividido guardado como .pkl
