# In [6]:
import pandas as pd

# Location of the training data: a semicolon-delimited, UTF-8 encoded CSV.
train_set = "fake_news_spanish.csv"

# Read the whole dataset into memory as a DataFrame.
df = pd.read_csv(
    train_set,
    encoding='utf-8',
    sep=';',
)


# In [11]:
import pandas as pd
import re
import unicodedata
import inflect
import joblib

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin

# Asegúrate de descargar esto una vez en tu entorno
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')


# 
class Preprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalizes raw text documents.

    Every document is tokenized, lower-cased, stripped of punctuation and
    accents, filtered against the NLTK Spanish stop-word list and finally
    expanded into its Lancaster stems followed by its WordNet verb lemmas.
    ``transform`` returns one space-joined string per input document, which
    is the format a text vectorizer expects.

    NOTE(review): ``LancasterStemmer``, ``WordNetLemmatizer`` and the
    ``inflect`` number speller are English-language tools, while the stop
    words are Spanish. If the corpus is Spanish, these steps likely have
    little linguistic effect (and digits are spelled out in English) --
    confirm whether a Spanish stemmer was intended.
    NOTE(review): ``word_tokenize`` is called with its default language;
    confirm whether ``language='spanish'`` was intended.
    """

    # Matches any character that is neither a word character nor whitespace.
    # Compiled once so it is not looked up again for every token.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')

    def __init__(self):
        raw_stop_words = stopwords.words('spanish')
        # Tokens are compared against this set *after* their accents have
        # been stripped, so the set must also contain the accent-free form
        # of every stop word; otherwise accented stop words are never
        # filtered out.
        self.stop_words = set(raw_stop_words) | {
            self._strip_accents(word) for word in raw_stop_words
        }
        self.stemmer = LancasterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.p = inflect.engine()

    @staticmethod
    def _strip_accents(word):
        """Return *word* reduced to plain ASCII (accents and non-ASCII dropped)."""
        decomposed = unicodedata.normalize('NFKD', word)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no learned state."""
        return self

    def transform(self, X):
        """Clean every document in *X*.

        Args:
            X: Iterable of raw text documents.

        Returns:
            A list with one string per document: the cleaned tokens' stems
            followed by their lemmas, joined by single spaces.
        """
        return [
            ' '.join(self.stem_and_lemmatize(self.clean_text(text)))
            for text in X
        ]

    def clean_text(self, text):
        """Tokenize *text* and return its normalized, stop-word-free tokens.

        Args:
            text: A raw document. ``None`` and float NaN are treated as an
                empty document; any other non-string value is converted
                with ``str``.

        Returns:
            A list of lower-cased, punctuation-free, ASCII-only tokens with
            stop words and empty tokens removed.
        """
        if not isinstance(text, str):
            # Missing values (None / NaN) would make the tokenizer raise;
            # treat them as empty text instead of aborting the whole batch.
            is_missing = text is None or (
                isinstance(text, float) and text != text
            )
            text = '' if is_missing else str(text)

        cleaned = []
        for token in word_tokenize(text):
            token = token.lower()
            if token.isdigit():
                # Spell out numbers so they become ordinary word features.
                token = self.p.number_to_words(token)
            token = self._PUNCTUATION_RE.sub('', token)
            token = self._strip_accents(token)
            # Drop tokens emptied by the steps above, and stop words.
            if token.strip() and token not in self.stop_words:
                cleaned.append(token)
        return cleaned

    def stem_and_lemmatize(self, words):
        """Return the stems of *words* followed by their verb lemmas.

        Args:
            words: List of cleaned tokens.

        Returns:
            A list twice as long as *words*: all stems first, then all
            lemmas, each in the original token order.
        """
        stems = [self.stemmer.stem(word) for word in words]
        lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
        return stems + lemmas

# Training script: expects `df` to be loaded already with the
# 'Descripcion', 'Titulo' and 'Label' columns.

# Raw text feature: description followed by title, with missing values
# replaced by empty strings before concatenation.
description_text = df['Descripcion'].fillna('')
title_text = df['Titulo'].fillna('')
df['texto'] = description_text + ' ' + title_text

# Features and target used for training.
X, y = df['texto'], df['Label']

# Full pipeline: custom cleaning -> TF-IDF vectorization -> Naive Bayes.
steps = [
    ('preprocessing', Preprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB(alpha=1.0)),
]
pipeline = Pipeline(steps)

# Fit every stage on the whole dataset.
pipeline.fit(X, y)

# Persist the fitted pipeline to disk.
joblib.dump(pipeline, 'modelo_pipeline.joblib')
print(" Modelo entrenado y guardado como 'modelo_pipeline.joblib'")


# Out: Index(['ID', 'Label', 'Titulo', 'Descripcion', 'Fecha'], dtype='object')


# Out: KeyboardInterrupt: