In [2]:
import re
import nltk
import spacy
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources (stop-word list and punkt tokenizer data).
nltk.download('stopwords')
nltk.download('punkt')

# Load the small English spaCy pipeline. Download it only when it is not
# installed yet: an unconditional download on every run is slow, requires
# network access and triggers spaCy's "restart to reload dependencies" warning.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # raised by spacy.load when the model package is missing
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# English stop words from NLTK, kept as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Example reviews, each paired with its score
reviews = [
    ("The product is amazing! I love it so much. Highly recommended!", 5),
    ("It's okay, not the best but not the worst either.", 3),
    ("Terrible experience. The quality is very bad.", 1)
]

def preprocess_text(text):
    """Normalize a review into a space-separated string of lemmas.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, and the result is tokenized and lemmatized with
    the module-level spaCy pipeline ``nlp``. Tokens are dropped when they are
    whitespace, punctuation, or when their lemma is in ``stop_words``.

    Args:
        text: Raw review text.

    Returns:
        The remaining lemmas joined by single spaces (possibly an empty
        string).
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Tokenize and lemmatize with spaCy
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        # Deleting characters in step 2 can leave runs of spaces, which spaCy
        # emits as whitespace tokens; skip them so they do not show up as
        # stray "lemmas" in the output.
        if not token.is_space
        and not token.is_punct
        and token.lemma_ not in stop_words
    ]

    # 4. Join the processed tokens
    return ' '.join(tokens)

# Apply preprocessing to each review
processed_reviews = [(preprocess_text(text), score) for text, score in reviews]

# Convert to a DataFrame for use in ML
df = pd.DataFrame(processed_reviews, columns=['review', 'score'])

# TF-IDF vectorizer; it is fitted on the training split only (see below)
vectorizer = TfidfVectorizer()
y = df['score']

# Check that there is enough data to split
if len(df) > 1:
    # Split the *text* before vectorizing. Fitting TF-IDF on the full dataset
    # would let the test documents influence the vocabulary and IDF weights
    # (data leakage), which makes the evaluation optimistic.
    reviews_train, reviews_test, y_train, y_test = train_test_split(
        df['review'], y, test_size=0.5, random_state=42
    )
    X_train = vectorizer.fit_transform(reviews_train)
    X_test = vectorizer.transform(reviews_test)

    # Train a simple model (Naive Bayes)
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
else:
    print("No hay suficientes datos para entrenar el modelo.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alumno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alumno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Accuracy: 0.00
