In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, SimpleRNN

In [None]:
# Number of training epochs; read by create_and_train_rnn_model when it calls model.fit
epochs = 15

In [None]:
# Remap the sentiment categories onto three classes
def simplify_sentiment(sentiment):
    """Collapse a sentiment label into one of three classes.

    Labels 0 and 1 map to 0 (negative), label 2 maps to 1 (neutral) and
    every other value maps to 2 (positive).
    """
    if sentiment in (0, 1):
        return 0  # Negative
    if sentiment == 2:
        return 1  # Neutral
    return 2  # Positive

In [None]:
# Load the dataset from a .tsv file
data = pd.read_csv('train.tsv', sep='\t')

# Replace missing phrases with an empty string
data['Phrase'] = data['Phrase'].fillna('')

# Collapse the sentiment labels into three classes
data['Sentiment'] = data['Sentiment'].apply(simplify_sentiment)

# Separate features and labels
X = data['Phrase']
y = data['Sentiment']

# Encode the labels (fit_transform returns a NumPy array, which the
# positional indexing below relies on)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Stratified train/test split for the recurrent models. The split is drawn
# BEFORE tokenizing so the vocabulary can be learned from training text only;
# the indices depend only on the labels and the number of samples.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=13)
train_index, test_index = next(sss.split(X, y))

# Fit the tokenizer on the training phrases only, so no information from the
# test phrases leaks into the vocabulary, then encode and pad every phrase.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X.iloc[train_index])
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=100)

X_train, X_test = X_padded[train_index], X_padded[test_index]
y_train, y_test = y[train_index], y[test_index]

# Train/test split of the raw phrases for Naive Bayes (same seed and stratification)
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)

In [None]:
# Build, train and evaluate one recurrent classifier
def create_and_train_rnn_model(model_type, X_train, y_train, X_test, y_test):
    """Train an Embedding -> recurrent layer -> softmax classifier and score it.

    The number of epochs is read from the module-level ``epochs`` variable.

    Args:
        model_type: 'RNN', 'LSTM' or 'GRU'; selects the SimpleRNN, LSTM or
            GRU layer respectively.
        X_train: training inputs passed to ``model.fit``; token ids are
            expected to be below 10000, the Embedding's ``input_dim``
            (presumably padded integer sequences -- confirm against caller).
        y_train: integer class labels for ``X_train`` (three classes).
        X_test: inputs used for the final evaluation.
        y_test: true labels for ``X_test``.

    Returns:
        The weighted F1 score of the trained model on the test data.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # Fail fast: without this check an unknown name would silently build a
    # model with no recurrent layer between the Embedding and the Dense layer.
    if model_type not in ('RNN', 'LSTM', 'GRU'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'RNN', 'LSTM' or 'GRU'"
        )

    recurrent_layers = {'RNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}

    model = Sequential()
    # `input_length` is deprecated in Keras 3 and not needed: the sequence
    # length is inferred from the data on the first call.
    model.add(Embedding(input_dim=10000, output_dim=128))
    model.add(recurrent_layers[model_type](128))
    model.add(Dense(3, activation='softmax'))  # three sentiment classes

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model, holding out 20% of the training data for validation
    model.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_split=0.2)

    # Evaluate on the test set: pick the most probable class per sample
    y_pred = np.argmax(model.predict(X_test), axis=1)
    print(f"Confusion Matrix - {model_type}")
    print(confusion_matrix(y_test, y_pred))
    print(f"Classification Report - {model_type}")
    print(classification_report(y_test, y_pred))

    # Compute and return the weighted F1 score
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Weighted F1 Score - {model_type}: {f1:.4f}")

    return f1

In [None]:
# ---------------------------------------------
# Naive Bayes implementation
# ---------------------------------------------

# Bag-of-words counts feeding a multinomial Naive Bayes classifier
naive_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction are chained
y_pred_nb = naive_model.fit(X_train_nb, y_train_nb).predict(X_test_nb)

# Report the test-set metrics
print("Confusion Matrix - Naive Bayes", confusion_matrix(y_test_nb, y_pred_nb), sep="\n")
print("Classification Report - Naive Bayes", classification_report(y_test_nb, y_pred_nb), sep="\n")

# Weighted F1 score for Naive Bayes
nb_f1 = f1_score(y_test_nb, y_pred_nb, average='weighted')
print(f"Weighted F1 Score - Naive Bayes: {nb_f1:.4f}")

# Train the recurrent models in turn and collect their weighted F1 scores
results = {
    kind: create_and_train_rnn_model(kind, X_train, y_train, X_test, y_test)
    for kind in ('RNN', 'LSTM', 'GRU')
}
rnn_f1, lstm_f1, gru_f1 = results['RNN'], results['LSTM'], results['GRU']

# Add the Naive Bayes baseline last so the comparison order is unchanged
results['Naive Bayes'] = nb_f1

# Pick the model with the highest weighted F1 score
best_model = max(results, key=results.get)
print(f"Best model is {best_model} with a weighted F1 score of {results[best_model]:.4f}")

Confusion Matrix - Naive Bayes
[[ 4237  2261   371]
 [ 1939 11519  2458]
 [  365  2424  5638]]
Classification Report - Naive Bayes
              precision    recall  f1-score   support

           0       0.65      0.62      0.63      6869
           1       0.71      0.72      0.72     15916
           2       0.67      0.67      0.67      8427

    accuracy                           0.69     31212
   macro avg       0.67      0.67      0.67     31212
weighted avg       0.68      0.69      0.69     31212

Weighted F1 Score - Naive Bayes: 0.6850




Epoch 1/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 67ms/step - accuracy: 0.5929 - loss: 0.8954 - val_accuracy: 0.7173 - val_loss: 0.7000
Epoch 2/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 66ms/step - accuracy: 0.7408 - loss: 0.6435 - val_accuracy: 0.7207 - val_loss: 0.6912
Epoch 3/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 66ms/step - accuracy: 0.7689 - loss: 0.5748 - val_accuracy: 0.7385 - val_loss: 0.6570
Epoch 4/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 64ms/step - accuracy: 0.7862 - loss: 0.5284 - val_accuracy: 0.7263 - val_loss: 0.7041
Epoch 5/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 63ms/step - accuracy: 0.7994 - loss: 0.4940 - val_accuracy: 0.7299 - val_loss: 0.6873
Epoch 6/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 63ms/step - accuracy: 0.8003 - loss: 0.4915 - val_accuracy: 0.7355 - val_loss: 0.693



[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m656s[0m 209ms/step - accuracy: 0.6358 - loss: 0.8203 - val_accuracy: 0.7317 - val_loss: 0.6563
Epoch 2/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 209ms/step - accuracy: 0.7594 - loss: 0.5886 - val_accuracy: 0.7428 - val_loss: 0.6327
Epoch 3/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 210ms/step - accuracy: 0.7919 - loss: 0.5118 - val_accuracy: 0.7469 - val_loss: 0.6395
Epoch 4/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 212ms/step - accuracy: 0.8143 - loss: 0.4552 - val_accuracy: 0.7456 - val_loss: 0.6623
Epoch 5/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 211ms/step - accuracy: 0.8259 - loss: 0.4194 - val_accuracy: 0.7443 - val_loss: 0.6848
Epoch 6/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m656s[0m 210ms/step - accuracy: 0.8384 - loss: 0.3841 - val_accuracy: 0.7362 - val_loss: 0.7350
Epo



[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m565s[0m 180ms/step - accuracy: 0.6407 - loss: 0.8131 - val_accuracy: 0.7290 - val_loss: 0.6573
Epoch 2/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 185ms/step - accuracy: 0.7625 - loss: 0.5800 - val_accuracy: 0.7431 - val_loss: 0.6339
Epoch 3/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m606s[0m 180ms/step - accuracy: 0.7923 - loss: 0.5090 - val_accuracy: 0.7414 - val_loss: 0.6404
Epoch 4/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 180ms/step - accuracy: 0.8118 - loss: 0.4600 - val_accuracy: 0.7457 - val_loss: 0.6498
Epoch 5/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m579s[0m 185ms/step - accuracy: 0.8269 - loss: 0.4198 - val_accuracy: 0.7435 - val_loss: 0.6703
Epoch 6/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 185ms/step - accuracy: 0.8407 - loss: 0.3835 - val_accuracy: 0.7377 - val_loss: 0.7212
Epo

# With Preprocessing

In [None]:
def preprocess_text(text):
    """Clean a phrase for vectorization.

    Drops every character that is not an ASCII letter or whitespace,
    lowercases the result, removes words found in the module-level
    ``stop_words`` collection and lemmatizes the remaining words with the
    module-level ``lemmatizer``. Returns the words joined by single spaces.
    """
    # Keep letters and whitespace only, then normalize case
    letters_only = re.sub(r'[^A-Za-z\s]', '', text).lower()

    # Filter stop words and lemmatize the survivors in a single pass
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

    # Join the words back into one phrase
    return ' '.join(lemmas)

In [None]:
# Fetch the NLTK resources used by preprocess_text
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset from a .tsv file
data = pd.read_csv('train.tsv', sep='\t')

# Replace missing phrases with an empty string
data['Phrase'] = data['Phrase'].fillna('')

# Collapse the sentiment labels into three classes
data['Sentiment'] = data['Sentiment'].apply(simplify_sentiment)

# Module-level helpers read by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Apply the text preprocessing to every phrase
data['Processed_Phrase'] = data['Phrase'].apply(preprocess_text)

# Separate features and labels
X = data['Processed_Phrase']
y = data['Sentiment']

# Encode the labels (fit_transform returns a NumPy array, which the
# positional indexing below relies on)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Stratified train/test split. The split is drawn BEFORE tokenizing so the
# vocabulary can be learned from training text only; the indices depend only
# on the labels and the number of samples.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=13)
train_index, test_index = next(sss.split(X, y))

# Fit the tokenizer on the training phrases only, so no information from the
# test phrases leaks into the vocabulary, then encode and pad every phrase.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X.iloc[train_index])
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=100)

X_train, X_test = X_padded[train_index], X_padded[test_index]
y_train, y_test = y[train_index], y[test_index]

# ---------------------------------------------
# Naive Bayes implementation with preprocessing
# ---------------------------------------------

# Bag-of-words counts feeding a multinomial Naive Bayes classifier
naive_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# Stratified train/test split of the preprocessed phrases for Naive Bayes
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
    stratify=y,
)

# fit() returns the fitted pipeline, so training and prediction are chained
y_pred_nb = naive_model.fit(X_train_nb, y_train_nb).predict(X_test_nb)

# Report the test-set metrics
print("Confusion Matrix - Naive Bayes", confusion_matrix(y_test_nb, y_pred_nb), sep="\n")
print("Classification Report - Naive Bayes", classification_report(y_test_nb, y_pred_nb), sep="\n")

# Weighted F1 score for Naive Bayes
nb_f1 = f1_score(y_test_nb, y_pred_nb, average='weighted')
print(f"Weighted F1 Score - Naive Bayes: {nb_f1:.4f}")

# Train the recurrent models in turn and collect their weighted F1 scores
results = {
    kind: create_and_train_rnn_model(kind, X_train, y_train, X_test, y_test)
    for kind in ('RNN', 'LSTM', 'GRU')
}
rnn_f1, lstm_f1, gru_f1 = results['RNN'], results['LSTM'], results['GRU']

# Add the Naive Bayes baseline last so the comparison order is unchanged
results['Naive Bayes'] = nb_f1

# Pick the model with the highest weighted F1 score
best_model = max(results, key=results.get)
print(f"Best model is {best_model} with a weighted F1 score of {results[best_model]:.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Confusion Matrix - Naive Bayes
[[ 4058  2401   410]
 [ 1711 11854  2351]
 [  330  2518  5579]]
Classification Report - Naive Bayes
              precision    recall  f1-score   support

           0       0.67      0.59      0.63      6869
           1       0.71      0.74      0.73     15916
           2       0.67      0.66      0.67      8427

    accuracy                           0.69     31212
   macro avg       0.68      0.67      0.67     31212
weighted avg       0.69      0.69      0.69     31212

Weighted F1 Score - Naive Bayes: 0.6872
Epoch 1/15




[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 65ms/step - accuracy: 0.6159 - loss: 0.8622 - val_accuracy: 0.7133 - val_loss: 0.7055
Epoch 2/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m273s[0m 69ms/step - accuracy: 0.7392 - loss: 0.6505 - val_accuracy: 0.7231 - val_loss: 0.6831
Epoch 3/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 67ms/step - accuracy: 0.7672 - loss: 0.5766 - val_accuracy: 0.7286 - val_loss: 0.6858
Epoch 4/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 66ms/step - accuracy: 0.7786 - loss: 0.5472 - val_accuracy: 0.7273 - val_loss: 0.6954
Epoch 5/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 66ms/step - accuracy: 0.7944 - loss: 0.5089 - val_accuracy: 0.7243 - val_loss: 0.6991
Epoch 6/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 65ms/step - accuracy: 0.8041 - loss: 0.4805 - val_accuracy: 0.7234 - val_loss: 0.7068
Epoch 7/1



[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m645s[0m 206ms/step - accuracy: 0.6376 - loss: 0.8279 - val_accuracy: 0.7189 - val_loss: 0.6824
Epoch 2/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0m 208ms/step - accuracy: 0.7504 - loss: 0.6106 - val_accuracy: 0.7299 - val_loss: 0.6652
Epoch 3/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 221ms/step - accuracy: 0.7787 - loss: 0.5403 - val_accuracy: 0.7365 - val_loss: 0.6630
Epoch 4/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m702s[0m 208ms/step - accuracy: 0.7951 - loss: 0.4923 - val_accuracy: 0.7355 - val_loss: 0.6792
Epoch 5/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m681s[0m 208ms/step - accuracy: 0.8098 - loss: 0.4545 - val_accuracy: 0.7335 - val_loss: 0.7093
Epoch 6/15
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m656s[0m 210ms/step - accuracy: 0.8196 - loss: 0.4243 - val_accuracy: 0.7296 - val_loss: 0.7368
Epo