In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [2]:
train_df = pd.read_csv('C:/Users/cesco/Desktop/Personal/UPY/9/NLP/proyecto/train.csv', header=None, names=['polarity', 'summary', 'reviewText'])

In [3]:
# Descargar las stopwords de NLTK
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Cargar los datos (reemplazar con la ruta correcta si es necesario)
train_df = pd.read_csv('train.csv', header=None, names=['polarity', 'summary', 'reviewText'])

# Reemplazar NaN por cadenas vacías en las columnas de texto
train_df['summary'] = train_df['summary'].fillna('')
train_df['reviewText'] = train_df['reviewText'].fillna('')

# Función para preprocesar el texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar caracteres no alfabéticos (números, signos de puntuación, etc.)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Eliminar stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Aplicar el preprocesamiento a las columnas de texto
train_df['cleaned_summary'] = train_df['summary'].apply(preprocess_text)
train_df['cleaned_reviewText'] = train_df['reviewText'].apply(preprocess_text)

# Verificar cómo quedaron las reseñas
print(train_df[['summary', 'cleaned_summary', 'reviewText', 'cleaned_reviewText']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cesco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                             summary  \
0                     Stuning even for the non-gamer   
1              The best soundtrack ever to anything.   
2                                           Amazing!   
3                               Excellent Soundtrack   
4  Remember, Pull Your Jaw Off The Floor After He...   

                   cleaned_summary  \
0            stuning even nongamer   
1    best soundtrack ever anything   
2                          amazing   
3             excellent soundtrack   
4  remember pull jaw floor hearing   

                                          reviewText  \
0  This sound track was beautiful! It paints the ...   
1  I'm reading a lot of reviews saying that this ...   
2  This soundtrack is my favorite music of all ti...   
3  I truly like this soundtrack and I enjoy video...   
4  If you've played the game, you know how divine...   

                                  cleaned_reviewText  
0  sound track beautiful paints senery min

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import numpy as np

# Dividir el dataset en entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(train_df['cleaned_reviewText'], train_df['polarity'], test_size=0.2, random_state=42)

# Vectorizar los textos utilizando TFIDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Crear el clasificador de regresión logística
log_reg = LogisticRegression(max_iter=1000)

# Realizar la validación cruzada con K-folds (por ejemplo, 5 folds)
cross_val_scores = cross_val_score(log_reg, X_train_tfidf, y_train, cv=5, scoring='accuracy')

# Mostrar los resultados de la validación cruzada
print(f"Cross-validation scores (Logistic Regression): {cross_val_scores}")
print(f"Mean cross-validation score: {np.mean(cross_val_scores)}")

# Entrenar el modelo en todo el conjunto de entrenamiento (ya que K-fold ya hizo la validación)
log_reg.fit(X_train_tfidf, y_train)

# Predecir en los datos de prueba
y_pred = log_reg.predict(tfidf_vectorizer.transform(X_test))

# Mostrar el reporte de clasificación
print("Clasificación Regresión Logística")
print(classification_report(y_test, y_pred))


Cross-validation scores (Logistic Regression): [0.86578993 0.86598264 0.86582465 0.86681771 0.86624132]
Mean cross-validation score: 0.8661312500000001
Clasificación Regresión Logística
              precision    recall  f1-score   support

           1       0.87      0.86      0.87    359759
           2       0.86      0.87      0.87    360241

    accuracy                           0.87    720000
   macro avg       0.87      0.87      0.87    720000
weighted avg       0.87      0.87      0.87    720000



In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping

# Codificar las etiquetas en números
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_df['polarity'])

# Tokenización del texto
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['cleaned_reviewText'])
X_seq = tokenizer.texts_to_sequences(train_df['cleaned_reviewText'])

# Padding de las secuencias
max_len = 80
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post')

# Dividir el dataset en entrenamiento y prueba final (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Inicializar KFold para 5 splits
kf = KFold(n_splits=2, shuffle=True, random_state=42)

# Listas para almacenar las métricas de cada fold
fold_accuracies = []

# KFold Cross Validation
for train_index, val_index in kf.split(X_train):
    # Dividir el dataset en entrenamiento y validación para cada fold
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Crear el modelo DNN
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compilar el modelo
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # EarlyStopping para evitar sobreajuste
    early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    # Entrenar el modelo
    model.fit(X_train_fold, y_train_fold, epochs=4, batch_size=64, validation_data=(X_val_fold, y_val_fold), callbacks=[early_stopping])

    # Evaluar el modelo en el conjunto de validación de este fold
    loss, accuracy = model.evaluate(X_val_fold, y_val_fold)
    fold_accuracies.append(accuracy)
    print(f"Accuracy en el fold: {accuracy:.4f}")

# Promedio de las precisiones de todos los folds
mean_accuracy = np.mean(fold_accuracies)
print(f"Precisión promedio (K-fold): {mean_accuracy:.4f}")

# Evaluación final en el conjunto de prueba
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy en el conjunto de prueba final: {accuracy:.4f}")


Epoch 1/4




[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m886s[0m 39ms/step - accuracy: 0.7077 - loss: 0.4822 - val_accuracy: 0.8817 - val_loss: 0.2795
Epoch 2/4
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m868s[0m 39ms/step - accuracy: 0.8827 - loss: 0.2785 - val_accuracy: 0.8883 - val_loss: 0.2672
Epoch 3/4
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m880s[0m 39ms/step - accuracy: 0.8881 - loss: 0.2667 - val_accuracy: 0.8915 - val_loss: 0.2602
Epoch 4/4
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m884s[0m 39ms/step - accuracy: 0.8921 - loss: 0.2587 - val_accuracy: 0.8931 - val_loss: 0.2575
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 6ms/step - accuracy: 0.8929 - loss: 0.2580
Accuracy en el fold: 0.8931
Epoch 1/4
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m911s[0m 40ms/step - accuracy: 0.7308 - loss: 0.4605 - val_accuracy: 0.8820 - val_loss: 0.2795
Epoch 2/4
[1m22500/22500[