In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the training set from a path relative to the working directory (the
# same 'train.csv' the preprocessing cell reads) instead of a user-specific
# absolute path, so the notebook can run on other machines.
# The CSV has no header row, so the column names are supplied explicitly.
# NOTE(review): assumes the notebook is started from the folder that contains
# train.csv - confirm.
train_df = pd.read_csv('train.csv', header=None, names=['polarity', 'summary', 'reviewText'])

Preprocessing

In [3]:
# Download the NLTK stopword corpus and keep the English list as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the data (replace with the correct path if necessary)
train_df = pd.read_csv(
    'train.csv',
    header=None,
    names=['polarity', 'summary', 'reviewText'],
)

# Replace NaN with empty strings in each of the text columns
for text_column in ('summary', 'reviewText'):
    train_df[text_column] = train_df[text_column].fillna('')

# Función para preprocesar el texto
def preprocess_text(text):
    """Clean a single text string.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted (not replaced by a space, so "non-gamer" becomes
    "nongamer"), and tokens present in the module-level ``stop_words`` set
    are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    # Strip digits, punctuation and any other non-letter characters
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # Keep only the tokens that are not stopwords
    kept_tokens = (token for token in letters_only.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Run the cleaning function over each raw text column and store the result
# in a new "cleaned_" column next to the original
train_df['cleaned_summary'] = train_df['summary'].map(preprocess_text)
train_df['cleaned_reviewText'] = train_df['reviewText'].map(preprocess_text)

# Check how the reviews look after cleaning (raw and cleaned side by side)
preview_columns = ['summary', 'cleaned_summary', 'reviewText', 'cleaned_reviewText']
print(train_df[preview_columns].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cesco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                             summary  \
0                     Stuning even for the non-gamer   
1              The best soundtrack ever to anything.   
2                                           Amazing!   
3                               Excellent Soundtrack   
4  Remember, Pull Your Jaw Off The Floor After He...   

                   cleaned_summary  \
0            stuning even nongamer   
1    best soundtrack ever anything   
2                          amazing   
3             excellent soundtrack   
4  remember pull jaw floor hearing   

                                          reviewText  \
0  This sound track was beautiful! It paints the ...   
1  I'm reading a lot of reviews saying that this ...   
2  This soundtrack is my favorite music of all ti...   
3  I truly like this soundtrack and I enjoy video...   
4  If you've played the game, you know how divine...   

                                  cleaned_reviewText  
0  sound track beautiful paints senery min

Vectorization and Modeling

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the dataset into training and test sets (80% training, 20% test)
review_texts = train_df['cleaned_reviewText']
polarity_labels = train_df['polarity']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    polarity_labels,
    test_size=0.2,
    random_state=42,
)

# Vectorize the texts with TF-IDF: the vectorizer is fitted on the training
# split only and then reused to transform the test split
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train the logistic regression classifier
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)

# Predict on the test data and show the classification report
y_pred = log_reg.predict(X_test_tfidf)
print("Clasificación Regresión Logística")
print(classification_report(y_test, y_pred))


Clasificación Regresión Logística
              precision    recall  f1-score   support

           1       0.87      0.86      0.87    359759
           2       0.86      0.87      0.87    360241

    accuracy                           0.87    720000
   macro avg       0.87      0.87      0.87    720000
weighted avg       0.87      0.87      0.87    720000



In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Encode the polarity labels as integers (LabelEncoder maps the classes to
# 0..n_classes-1), which is the target format used with the sigmoid output
# and binary cross-entropy loss below
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Tokenize the text: the vocabulary is fitted on the training split only and
# its size is capped by `max_words`
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to exactly `max_len` token ids; shorter sequences are
# zero-padded at the end (padding='post')
max_len = 80
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Build the model: embedding -> spatial dropout -> LSTM -> dense head
model = Sequential()
# `input_length` is intentionally not passed: the argument is deprecated in
# Keras 3 (it only triggers a warning there) and is optional in earlier
# versions, so the sequence length is taken from the data fed to `fit`.
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(64, activation='relu'))
# Single sigmoid unit: outputs the probability of the class encoded as 1
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics and the final evaluation below are computed on the
# same samples; use a separate validation split if these metrics are ever
# used for model selection or early stopping.
model.fit(
    X_train_pad,
    y_train_encoded,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_pad, y_test_encoded),
)

# Evaluate the model on the test split
loss, accuracy = model.evaluate(X_test_pad, y_test_encoded)
print(f"Accuracy del modelo DNN: {accuracy:.4f}")




Epoch 1/5
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2054s[0m 46ms/step - accuracy: 0.7584 - loss: 0.4289 - val_accuracy: 0.8912 - val_loss: 0.2603
Epoch 2/5
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2066s[0m 46ms/step - accuracy: 0.8893 - loss: 0.2653 - val_accuracy: 0.8968 - val_loss: 0.2482
Epoch 3/5
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2161s[0m 48ms/step - accuracy: 0.8941 - loss: 0.2552 - val_accuracy: 0.8991 - val_loss: 0.2432
Epoch 4/5
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2291s[0m 51ms/step - accuracy: 0.8962 - loss: 0.2507 - val_accuracy: 0.9002 - val_loss: 0.2411
Epoch 5/5
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2357s[0m 52ms/step - accuracy: 0.8973 - loss: 0.2477 - val_accuracy: 0.9012 - val_loss: 0.2398
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 12ms/step - accuracy: 0.9011 - loss: 0.2399
Accuracy del modelo DNN: 0.9012


In [None]:
from sklearn.metrics import classification_report

# Get the DNN's predicted probabilities for the test split and turn them
# into hard 0/1 labels with a 0.5 threshold
dnn_probabilities = model.predict(X_test_pad)
y_pred_dnn = (dnn_probabilities > 0.5).astype(int)

# Show the classification report
print("Clasificación DNN")
print(classification_report(y_test_encoded, y_pred_dnn))


[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 11ms/step
Clasificación DNN
              precision    recall  f1-score   support

           0       0.90      0.90      0.90    359759
           1       0.90      0.90      0.90    360241

    accuracy                           0.90    720000
   macro avg       0.90      0.90      0.90    720000
weighted avg       0.90      0.90      0.90    720000

