In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Descargar recursos necesarios de NLTK
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df_subset = pd.read_csv('/content/drive/My Drive/NLP/datos_explorados.csv')

In [4]:
print(df_subset['text'].apply(type).value_counts())

text
<class 'str'>    8000
Name: count, dtype: int64


In [6]:
def preprocess_text(text):
    """
    Función principal que realiza todo el preprocesamiento del texto.
    """
    text = remove_html(text)
    text = convert_to_lowercase(text)
    text = expand_contractions(text)  # Manejo de contracciones
    text = remove_special_characters(text)
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_words(tokens)
    processed_text = ' '.join(tokens)
    return processed_text

def remove_html(text):
    """Elimina etiquetas HTML y separa palabras concatenadas."""
    # Eliminar etiquetas HTML y reemplazar por espacio
    text = re.sub(r'<.*?>', ' ', text)
    # Reemplazar caracteres no alfabéticos por espacio
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Reemplazar múltiples espacios en blanco por uno solo
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def convert_to_lowercase(text):
    """Convierte el texto a minúsculas."""
    return text.lower()

def expand_contractions(text):
    """Expande contracciones comunes."""
    contractions_dict = {
        "don't": "do not",
        "can't": "cannot",
        "I've": "I have",
        "you're": "you are",
        "he's": "he is",
        "she's": "she is",
        "it's": "it is",
        "we're": "we are",
        "they're": "they are",
        "wasn't": "was not",
        "weren't": "were not",
        "isn't": "is not",
        "aren't": "are not",
        "i'm": "I am",
        "let's": "let us",
        "that's": "that is",

    }

    # Reemplazar contracciones en el texto
    for contraction, expanded in contractions_dict.items():
        text = re.sub(r'\b' + contraction + r'\b', expanded, text, flags=re.IGNORECASE)

    return text

def remove_special_characters(text):
    """Elimina caracteres especiales y números."""
    return re.sub(r'[^a-zA-Z\s]', '', text)

def tokenize_text(text):
    """Tokeniza el texto."""
    return word_tokenize(text)

def remove_stopwords(tokens):
    """Elimina stopwords."""
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def lemmatize_words(tokens):
    """Lematiza las palabras."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Aplicar el preprocesamiento a la columna de texto
df_subset['processed_text'] = df_subset['text'].apply(preprocess_text)

# Verificar los resultados
print(df_subset[['text', 'processed_text']].head())

                                                text  \
0  This is a very warped story about the author/B...   
1  Grandma bought this book at a brick and mortar...   
2  worst woods book i've read. and i have read th...   
3  Boring and flat.  Couldn't wait to finish it t...   
4  Don't waste your money on this deeply unhelpfu...   

                                      processed_text  
0  warped story author brenner sex dolphin yes do...  
1  grandma bought book brick mortar store feel li...  
2  worst wood book read read believe stuart wood ...  
3       boring flat wait finish start something else  
4  waste money deeply unhelpful book unless tryin...  


In [9]:
# Verificar filas con cadenas vacías en processed_text
empty_strings = df_subset[df_subset['processed_text'] == '']
print("Filas con cadenas vacías en processed_text:")
print(empty_strings)

Filas con cadenas vacías en processed_text:
Empty DataFrame
Columns: [text, rating, processed_text]
Index: []


In [10]:
df_subset = df_subset[df_subset['processed_text'] != '']

El preprocesamiento ha sido exitoso, resultando en una columna 'processed_text' que refleja una limpieza efectiva del contenido original. Las etiquetas HTML han sido eliminadas y el texto ha sido convertido a minúsculas, facilitando su análisis. Además, se han expandido las contracciones comunes, lo que mejora la comprensión. Se han eliminado las stopwords, permitiendo que el análisis se enfoque en términos más significativos. La lematización ha reducido palabras como "woods" a "wood", ayudando a agrupar términos relacionados. En general, el preprocesamiento proporciona una base sólida para el análisis de sentimiento y modelización, aunque se recomienda revisar la lista de stopwords y considerar ajustes adicionales para mejorar la calidad del texto procesado.

In [11]:
# Guardar como CSV en Google Drive
df_subset.to_csv('/content/drive/My Drive/NLP/datos_procesados.csv', index=False)