In [1]:

pip install rarfile

Collecting rarfile
  Downloading rarfile-4.0-py3-none-any.whl (28 kB)
Installing collected packages: rarfile
Successfully installed rarfile-4.0


In [2]:
!pip install --upgrade nltk



In [3]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import random
import time
import os
import rarfile
from IPython.display import Markdown, display
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import FreqDist

#Recursos NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:

# Especifica el nombre del archivo RAR
rar_file_name = 'data.rar'  # Reemplaza con el nombre correcto

try:
    # Intenta extraer los archivos del RAR en el directorio actual
    rar = rarfile.RarFile(rar_file_name)
    rar.extractall()

    # Elimina el archivo RAR después de la extracción
    os.remove(rar_file_name)

    print(f'El archivo {rar_file_name} ha sido eliminado con éxito y se ha extraído el contenido en la carpeta /data.')
except FileNotFoundError:
    # Si el archivo RAR no se encuentra, muestra un mensaje personalizado
    display(Markdown(f'<font color="red"><b>El archivo {rar_file_name} no se encuentra en el directorio actual o no puede ser eliminado.</b></font>'))

El archivo data.rar ha sido eliminado con éxito y se ha extraído el contenido en la carpeta /data.


In [5]:
# Especifica la ruta del archivo CSV
csv_file_path = 'data/corpusMini_df.csv'
# Carga el archivo CSV en un DataFrame
corpusMini_df = pd.read_csv(csv_file_path)
# Muestra las primeras 5 filas del DataFrame
print(corpusMini_df.head())

       reviewerID        asin             reviewerName  helpful  \
0   AN4KLPNB56X3Z  B000MD3MIW                   carrie   [3, 3]   
1   A60I915C5M3JE  B000J3HZWE  Ellen Dawson "seriousb"   [2, 3]   
2  A15Q9YEG1XPEJN  B00I18UVO8         Get What We Give   [3, 6]   
3  A3V9TR2U1KISVK  B0029NGZ5K                Sarah1989  [6, 13]   
4  A3QD59N3M7O7KB  B0015GIPDW                  Xina143   [4, 5]   

                                          reviewText  overall  \
0  I was replacing another petsafe door with this...      1.0   
1  I read all of the reviews before purchasing an...      1.0   
2  I didn't realize when I ordered this product t...      1.0   
3  There is very little meat in this food and the...      1.0   
4  We used this skimmer for a few months, but fou...      1.0   

                                             summary  unixReviewTime  \
0                                      Horrible Door      1340323200   
1                                     Waste of Money      1283

In [6]:
# Obtener la lista de stopwords en inglés de NLTK
stop_words = set(stopwords.words('english'))

# Inicializar el lematizador de palabras
lemmatizer = WordNetLemmatizer()

# Tokenizamos, lematizamos, quitamos stopwords y eliminamos caracteres especiales.
def preprocess_text(text):
    # Eliminar caracteres especiales excepto letras, espacios, "/", ".", "--" y puntos seguidos de dígitos
    text = text.lower()
    text = re.sub(r'\.', ' ', text) #Se han detectado comentarios con valores del estilo chemical.Various o meat.Caramel, por tanto se substituyen puntos por espacios.
    text = re.sub(r'[^a-zA-Z\s/.\-]|(?<!\d)\.(?!\d)|http\S+', '', text) #Se eliminan carácteres extraños antes de lematizar.
    words = re.split(r'[./\-\s]+|--+', text) # Separar por "/", ".", "-" y espacios
    filtered_words = [word for word in words if word not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in filtered_words]
    cleaned_text = ' '.join(lemmatized_words)
    # Eliminar stopwords
    filtered_words = [word for word in lemmatized_words if word not in stop_words]
    cleaned_text = ' '.join(filtered_words)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Eliminar espacios en blanco adicionales
    return cleaned_text.strip()  # Eliminar espacios en blanco al principio y al final

# Aplicar la función de preprocesamiento a la columna 'reviewText'
corpusMini_df['preprocessed_text'] = corpusMini_df['reviewText'].apply(preprocess_text)

# Tokenizar el texto preprocesado en palabras individuales
tokenized_words = [word for text in corpusMini_df['preprocessed_text'] for word in word_tokenize(text)]

In [7]:
# Imprimir algunas filas del DataFrame con el texto preprocesado
for index, row in corpusMini_df.head(6).iterrows():
    print(f"Original Text: {row['reviewText']}")
    print(f"Preprocessed Text: {row['preprocessed_text']}")
    print()


Original Text: I was replacing another petsafe door with this one.  The measurements were the same.  To say the least it did not fit.  I had already cut the inside wall, as directed, and it still didn't fit.  Can't return it since I had technically altered the door.  Very flimy and cheap. Finally put the old door back in and drilled holes in the flap of the new to make it fit my old door. The rest is garbage.  The old doors were much stronger and easier to install.  I would never recommend one of these new models.  It would never last, regardless of the weather.  I would give this a negative number if available.  Don't waste your money.
Preprocessed Text: replace another petsafe door one measurements say least fit already cut inside wall direct still didnt fit cant return since technically alter door flimy cheap finally put old door back drill hole flap new make fit old door rest garbage old doors much stronger easier install would never recommend one new model would never last regardl

In [8]:
# Obtener la cantidad de tokenized words
cantidad_tokenized_words = len(tokenized_words)

# Imprimir la cantidad de tokens
print(f"La cantidad de tokenized words es: {cantidad_tokenized_words}")

La cantidad de tokenized words es: 145374


In [9]:
# Obtener una lista de todas las palabras únicas en el corpus
unique_words = set(tokenized_words)

# Calcular la cardinalidad del vocabulario
vocab_cardinality = len(unique_words)

# Imprimir la cardinalidad del vocabulario (tokens únicos)
print(f"Cardinalidad del Vocabulario: {vocab_cardinality}")


Cardinalidad del Vocabulario: 9702


In [10]:
# Coger muestra del corpus
text_data = ' '.join(corpusMini_df['preprocessed_text'])  # Concatenate preprocessed text from the DataFrame

# Definir las palabras token
words = nltk.word_tokenize(text_data)

# Definir los ngrams
N_values = [2, 3]  # You can add more values for other N-grams

for N in N_values:
    # Generar N-grams
    n_grams = list(ngrams(words, N))

    # Calcular la frecuencia de distribución de los ngrams
    freq_dist = FreqDist(n_grams)

    # Obtener los ngrams más comunes
    most_common_ngrams = freq_dist.most_common(10)  # Change 10 to your desired number

    # Imprimir el resultado
    print(f"Top {N}-grams:")
    for ngram, frequency in most_common_ngrams:
        print(f"{ngram}: {frequency} times")
    print("\n")



Top 2-grams:
('litter', 'box'): 163 times
('dog', 'food'): 154 times
('dog', 'love'): 129 times
('work', 'well'): 109 times
('think', 'would'): 104 times
('dont', 'know'): 95 times
('cat', 'love'): 92 times
('can', 'not'): 87 times
('waste', 'money'): 84 times
('would', 'recommend'): 84 times


Top 3-grams:
('dry', 'dog', 'food'): 18 times
('would', 'recommend', 'product'): 17 times
('dont', 'waste', 'money'): 16 times
('buy', 'another', 'one'): 16 times
('last', 'long', 'time'): 14 times
('clean', 'litter', 'box'): 12 times
('year', 'old', 'cat'): 11 times
('think', 'would', 'try'): 11 times
('two', 'year', 'old'): 10 times
('would', 'recommend', 'anyone'): 10 times




In [11]:
# Obtener las 10 palabras más comunes para sentimiento 0
sentiment_0_reviews = corpusMini_df[corpusMini_df['sentiment'] == 0]
tokenized_words_sentiment_0 = [word for text in sentiment_0_reviews['preprocessed_text'] for word in word_tokenize(text)]
freq_dist_sentiment_0 = FreqDist(tokenized_words_sentiment_0)
common_words_sentiment_0 = freq_dist_sentiment_0.most_common(10)

# Imprimir las 10 palabras más comunes para sentimiento 0
print("10 palabras más comunes para sentimiento 0:")
for word, frequency in common_words_sentiment_0:
    print(f"{word}: {frequency}")

# Obtener las 10 palabras más comunes para sentimiento 1
sentiment_1_reviews = corpusMini_df[corpusMini_df['sentiment'] == 1]
tokenized_words_sentiment_1 = [word for text in sentiment_1_reviews['preprocessed_text'] for word in word_tokenize(text)]
freq_dist_sentiment_1 = FreqDist(tokenized_words_sentiment_1)
common_words_sentiment_1 = freq_dist_sentiment_1.most_common(10)

# Imprimir las 10 palabras más comunes para sentimiento 1
print("10 palabras más comunes para sentimiento 1:")
for word, frequency in common_words_sentiment_1:
    print(f"{word}: {frequency}")



10 palabras más comunes para sentimiento 0:
dog: 1349
cat: 967
get: 905
one: 742
use: 721
would: 668
like: 668
product: 540
buy: 536
make: 513
10 palabras más comunes para sentimiento 1:
dog: 1350
cat: 949
get: 845
use: 768
one: 739
like: 713
love: 575
would: 538
food: 508
work: 480


In [12]:
# Agregar una columna 'processedReview' basada en 'preprocessed_text'
corpusMini_df['processedReview'] = corpusMini_df['preprocessed_text']

# Reemplazar valores vacíos en 'processedReview' con NaN
corpusMini_df['processedReview'] = corpusMini_df['processedReview'].replace('', np.nan)

# Eliminar filas con NaN en 'processedReview'
corpusMini_df = corpusMini_df.dropna(subset=['processedReview'])

# Guardar el DataFrame en un archivo CSV
corpusMini_df.to_csv('/content/data/corpusMini_df2.csv', index=False)


In [13]:

# Acceder a la columna 'processedReview'
nueva_columna = corpusMini_df['processedReview']

# Imprimir las primeras 10 filas de la nueva columna
print(nueva_columna.head(10))


0    replace another petsafe door one measurements ...
1    read review purchase extremely hopeful black l...
2    didnt realize order product product recall las...
3    little meat food toxic chemicals various meat ...
4    use skimmer months find remove little waste in...
5    good purpose bathe material wasnt sturdy enoug...
6    purchase border collie package suggest halti s...
7    review gph version dont know versions one litt...
8    cat buy stop scratch furniture scratch post ho...
9    buy think commercial air pump would better job...
Name: processedReview, dtype: object


Con todo este procesado, deberiamos de tener palabras significativas y el texto completamente limpio, exceptuando alguna falta de ortografía.
Por último aunque sospechemos que dog, cat, etc sean palabras irrelevantes las trataremos en la siguiente actividad, con TfidfVectorizer igual que las outlyers producidos por las faltas ortográficas.