In [1]:
import pandas as pd
import ast  
import numpy as np
import re
import nltk
import gzip
from sqlalchemy import create_engine

In [None]:
# Abrir y leer el archivo .json.gz
with gzip.open('/Data/user_reviews.json.gz', 'rt', encoding='utf-8') as file:
    data = []
    for line in file:
        data.append(ast.literal_eval(line.strip()))  # Convertir de string a diccionario



In [3]:
# Eliminar la columna 'user_url' y expandir los datos de 'reviews'
processed_data = []
for entry in data:
    user_id = entry['user_id']
    for review in entry['reviews']:
        review['user_id'] = user_id  # Agregar user_id a cada review
        processed_data.append(review)

# Crear un DataFrame de pandas
df = pd.DataFrame(processed_data)

print (df)

                                  funny                     posted  \
0                                         Posted November 5, 2011.   
1                                            Posted July 15, 2011.   
2                                           Posted April 21, 2011.   
3                                            Posted June 24, 2014.   
4                                        Posted September 8, 2013.   
...                                 ...                        ...   
59300                                              Posted July 10.   
59301                                               Posted July 8.   
59302  1 person found this review funny             Posted July 3.   
59303                                              Posted July 20.   
59304                                               Posted July 2.   

      last_edited item_id                                          helpful  \
0                    1250                                   No ratings yet   
1  

In [4]:

# Eliminar la palabra 'posted' al inicio y luego los espacios o comillas simples al inicio y final
df['posted'] = df['posted'].str.replace('Posted', '', regex=True).str.strip(" '")

# Imprimir el DataFrame para verificar los cambios
print(df['posted'])

0         November 5, 2011.
1            July 15, 2011.
2           April 21, 2011.
3            June 24, 2014.
4        September 8, 2013.
                ...        
59300              July 10.
59301               July 8.
59302               July 3.
59303              July 20.
59304               July 2.
Name: posted, Length: 59305, dtype: object


In [5]:

# Eliminar la frase 'people found this review funny' y luego los espacios en blanco
df['funny'] = df['funny'].str.replace('people found this review funny', '', regex=False).str.strip()

# Imprimir el DataFrame para verificar los cambios
print(df['funny'])

0                                        
1                                        
2                                        
3                                        
4                                        
                       ...               
59300                                    
59301                                    
59302    1 person found this review funny
59303                                    
59304                                    
Name: funny, Length: 59305, dtype: object


In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Asegúrate de descargar los recursos requeridos de NLTK solo una vez
nltk.download('vader_lexicon')

# Inicializa el analizador de sentimientos
sia = SentimentIntensityAnalyzer()

def get_sentiment(review):
    # Convertir cualquier valor a cadena
    review = str(review)
    
    if pd.isnull(review) or review.strip() == "":
        return 1  # Valor neutral si no hay reseña
    sentiment_score = sia.polarity_scores(review)['compound']
    if sentiment_score < -0.05:
        return 0  # Malo
    elif sentiment_score > 0.05:
        return 2  # Positivo
    else:
        return 1  # Neutral

# Aplica la función a la columna de reseñas y crea la nueva columna en el DataFrame existente
df['sentiment_analysis'] = df['review'].apply(get_sentiment)

# Elimina la columna review
df = df.drop(columns=['review'])

# Muestra las primeras filas del DataFrame actualizado
print(df.head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kcasi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


  funny              posted last_edited item_id  \
0         November 5, 2011.                1250   
1            July 15, 2011.               22200   
2           April 21, 2011.               43110   
3            June 24, 2014.              251610   
4        September 8, 2013.              227300   

                                           helpful  recommend  \
0                                   No ratings yet       True   
1                                   No ratings yet       True   
2                                   No ratings yet       True   
3  15 of 20 people (75%) found this review helpful       True   
4     0 of 1 people (0%) found this review helpful       True   

             user_id  sentiment_analysis  
0  76561197970982479                   2  
1  76561197970982479                   2  
2  76561197970982479                   2  
3            js41637                   2  
4            js41637                   2  


In [7]:

# Función para reemplazar los valores de la columna 'helpful'
def replace_helpful_values(value):
    if value == 'No ratings yet':
        return 0
    # Busca un patrón de porcentaje en la cadena
    match = re.search(r'\((\d+)%\)', value)
    if match:
        return int(match.group(1))  # Devuelve el número encontrado
    else:
        return np.nan  # Valor por defecto si no se encuentra una coincidencia

# Aplica la función a la columna 'helpful'
df['helpful'] = df['helpful'].apply(replace_helpful_values)

# Asegúrate de que los valores sean enteros
df['helpful'] = df['helpful'].fillna(0).astype(int)


df['id_review'] = range(1, len(df) + 1)  # Esto comenzará desde 1


In [8]:
# Eliminar columnas inecesarias del DataFrame
df = df.drop(columns=['posted', 'last_edited','funny'])


In [9]:

print(df)

      item_id  helpful  recommend            user_id  sentiment_analysis  \
0        1250        0       True  76561197970982479                   2   
1       22200        0       True  76561197970982479                   2   
2       43110        0       True  76561197970982479                   2   
3      251610       75       True            js41637                   2   
4      227300        0       True            js41637                   2   
...       ...      ...        ...                ...                 ...   
59300      70        0       True  76561198312638244                   2   
59301  362890        0       True  76561198312638244                   2   
59302  273110       50       True        LydiaMorley                   2   
59303     730        0       True        LydiaMorley                   2   
59304     440        0       True        LydiaMorley                   2   

       id_review  
0              1  
1              2  
2              3  
3          

In [None]:
from dotenv import load_dotenv
import os

# Cargar las variables de entorno desde el archivo .env
load_dotenv()

# Conexión a la base de datos MySQL
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
database = os.getenv('DB_NAME')

# URL de conexión
connection_string = f'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}'
engine = create_engine(connection_string)

# Escribir el DataFrame en la tabla de MySQL
df.to_sql('reviews', con=engine, index=False, if_exists='append') 

print("Datos subidos exitosamente a MySQL.")

Datos subidos exitosamente a MySQL.


In [15]:
# Guardar el archivo en csv comprimido
df.to_csv('API/Datos/reviews.csv.gz', index=False, compression='gzip')