In [16]:
import ast
import getpass
import gzip  # Read the gzip-compressed input file
import json
import os
import re
from urllib.parse import quote_plus

import nltk
import numpy as np
import pandas as pd
from sqlalchemy import create_engine            # Builds the engine used to reach MySQL
from sqlalchemy import text                     # Wraps raw SQL strings so Connection.execute accepts them
from sqlalchemy.exc import SQLAlchemyError      # Base class of the errors caught during the load


In [None]:
# Load the gzip-compressed file; every non-blank line holds one record written
# as a Python dict literal. ast.literal_eval is used because it accepts
# single-quoted strings.
data = []
with gzip.open('/Data/users_items.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        record_text = line.strip()
        if not record_text:
            # A blank line carries no record; literal_eval('') would raise SyntaxError.
            continue
        try:
            data.append(ast.literal_eval(record_text))
        except (ValueError, SyntaxError, TypeError) as e:
            # literal_eval reports malformed text with SyntaxError or TypeError as
            # well as ValueError; catching all three lets one bad line be logged
            # and skipped instead of aborting the whole load.
            print(f"Error decoding line: {line}\nError: {e}")

In [5]:
# Build a pandas DataFrame from the parsed records and show its first five rows
df = pd.DataFrame(data=data)
preview = df.head(n=5)
print(preview)

             user_id  items_count           steam_id  \
0  76561197970982479          277  76561197970982479   
1            js41637          888  76561198035864385   
2          evcentric          137  76561198007712555   
3         Riot-Punch          328  76561197963445855   
4              doctr          541  76561198002099482   

                                            user_url  \
0  http://steamcommunity.com/profiles/76561197970...   
1               http://steamcommunity.com/id/js41637   
2             http://steamcommunity.com/id/evcentric   
3            http://steamcommunity.com/id/Riot-Punch   
4                 http://steamcommunity.com/id/doctr   

                                               items  
0  [{'item_id': '10', 'item_name': 'Counter-Strik...  
1  [{'item_id': '10', 'item_name': 'Counter-Strik...  
2  [{'item_id': '1200', 'item_name': 'Red Orchest...  
3  [{'item_id': '10', 'item_name': 'Counter-Strik...  
4  [{'item_id': '300', 'item_name': 'Day of Defea..

In [6]:
# Remove the columns that are not needed for the analysis
unused_columns = ['user_url', 'steam_id']
df = df.drop(labels=unused_columns, axis='columns')

In [7]:
# Flatten the nested 'items' column: emit one record per (user, item) pair.
# Rows whose 'items' value is not a list, and list entries that are not dicts,
# are skipped.
item_fields = ('item_id', 'playtime_forever', 'playtime_2weeks')
items_list = []

for owner_id, owned_items in zip(df['user_id'], df['items']):
    if not isinstance(owned_items, list):
        continue
    for entry in owned_items:
        if not isinstance(entry, dict):
            continue
        record = {'user_id': owner_id}
        for field in item_fields:
            record[field] = entry[field]
        items_list.append(record)

# Replace df with the flattened table built from the collected records
df = pd.DataFrame(items_list)

# Show the flattened table
print(df)

                   user_id item_id  playtime_forever  playtime_2weeks
0        76561197970982479      10                 6                0
1        76561197970982479      20                 0                0
2        76561197970982479      30                 7                0
3        76561197970982479      40                 0                0
4        76561197970982479      50                 0                0
...                    ...     ...               ...              ...
5153204  76561198329548331  346330                 0                0
5153205  76561198329548331  373330                 0                0
5153206  76561198329548331  388490                 3                3
5153207  76561198329548331  521570                 4                4
5153208  76561198329548331  519140                 3                3

[5153209 rows x 4 columns]


In [8]:
# Keep a single row per (user_id, item_id) pair, retaining the first occurrence
key_columns = ['user_id', 'item_id']
df = df.drop_duplicates(subset=key_columns, keep='first')

In [9]:
# Write the table to a gzip-compressed CSV file, leaving out the index column
output_path = 'API/Datos/Items.csv.gz'
df.to_csv(output_path, compression='gzip', index=False)


In [17]:
# MySQL connection settings.
# Non-secret values fall back to the defaults this notebook has always used.
# The password is never stored in the notebook: it is read from MYSQL_PASSWORD
# or, when that variable is unset, requested interactively.
user = os.environ.get('MYSQL_USER', 'camilo')
password = os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: ')
host = os.environ.get('MYSQL_HOST', '35.226.92.249')
port = os.environ.get('MYSQL_PORT', '3306')
database = os.environ.get('MYSQL_DATABASE', 'steam')
table_name = 'user_playtime'

# Make sure both key columns are strings. astype on the whole frame returns a
# new DataFrame instead of assigning columns into the existing one.
df = df.astype({'user_id': str, 'item_id': str})

# Build the engine. User and password are URL-escaped so characters such as
# '@', ':' or '/' inside them cannot corrupt the connection URL.
engine = create_engine(
    f'mysql+mysqlconnector://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{database}'
)

# Number of rows sent to MySQL per to_sql call
chunk_size = 50000

# Insert only when there is something to insert; previously the empty-frame
# warning was printed and the insert block still ran and reported a commit.
if df.empty:
    print("El DataFrame está vacío. No se puede insertar en la base de datos.")
else:
    print(f"Total rows in items_df: {len(df)}")
    try:
        with engine.connect() as connection:
            trans = connection.begin()  # all chunks share one transaction
            try:
                for start in range(0, len(df), chunk_size):
                    chunk = df.iloc[start:start + chunk_size]  # positional slice
                    # NOTE: if_exists='append' adds rows on every run, so
                    # re-running this cell inserts the same rows again.
                    chunk.to_sql(name=table_name, con=connection, if_exists='append', index=False)
                    print(f'Inserted rows from {start} to {start + len(chunk) - 1}')

                trans.commit()
                print("Transaction committed successfully.")
            except Exception as e:
                print(f"An error occurred during the transaction: {e}")
                trans.rollback()  # undo every chunk inserted so far
    except SQLAlchemyError as e:
        print(f"An error occurred while connecting to the database: {e}")

# Verify the row count of the table after the insert.
# The statement is wrapped in text() because SQLAlchemy 2.0 no longer accepts a
# plain string in Connection.execute().
try:
    with engine.connect() as connection:
        result = connection.execute(text(f'SELECT COUNT(*) FROM {table_name}'))
        count = result.fetchone()[0]
        print(f'Total rows in {table_name}: {count}')
except SQLAlchemyError as e:
    print(f"An error occurred while counting rows: {e}")

Total rows in items_df: 5094082
Inserted rows from 0 to 49999
Inserted rows from 50000 to 99999
Inserted rows from 100000 to 149999
Inserted rows from 150000 to 199999
Inserted rows from 200000 to 249999
Inserted rows from 250000 to 299999
Inserted rows from 300000 to 349999
Inserted rows from 350000 to 399999
Inserted rows from 400000 to 449999
Inserted rows from 450000 to 499999
Inserted rows from 500000 to 549999
Inserted rows from 550000 to 599999
Inserted rows from 600000 to 649999
Inserted rows from 650000 to 699999
Inserted rows from 700000 to 749999
Inserted rows from 750000 to 799999
Inserted rows from 800000 to 849999
Inserted rows from 850000 to 899999
Inserted rows from 900000 to 949999
Inserted rows from 950000 to 999999
Inserted rows from 1000000 to 1049999
Inserted rows from 1050000 to 1099999
Inserted rows from 1100000 to 1149999
Inserted rows from 1150000 to 1199999
Inserted rows from 1200000 to 1249999
Inserted rows from 1250000 to 1299999
Inserted rows from 1300000 t