# Deep Learning Model
## Luis Arturo
### A01703572

# Conexión a GPU local

In [5]:
import setuptools
import tensorflow as tf
import pandas as pd
import os

In [6]:
def verify_gpu_setup():
    """Comprehensive verification of TensorFlow GPU setup"""
    # Print TensorFlow version
    print(f"TensorFlow version: {tf.__version__}")
    
    # Check CUDA availability
    print("\nCUDA available:", tf.test.is_built_with_cuda())
    
    # List all available GPUs
    physical_devices = tf.config.list_physical_devices()
    print("\nAvailable devices:")
    for device in physical_devices:
        print(f"  {device.device_type}: {device.name}")
    
    # Check if GPU is available and perform a simple computation
    if tf.test.is_built_with_cuda():
        try:
            # Create some random data
            with tf.device('/GPU:0'):
                a = tf.random.normal([1000, 1000])
                b = tf.random.normal([1000, 1000])
                # Perform matrix multiplication
                c = tf.matmul(a, b)
                # Force execution
                result = c.numpy()
                print("\n✅ Successfully performed computation on GPU!")
                print(f"Matrix multiplication shape: {result.shape}")
        except Exception as e:
            print("\n❌ Error during GPU computation:")
            print(e)
    else:
        print("\n❌ CUDA is not available in this TensorFlow installation")

    # Print GPU memory info
    try:
        gpu = tf.config.list_physical_devices('GPU')[0]
        print("\nGPU Device:", gpu)
    except IndexError:
        print("\n❌ No GPU devices found")

verify_gpu_setup()

TensorFlow version: 2.18.0

CUDA available: True

Available devices:
  CPU: /physical_device:CPU:0

✅ Successfully performed computation on GPU!
Matrix multiplication shape: (1000, 1000)

❌ No GPU devices found


W0000 00:00:1730428731.716877    6613 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
# Set TF logging level to avoid unnecessary messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# ETL

## Carga de datos

In [8]:
# Configuración para mostrar todas las columnas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [9]:
# 1. EXTRACT - Carga de datos
print("=== CARGANDO DATASETS ===")
games_df = pd.read_csv('data/games.csv')
users_df = pd.read_csv('data/users.csv')
recommendations_df = pd.read_csv('data/recommendations.csv')

print("=== DATASETS CARGADOS ===")

=== CARGANDO DATASETS ===
=== DATASETS CARGADOS ===


## Información general del dataset

In [10]:
print("\nDimensiones del games_df:", games_df.shape)
print("\nColumnas disponibles del games_df:")
for col in games_df.columns:
    print(f"- {col}")


Dimensiones del games_df: (50872, 13)

Columnas disponibles del games_df:
- app_id
- title
- date_release
- win
- mac
- linux
- rating
- positive_ratio
- user_reviews
- price_final
- price_original
- discount
- steam_deck


In [11]:
print("\nDimensiones del users_df:", users_df.shape)
print("\nColumnas disponibles del users_df:")
for col in users_df.columns:
    print(f"- {col}")


Dimensiones del users_df: (14306064, 3)

Columnas disponibles del users_df:
- user_id
- products
- reviews


In [12]:
print("\nDimensiones del recommendations_df:", recommendations_df.shape)
print("\nColumnas disponibles del recommendations_df:")
for col in recommendations_df.columns:
    print(f"- {col}")


Dimensiones del recommendations_df: (41154794, 8)

Columnas disponibles del recommendations_df:
- app_id
- helpful
- funny
- date
- is_recommended
- hours
- user_id
- review_id


### Análisis inicial de games

In [13]:
print("\nPrimeras 5 filas de games_df:")
print(games_df.head())
print("\nInformación del dataset de juegos:")
print(games_df.info())
print("\nEstadísticas descriptivas de games_df:")
print(games_df.describe())
print("\nValores nulos en games_df:")
print(games_df.isnull().sum())


Primeras 5 filas de games_df:
   app_id                              title date_release   win    mac  linux  \
0   13500  Prince of Persia: Warrior Within™   2008-11-21  True  False  False   
1   22364            BRINK: Agents of Change   2011-08-03  True  False  False   
2  113020       Monaco: What's Yours Is Mine   2013-04-24  True   True   True   
3  226560                 Escape Dead Island   2014-11-18  True  False  False   
4  249050            Dungeon of the ENDLESS™   2014-10-27  True   True  False   

          rating  positive_ratio  user_reviews  price_final  price_original  \
0  Very Positive              84          2199         9.99            9.99   
1       Positive              85            21         2.99            2.99   
2  Very Positive              92          3722        14.99           14.99   
3          Mixed              61           873        14.99           14.99   
4  Very Positive              88          8784        11.99           11.99   

   disc

### Análisis inicial de users

In [14]:
print("\nPrimeras 5 filas de users_df:")
print(users_df.head())
print("\nInformación del dataset de usuarios:")
print(users_df.info())
print("\nEstadísticas descriptivas de users_df:")
print(users_df.describe())
print("\nValores nulos en users_df:")
print(users_df.isnull().sum())


Primeras 5 filas de users_df:
    user_id  products  reviews
0   7360263       359        0
1  14020781       156        1
2   8762579       329        4
3   4820647       176        4
4   5167327        98        2

Información del dataset de usuarios:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14306064 entries, 0 to 14306063
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   products  int64
 2   reviews   int64
dtypes: int64(3)
memory usage: 327.4 MB
None

Estadísticas descriptivas de users_df:
            user_id      products       reviews
count  1.430606e+07  1.430606e+07  1.430606e+07
mean   7.153032e+06  1.163734e+02  2.876738e+00
std    4.129805e+06  2.438515e+02  7.987421e+00
min    0.000000e+00  0.000000e+00  0.000000e+00
25%    3.576516e+06  2.300000e+01  1.000000e+00
50%    7.153032e+06  5.500000e+01  1.000000e+00
75%    1.072955e+07  1.270000e+02  3.000000e+00
max    1.430606e+07  3.221400e+04  6.045000e+03

Valores 

### Análisis inicial de recommendations

In [15]:
print("\nPrimeras 5 filas de recommendations_df:")
print(recommendations_df.head())
print("\nInformación del dataset de recomendaciones:")
print(recommendations_df.info())
print("\nEstadísticas descriptivas de recommendations_df:")
print(recommendations_df.describe())
print("\nValores nulos en recommendations_df:")
print(recommendations_df.isnull().sum())


Primeras 5 filas de recommendations_df:
    app_id  helpful  funny        date  is_recommended  hours  user_id  \
0   975370        0      0  2022-12-12            True   36.3    51580   
1   304390        4      0  2017-02-17           False   11.5     2586   
2  1085660        2      0  2019-11-17            True  336.5   253880   
3   703080        0      0  2022-09-23            True   27.4   259432   
4   526870        0      0  2021-01-10            True    7.9    23869   

   review_id  
0          0  
1          1  
2          2  
3          3  
4          4  

Información del dataset de recomendaciones:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41154794 entries, 0 to 41154793
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7  

### Análisis expecífico de features importantes

In [16]:
# Análisis de ratings
print("\nDistribución de ratings en juegos:")
print(games_df['rating'].value_counts())


Distribución de ratings en juegos:
rating
Positive                   13502
Very Positive              13139
Mixed                      12157
Mostly Positive             8738
Mostly Negative             1849
Overwhelmingly Positive     1110
Negative                     303
Very Negative                 60
Overwhelmingly Negative       14
Name: count, dtype: int64


In [17]:
# Análisis de precios
print("\nEstadísticas de precios:")
print(games_df[['price_final', 'price_original']].describe())


Estadísticas de precios:
        price_final  price_original
count  50872.000000    50872.000000
mean       8.620325        8.726788
std       11.514164       11.507021
min        0.000000        0.000000
25%        0.990000        0.990000
50%        4.990000        4.990000
75%       10.990000       11.990000
max      299.990000      299.990000


In [18]:
# Análisis de recomendaciones
print("\nDistribución de recomendaciones:")
print(recommendations_df['is_recommended'].value_counts(normalize=True))


Distribución de recomendaciones:
is_recommended
True     0.857844
False    0.142156
Name: proportion, dtype: float64


In [19]:
# Análisis de horas jugadas
print("\nEstadísticas de horas jugadas:")
print(recommendations_df['hours'].describe())


Estadísticas de horas jugadas:
count    4.115479e+07
mean     1.006022e+02
std      1.761675e+02
min      0.000000e+00
25%      7.800000e+00
50%      2.730000e+01
75%      9.920000e+01
max      1.000000e+03
Name: hours, dtype: float64


### Verificamos la integridad de referencias entre los datasets
Nos aseguramos de que los juegos y los usuarios referenciados en reseñas existen en el dataset de juegos y en el de usuarios

In [20]:
# Verificar si todos los app_id en recommendations existen en games
games_apps = set(games_df['app_id'])
recommendations_apps = set(recommendations_df['app_id'])
print("\nJuegos en recommendations pero no en games:", len(recommendations_apps - games_apps))


Juegos en recommendations pero no en games: 0


In [21]:
# Verificar si todos los user_id en recommendations existen en users
users_ids = set(users_df['user_id'])
recommendations_users = set(recommendations_df['user_id'])
print("Usuarios en recommendations pero no en users:", len(recommendations_users - users_ids))

Usuarios en recommendations pero no en users: 0


## Se va a crear un modelo capaz de predecir el rating que tendrá un juego ("Very Positive", "Mixed", etc.) 
## Son 9 clases 
- Positive                   13502
- Very Positive              13139
- Mixed                      12157
- Mostly Positive             8738
- Mostly Negative             1849
- Overwhelmingly Positive     1110
- Negative                     303
- Very Negative                 60
- Overwhelmingly Negative       14

## Features de entrada
- Precio
- Plataformas soportadas (win, mac, linux)
- Tiempo en el mercado
- Descuentos
- Métricas agregadas de reviews (horas promedio jugadas, ratio de recomendaciones)

## Preprocesamiendo de los datos

In [23]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

### Función para juntar el dataset de recommendations y el de games

In [24]:
def enrich_games_data(games_df, recommendations_df):
    """
    Enriquece el dataset de juegos con métricas agregadas de recomendaciones
    """
    # Calculamos métricas agregadas por juego
    agg_metrics = recommendations_df.groupby('app_id').agg({
        'hours': ['mean', 'median', 'std'],
        'is_recommended': 'mean',
        'helpful': 'mean',
        'funny': 'mean'
    }).round(3)
    
    # Aplanamos los nombres de las columnas
    agg_metrics.columns = [
        'avg_hours', 'median_hours', 'std_hours',
        'recommendation_ratio', 'avg_helpful', 'avg_funny'
    ]
    
    # Mergeamos con el dataset original de juegos
    enriched_games = games_df.merge(
        agg_metrics, 
        left_on='app_id', 
        right_index=True, 
        how='left'
    )
    
    # Llenamos valores NaN (juegos sin recomendaciones)
    enriched_games = enriched_games.fillna({
        'avg_hours': 0,
        'median_hours': 0,
        'std_hours': 0,
        'recommendation_ratio': 0,
        'avg_helpful': 0,
        'avg_funny': 0
    })
    
    return enriched_games

### Creamos al preprocesador

In [None]:
class GameRatingPreprocessor:
    def __init__(self, max_title_length=10):
        # Inicializamos encoders y tokenizers
        self.title_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
        self.developer_encoder = LabelEncoder()
        self.publisher_encoder = LabelEncoder()
        self.rating_encoder = LabelEncoder()
        self.scalers = {}
        
        # Parámetros de configuración
        self.max_title_length = max_title_length
        self.vocab_size = None
        self.num_developers = None
        self.num_publishers = None
        self.num_ratings = None
        
    def clean_text(self, text):
        """Limpia y normaliza el texto del título"""
        # Convertimos el texto a minúsculas
        text = text.lower()
        # Quitamos cualquier carater especial pero dejamos los espacios
        text = re.sub(r'[^a-z0-9\s]', '', text)
        return text.strip()
    
    def process_numerical_features(self, df):
        """Procesa y escala características numéricas"""
        numerical_features = [
            'price_final', 
            'price_original', 
            'discount',
            'positive_ratio',
            'user_reviews'
        ]
        
        # Crear features adicionales con manejo de infinitos
        # Reemplazar infinitos o valores muy grandes con NaN y luego con 1.0
        df_copy['price_ratio'] = (df_copy['price_final'] / df_copy['price_original']).replace([np.inf, -np.inf], np.nan).fillna(1.0)
        df_copy['platform_count'] = df_copy['win'].astype(int) + df_copy['mac'].astype(int) + df_copy['linux'].astype(int)
        
        # Agregar features derivados
        numerical_features.extend(['price_ratio', 'platform_count'])
        
        # Escalar features numéricos
        X_numerical = df[numerical_features].copy()
        for column in numerical_features:
            scaler = StandardScaler()
            X_numerical[column] = scaler.fit_transform(X_numerical[[column]])
            self.scalers[column] = scaler
            
        return X_numerical
    
    def process_categorical_features(self, df):
        """Procesa características categóricas y crea embeddings"""
        # Procesar títulos
        cleaned_titles = df['title'].apply(self.clean_text)
        self.title_tokenizer.fit_on_texts(cleaned_titles)
        self.vocab_size = len(self.title_tokenizer.word_index) + 1
        title_sequences = self.title_tokenizer.texts_to_sequences(cleaned_titles)
        X_titles = pad_sequences(title_sequences, maxlen=self.max_title_length)
        
        # Procesar desarrolladores
        developers = df['developer'].apply(eval)
        primary_developers = developers.apply(lambda x: x[0] if x else 'Unknown')
        X_developers = self.developer_encoder.fit_transform(primary_developers)
        self.num_developers = len(self.developer_encoder.classes_)
        
        # Procesar publishers
        publishers = df['publisher'].apply(eval)
        primary_publishers = publishers.apply(lambda x: x[0] if x else 'Unknown')
        X_publishers = self.publisher_encoder.fit_transform(primary_publishers)
        self.num_publishers = len(self.publisher_encoder.classes_)
        
        return X_titles, X_developers, X_publishers
    
    def process_target(self, ratings):
        """Procesa la variable objetivo (ratings)"""
        # Crear mapping ordenado de ratings
        rating_order = [
            'Overwhelmingly Negative',
            'Very Negative',
            'Negative',
            'Mostly Negative',
            'Mixed',
            'Mostly Positive',
            'Positive',
            'Very Positive',
            'Overwhelmingly Positive'
        ]
        
        # Encodificar ratings manteniendo el orden
        self.rating_encoder.fit(rating_order)
        y = self.rating_encoder.transform(ratings)
        self.num_ratings = len(rating_order)
        
        return y
    
    def prepare_data(self, games_df, test_size=0.2, val_size=0.2):
        """Prepara todos los datos para el entrenamiento"""
        # Procesar features
        X_numerical = self.process_numerical_features(games_df)
        print("ola k ase")
        X_titles, X_developers, X_publishers = self.process_categorical_features(games_df)
        y = self.process_target(games_df['rating'])
        
        # Dividir los datos
        indices = np.arange(len(games_df))
        indices_train_val, indices_test = train_test_split(
            indices, test_size=test_size, random_state=42, stratify=y
        )
        indices_train, indices_val = train_test_split(
            indices_train_val, test_size=val_size, random_state=42, 
            stratify=y[indices_train_val]
        )
        
        # Función helper para dividir datos
        def split_data(X, indices_train, indices_val, indices_test):
            return (
                X[indices_train], 
                X[indices_val], 
                X[indices_test]
            )
        
        # Preparar todos los conjuntos de datos
        return {
            'numerical': split_data(X_numerical.values, indices_train, indices_val, indices_test),
            'titles': split_data(X_titles, indices_train, indices_val, indices_test),
            'developers': split_data(X_developers, indices_train, indices_val, indices_test),
            'publishers': split_data(X_publishers, indices_train, indices_val, indices_test),
            'target': split_data(y, indices_train, indices_val, indices_test)
        }
    
    def get_feature_dims(self):
        """Retorna las dimensiones de las features para el modelo"""
        return {
            'vocab_size': self.vocab_size,
            'num_developers': self.num_developers,
            'num_publishers': self.num_publishers,
            'num_numerical': len(self.scalers),
            'num_ratings': self.num_ratings
        }

### Preprocesamos el nuevo dataset enriquecido

In [31]:
# Enriquecemos el dataset de juegos
enriched_games = enrich_games_data(games_df, recommendations_df)

# Creamos el preprocesador
preprocessor = GameRatingPreprocessor()

# Preparamos los datos
prepared_data = preprocessor.prepare_data(enriched_games)

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [28]:
enriched_games.info

<bound method DataFrame.info of         app_id                                      title date_release   win  \
0        13500          Prince of Persia: Warrior Within™   2008-11-21  True   
1        22364                    BRINK: Agents of Change   2011-08-03  True   
2       113020               Monaco: What's Yours Is Mine   2013-04-24  True   
3       226560                         Escape Dead Island   2014-11-18  True   
4       249050                    Dungeon of the ENDLESS™   2014-10-27  True   
...        ...                                        ...          ...   ...   
50867  2296380  I Expect You To Die 3: Cog in the Machine   2023-09-28  True   
50868  1272080                                   PAYDAY 3   2023-09-21  True   
50869  1402110                                 Eternights   2023-09-11  True   
50870  2272250                        Forgive Me Father 2   2023-10-19  True   
50871  2488510                                  FatalZone   2023-10-23  True   

       

In [None]:
# Verificamos las dimensiones y estructura de los datos
print("\nDimensiones de los datos preparados:")
for key, (train, val, test) in prepared_data.items():
    print(f"{key}:")
    print(f"  Train: {train.shape}")
    print(f"  Validation: {val.shape}")
    print(f"  Test: {test.shape}")

print("\nDimensiones de features para el modelo:")
for key, value in feature_dims.items():
    print(f"{key}: {value}")