# Deep Learning Model
## Luis Arturo
### A01703572

# Imports

In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import setuptools
import tensorflow as tf
from datetime import datetime
import pandas as pd
import sys
import platform
import os
import subprocess

In [6]:
def setup_tensorflow_wsl2():
    """Configure TensorFlow for WSL2 environment"""
    
    print("=== System Information ===")
    print(f"Python version: {platform.python_version()}")
    print(f"TensorFlow version: {tf.__version__}")
    print(f"Platform: {platform.platform()}")
    print(f"WSL version: {subprocess.getoutput('wsl.exe --version')}")
    
    # Configure memory growth
    try:
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("\n✅ GPU memory growth enabled")
        print(f"\nGPUs available: {gpus}")
    except Exception as e:
        print(f"\n⚠️ Error configuring GPU: {e}")
    
    # Set TensorFlow logging level
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Reduce logging output
    
    print("\n=== NVIDIA Setup ===")
    try:
        nvidia_smi = subprocess.getoutput('nvidia-smi')
        print("NVIDIA-SMI output:")
        print(nvidia_smi)
    except Exception as e:
        print(f"⚠️ Error getting NVIDIA info: {e}")
    
    print("\n=== Testing GPU Availability ===")
    try:
        with tf.device('/GPU:0'):
            a = tf.random.normal([1000, 1000])
            b = tf.random.normal([1000, 1000])
            c = tf.matmul(a, b)
            print("✅ Successfully performed GPU computation")
            print(f"Computation device: {c.device}")
    except Exception as e:
        print(f"⚠️ GPU computation failed: {e}")
    
    return gpus

# Run setup and store GPU information
gpu_devices = setup_tensorflow_wsl2()

=== System Information ===
Python version: 3.12.3
TensorFlow version: 2.18.0
Platform: Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.39
WSL version: W S L   v e r s i o n :   2 . 3 . 2 4 . 0 
 
 K e r n e l   v e r s i o n :   5 . 1 5 . 1 5 3 . 1 - 2 
 
 W S L g   v e r s i o n :   1 . 0 . 6 5 
 
 M S R D C   v e r s i o n :   1 . 2 . 5 6 2 0 
 
 D i r e c t 3 D   v e r s i o n :   1 . 6 1 1 . 1 - 8 1 5 2 8 5 1 1 
 
 D X C o r e   v e r s i o n :   1 0 . 0 . 2 6 1 0 0 . 1 - 2 4 0 3 3 1 - 1 4 3 5 . g e - r e l e a s e 
 
 W i n d o w s   v e r s i o n :   1 0 . 0 . 2 2 6 3 1 . 4 3 1 7 
 
 

GPUs available: []

=== NVIDIA Setup ===


W0000 00:00:1730411579.698210    7520 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


NVIDIA-SMI output:
Thu Oct 31 15:52:59 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.02              Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   50C    P0             22W /  119W |       0MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                             

# ETL

### Carga de datos

In [7]:
# Configuración para mostrar todas las columnas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [8]:
# 1. EXTRACT - Carga de datos
print("=== CARGANDO DATASETS ===")
games_df = pd.read_csv('data/games.csv')
users_df = pd.read_csv('data/users.csv')
recommendations_df = pd.read_csv('data/recommendations.csv')

print("=== DATASETS CARGADOS ===")

=== CARGANDO DATASETS ===
=== DATASETS CARGADOS ===


### Información general del dataset

In [9]:
print("\nDimensiones del games_df:", games_df.shape)
print("\nColumnas disponibles del games_df:")
for col in games_df.columns:
    print(f"- {col}")


Dimensiones del games_df: (50872, 13)

Columnas disponibles del games_df:
- app_id
- title
- date_release
- win
- mac
- linux
- rating
- positive_ratio
- user_reviews
- price_final
- price_original
- discount
- steam_deck


In [10]:
print("\nDimensiones del users_df:", users_df.shape)
print("\nColumnas disponibles del users_df:")
for col in users_df.columns:
    print(f"- {col}")


Dimensiones del users_df: (14306064, 3)

Columnas disponibles del users_df:
- user_id
- products
- reviews


In [11]:
print("\nDimensiones del recommendations_df:", recommendations_df.shape)
print("\nColumnas disponibles del recommendations_df:")
for col in recommendations_df.columns:
    print(f"- {col}")


Dimensiones del recommendations_df: (41154794, 8)

Columnas disponibles del recommendations_df:
- app_id
- helpful
- funny
- date
- is_recommended
- hours
- user_id
- review_id


### Análisis inicial de games

In [12]:
print("\nPrimeras 5 filas de games_df:")
print(games_df.head())
print("\nInformación del dataset de juegos:")
print(games_df.info())
print("\nEstadísticas descriptivas de games_df:")
print(games_df.describe())
print("\nValores nulos en games_df:")
print(games_df.isnull().sum())


Primeras 5 filas de games_df:
   app_id                              title date_release   win    mac  linux  \
0   13500  Prince of Persia: Warrior Within™   2008-11-21  True  False  False   
1   22364            BRINK: Agents of Change   2011-08-03  True  False  False   
2  113020       Monaco: What's Yours Is Mine   2013-04-24  True   True   True   
3  226560                 Escape Dead Island   2014-11-18  True  False  False   
4  249050            Dungeon of the ENDLESS™   2014-10-27  True   True  False   

          rating  positive_ratio  user_reviews  price_final  price_original  \
0  Very Positive              84          2199         9.99            9.99   
1       Positive              85            21         2.99            2.99   
2  Very Positive              92          3722        14.99           14.99   
3          Mixed              61           873        14.99           14.99   
4  Very Positive              88          8784        11.99           11.99   

   disc

### Análisis inicial de users

In [13]:
print("\nPrimeras 5 filas de users_df:")
print(users_df.head())
print("\nInformación del dataset de usuarios:")
print(users_df.info())
print("\nEstadísticas descriptivas de users_df:")
print(users_df.describe())
print("\nValores nulos en users_df:")
print(users_df.isnull().sum())


Primeras 5 filas de users_df:
    user_id  products  reviews
0   7360263       359        0
1  14020781       156        1
2   8762579       329        4
3   4820647       176        4
4   5167327        98        2

Información del dataset de usuarios:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14306064 entries, 0 to 14306063
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   products  int64
 2   reviews   int64
dtypes: int64(3)
memory usage: 327.4 MB
None

Estadísticas descriptivas de users_df:
            user_id      products       reviews
count  1.430606e+07  1.430606e+07  1.430606e+07
mean   7.153032e+06  1.163734e+02  2.876738e+00
std    4.129805e+06  2.438515e+02  7.987421e+00
min    0.000000e+00  0.000000e+00  0.000000e+00
25%    3.576516e+06  2.300000e+01  1.000000e+00
50%    7.153032e+06  5.500000e+01  1.000000e+00
75%    1.072955e+07  1.270000e+02  3.000000e+00
max    1.430606e+07  3.221400e+04  6.045000e+03

Valores 

### Análisis inicial de recommendations

In [14]:
print("\nPrimeras 5 filas de recommendations_df:")
print(recommendations_df.head())
print("\nInformación del dataset de recomendaciones:")
print(recommendations_df.info())
print("\nEstadísticas descriptivas de recommendations_df:")
print(recommendations_df.describe())
print("\nValores nulos en recommendations_df:")
print(recommendations_df.isnull().sum())


Primeras 5 filas de recommendations_df:
    app_id  helpful  funny        date  is_recommended  hours  user_id  \
0   975370        0      0  2022-12-12            True   36.3    51580   
1   304390        4      0  2017-02-17           False   11.5     2586   
2  1085660        2      0  2019-11-17            True  336.5   253880   
3   703080        0      0  2022-09-23            True   27.4   259432   
4   526870        0      0  2021-01-10            True    7.9    23869   

   review_id  
0          0  
1          1  
2          2  
3          3  
4          4  

Información del dataset de recomendaciones:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41154794 entries, 0 to 41154793
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7  

### Análisis expecífico de features importantes

In [15]:
# Análisis de ratings
print("\nDistribución de ratings en juegos:")
print(games_df['rating'].value_counts())


Distribución de ratings en juegos:
rating
Positive                   13502
Very Positive              13139
Mixed                      12157
Mostly Positive             8738
Mostly Negative             1849
Overwhelmingly Positive     1110
Negative                     303
Very Negative                 60
Overwhelmingly Negative       14
Name: count, dtype: int64


In [16]:
# Análisis de precios
print("\nEstadísticas de precios:")
print(games_df[['price_final', 'price_original']].describe())


Estadísticas de precios:
        price_final  price_original
count  50872.000000    50872.000000
mean       8.620325        8.726788
std       11.514164       11.507021
min        0.000000        0.000000
25%        0.990000        0.990000
50%        4.990000        4.990000
75%       10.990000       11.990000
max      299.990000      299.990000


In [17]:
# Análisis de recomendaciones
print("\nDistribución de recomendaciones:")
print(recommendations_df['is_recommended'].value_counts(normalize=True))


Distribución de recomendaciones:
is_recommended
True     0.857844
False    0.142156
Name: proportion, dtype: float64


In [18]:
# Análisis de horas jugadas
print("\nEstadísticas de horas jugadas:")
print(recommendations_df['hours'].describe())


Estadísticas de horas jugadas:
count    4.115479e+07
mean     1.006022e+02
std      1.761675e+02
min      0.000000e+00
25%      7.800000e+00
50%      2.730000e+01
75%      9.920000e+01
max      1.000000e+03
Name: hours, dtype: float64


### Verificamos la integridad de referencias entre los datasets
Nos aseguramos de que los juegos y los usuarios referenciados en reseñas existen en el dataset de juegos y en el de usuarios

In [19]:
# Verificar si todos los app_id en recommendations existen en games
games_apps = set(games_df['app_id'])
recommendations_apps = set(recommendations_df['app_id'])
print("\nJuegos en recommendations pero no en games:", len(recommendations_apps - games_apps))


Juegos en recommendations pero no en games: 0


In [20]:
# Verificar si todos los user_id en recommendations existen en users
users_ids = set(users_df['user_id'])
recommendations_users = set(recommendations_df['user_id'])
print("Usuarios en recommendations pero no en users:", len(recommendations_users - users_ids))

Usuarios en recommendations pero no en users: 0


## Se va a crear un modelo capaz de predecir el rating que tendrá un juego ("Very Positive", "Mixed", etc.) 
## Son 9 clases 
- Positive                   13502
- Very Positive              13139
- Mixed                      12157
- Mostly Positive             8738
- Mostly Negative             1849
- Overwhelmingly Positive     1110
- Negative                     303
- Very Negative                 60
- Overwhelmingly Negative       14

## Features de entrada
- Precio
- Plataformas soportadas (win, mac, linux)
- Tiempo en el mercado
- Descuentos
- Métricas agregadas de reviews (horas promedio jugadas, ratio de recomendaciones)

## Preprocesamiendo de los datos