# Deep Learning Model
## Luis Arturo
### A01703572

# Conexión a GPU local

In [119]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import re
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [120]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Smoke-test the GPU with a small matrix multiplication (only when CUDA was selected)
if device.type == "cuda":
    try:
        # Allocate two random 1000x1000 matrices directly on the selected device
        a = torch.randn(1000, 1000, device=device)
        b = torch.randn(1000, 1000, device=device)
        c = torch.matmul(a, b)  # Perform matrix multiplication
        print("✅ Successfully performed computation on GPU!")
        print(f"Matrix multiplication result shape: {c.shape}")
    except Exception as e:
        # Best-effort check: report the failure instead of aborting the notebook
        print("❌ Error during GPU computation:", e)
else:
    print("CUDA is not available. Running on CPU.")

Using device: cuda
✅ Successfully performed computation on GPU!
Matrix multiplication result shape: torch.Size([1000, 1000])


# ETL

## Carga de datos

In [121]:
# Pandas display configuration: no limit on the number of displayed columns,
# and the display width left unset (None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [122]:
# 1. EXTRACT - load the three CSV datasets from the relative data/ directory
print("=== CARGANDO DATASETS ===")
games_df = pd.read_csv('data/games.csv')
users_df = pd.read_csv('data/users.csv')
recommendations_df = pd.read_csv('data/recommendations.csv')

print("=== DATASETS CARGADOS ===")

=== CARGANDO DATASETS ===
=== DATASETS CARGADOS ===


## Información general del dataset

In [123]:
# Shape (rows, columns) of games_df followed by one line per column name
print("\nDimensiones del games_df:", games_df.shape)
print("\nColumnas disponibles del games_df:")
for col in games_df.columns:
    print(f"- {col}")


Dimensiones del games_df: (50872, 13)

Columnas disponibles del games_df:
- app_id
- title
- date_release
- win
- mac
- linux
- rating
- positive_ratio
- user_reviews
- price_final
- price_original
- discount
- steam_deck


In [124]:
# Shape (rows, columns) of users_df followed by one line per column name
print("\nDimensiones del users_df:", users_df.shape)
print("\nColumnas disponibles del users_df:")
for col in users_df.columns:
    print(f"- {col}")


Dimensiones del users_df: (14306064, 3)

Columnas disponibles del users_df:
- user_id
- products
- reviews


In [125]:
# Shape (rows, columns) of recommendations_df followed by one line per column name
print("\nDimensiones del recommendations_df:", recommendations_df.shape)
print("\nColumnas disponibles del recommendations_df:")
for col in recommendations_df.columns:
    print(f"- {col}")


Dimensiones del recommendations_df: (41154794, 8)

Columnas disponibles del recommendations_df:
- app_id
- helpful
- funny
- date
- is_recommended
- hours
- user_id
- review_id


### Análisis inicial de games

In [126]:
# First look at games_df: sample rows, schema, summary statistics and null counts
print("\nPrimeras 5 filas de games_df:")
print(games_df.head())
print("\nInformación del dataset de juegos:")
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line after the report).
games_df.info()
print("\nEstadísticas descriptivas de games_df:")
print(games_df.describe())
print("\nValores nulos en games_df:")
print(games_df.isnull().sum())


Primeras 5 filas de games_df:
   app_id                              title date_release   win    mac  linux  \
0   13500  Prince of Persia: Warrior Within™   2008-11-21  True  False  False   
1   22364            BRINK: Agents of Change   2011-08-03  True  False  False   
2  113020       Monaco: What's Yours Is Mine   2013-04-24  True   True   True   
3  226560                 Escape Dead Island   2014-11-18  True  False  False   
4  249050            Dungeon of the ENDLESS™   2014-10-27  True   True  False   

          rating  positive_ratio  user_reviews  price_final  price_original  \
0  Very Positive              84          2199         9.99            9.99   
1       Positive              85            21         2.99            2.99   
2  Very Positive              92          3722        14.99           14.99   
3          Mixed              61           873        14.99           14.99   
4  Very Positive              88          8784        11.99           11.99   

   disc

### Análisis inicial de users

In [127]:
# First look at users_df: sample rows, schema, summary statistics and null counts
print("\nPrimeras 5 filas de users_df:")
print(users_df.head())
print("\nInformación del dataset de usuarios:")
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line after the report).
users_df.info()
print("\nEstadísticas descriptivas de users_df:")
print(users_df.describe())
print("\nValores nulos en users_df:")
print(users_df.isnull().sum())


Primeras 5 filas de users_df:
    user_id  products  reviews
0   7360263       359        0
1  14020781       156        1
2   8762579       329        4
3   4820647       176        4
4   5167327        98        2

Información del dataset de usuarios:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14306064 entries, 0 to 14306063
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   products  int64
 2   reviews   int64
dtypes: int64(3)
memory usage: 327.4 MB
None

Estadísticas descriptivas de users_df:
            user_id      products       reviews
count  1.430606e+07  1.430606e+07  1.430606e+07
mean   7.153032e+06  1.163734e+02  2.876738e+00
std    4.129805e+06  2.438515e+02  7.987421e+00
min    0.000000e+00  0.000000e+00  0.000000e+00
25%    3.576516e+06  2.300000e+01  1.000000e+00
50%    7.153032e+06  5.500000e+01  1.000000e+00
75%    1.072955e+07  1.270000e+02  3.000000e+00
max    1.430606e+07  3.221400e+04  6.045000e+03

Valores 

### Análisis inicial de recommendations

In [128]:
# First look at recommendations_df: sample rows, schema, summary statistics and null counts
print("\nPrimeras 5 filas de recommendations_df:")
print(recommendations_df.head())
print("\nInformación del dataset de recomendaciones:")
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line after the report).
recommendations_df.info()
print("\nEstadísticas descriptivas de recommendations_df:")
print(recommendations_df.describe())
print("\nValores nulos en recommendations_df:")
print(recommendations_df.isnull().sum())


Primeras 5 filas de recommendations_df:
    app_id  helpful  funny        date  is_recommended  hours  user_id  \
0   975370        0      0  2022-12-12            True   36.3    51580   
1   304390        4      0  2017-02-17           False   11.5     2586   
2  1085660        2      0  2019-11-17            True  336.5   253880   
3   703080        0      0  2022-09-23            True   27.4   259432   
4   526870        0      0  2021-01-10            True    7.9    23869   

   review_id  
0          0  
1          1  
2          2  
3          3  
4          4  

Información del dataset de recomendaciones:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41154794 entries, 0 to 41154793
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7  

### Análisis específico de features importantes

In [129]:
# Rating analysis: number of games per rating class (this column is the prediction target)
print("\nDistribución de ratings en juegos:")
print(games_df['rating'].value_counts())


Distribución de ratings en juegos:
rating
Positive                   13502
Very Positive              13139
Mixed                      12157
Mostly Positive             8738
Mostly Negative             1849
Overwhelmingly Positive     1110
Negative                     303
Very Negative                 60
Overwhelmingly Negative       14
Name: count, dtype: int64


In [130]:
# Price analysis: summary statistics of the final and original prices
print("\nEstadísticas de precios:")
print(games_df[['price_final', 'price_original']].describe())


Estadísticas de precios:
        price_final  price_original
count  50872.000000    50872.000000
mean       8.620325        8.726788
std       11.514164       11.507021
min        0.000000        0.000000
25%        0.990000        0.990000
50%        4.990000        4.990000
75%       10.990000       11.990000
max      299.990000      299.990000


In [131]:
# Recommendation analysis: share of positive vs. negative reviews
# (normalize=True returns proportions instead of raw counts)
print("\nDistribución de recomendaciones:")
print(recommendations_df['is_recommended'].value_counts(normalize=True))


Distribución de recomendaciones:
is_recommended
True     0.857844
False    0.142156
Name: proportion, dtype: float64


In [132]:
# Played-hours analysis: summary statistics of the hours reported per review
print("\nEstadísticas de horas jugadas:")
print(recommendations_df['hours'].describe())


Estadísticas de horas jugadas:
count    4.115479e+07
mean     1.006022e+02
std      1.761675e+02
min      0.000000e+00
25%      7.800000e+00
50%      2.730000e+01
75%      9.920000e+01
max      1.000000e+03
Name: hours, dtype: float64


### Verificamos la integridad de referencias entre los datasets
Nos aseguramos de que los juegos y los usuarios referenciados en reseñas existen en el dataset de juegos y en el de usuarios

In [133]:
# Referential integrity: count the app_id values present in recommendations but missing from games
games_apps = set(games_df['app_id'])
recommendations_apps = set(recommendations_df['app_id'])
# Set difference = ids referenced by reviews that have no matching game row
print("\nJuegos en recommendations pero no en games:", len(recommendations_apps - games_apps))


Juegos en recommendations pero no en games: 0


In [134]:
# Referential integrity: count the user_id values present in recommendations but missing from users
users_ids = set(users_df['user_id'])
recommendations_users = set(recommendations_df['user_id'])
# Set difference = ids referenced by reviews that have no matching user row
print("Usuarios en recommendations pero no en users:", len(recommendations_users - users_ids))

Usuarios en recommendations pero no en users: 0


## Se va a crear un modelo capaz de predecir el rating que tendrá un juego ("Very Positive", "Mixed", etc.) 
## Son 9 clases 
- Positive                   13502
- Very Positive              13139
- Mixed                      12157
- Mostly Positive             8738
- Mostly Negative             1849
- Overwhelmingly Positive     1110
- Negative                     303
- Very Negative                 60
- Overwhelmingly Negative       14

## Features de entrada
- Precio
- Plataformas soportadas (win, mac, linux)
- Tiempo en el mercado
- Descuentos
- Métricas agregadas de reviews (horas promedio jugadas, ratio de recomendaciones)

## Preprocesamiento de los datos

### Creamos features en base a las recomendaciones de los juegos

In [135]:
# Aggregate the review-level table into one row of metrics per game (app_id):
# mean/median/std of played hours plus the mean of the recommended/helpful/funny columns
game_metrics = recommendations_df.groupby('app_id').agg({
    'hours': ['mean', 'median', 'std'],
    'is_recommended': 'mean',
    'helpful': 'mean',
    'funny': 'mean'
}).reset_index()

# Flatten the column names; the order must match the aggregation spec above
game_metrics.columns = ['app_id', 'avg_hours', 'median_hours', 'std_hours',
                        'recommendation_ratio', 'avg_helpful', 'avg_funny']

### Procesamos las fechas para calcular el tiempo que el juego ha estado en el mercado

In [136]:
# Parse the release date and derive the number of days each game has been on the market
games_df['date_release'] = pd.to_datetime(games_df['date_release'])
reference_date = pd.to_datetime('2024-11-01')  # Fixed reference date used as "today"
games_df['days_in_market'] = (reference_date - games_df['date_release']).dt.days

### Unimos los nuevos features en un dataset final usando como base el de games

In [137]:
# Left join keeps every game, including those without any aggregated review metrics
final_df = games_df.merge(game_metrics, on='app_id', how='left')

# Fill the nulls in the aggregated-metric columns: zero for the hours/helpful/funny
# metrics and 0.5 for the recommendation ratio
final_df = final_df.fillna({
    'avg_hours': 0,
    'median_hours': 0,
    'std_hours': 0,
    'recommendation_ratio': 0.5,
    'avg_helpful': 0,
    'avg_funny': 0
})

In [138]:
# Preview the first rows of the merged table
final_df.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,5824,18.967912,12.9,41.722269,0.845789,5.392052,0.6121
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,4839,0.0,0.0,0.0,0.5,0.0,0.0
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,4209,20.413294,6.9,53.295053,0.908541,1.434682,0.450382
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,3636,10.776625,8.1,15.421851,0.625998,6.778791,0.815279
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,3658,40.621691,23.4,54.860085,0.885567,2.530928,0.698351


In [139]:
# Summary statistics of the merged table
final_df.describe()

Unnamed: 0,app_id,date_release,positive_ratio,user_reviews,price_final,price_original,discount,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
count,50872.0,50872,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0
mean,1055224.0,2019-03-13 03:53:57.112753664,77.052033,1824.425,8.620325,8.726788,5.592212,2059.837533,12.017546,5.630656,21.209406,0.697721,2.927259,0.496619
min,10.0,1997-06-30 00:00:00,0.0,10.0,0.0,0.0,0.0,374.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,528737.5,2017-03-21 00:00:00,67.0,19.0,0.99,0.99,0.0,1137.0,0.0,0.0,0.0,0.5,0.0,0.0
50%,986085.0,2019-08-23 00:00:00,81.0,49.0,4.99,4.99,0.0,1897.0,3.436826,1.5,4.186043,0.714286,2.178243,0.131972
75%,1524895.0,2021-09-21 00:00:00,91.0,206.0,10.99,11.99,0.0,2782.0,10.730804,5.0,21.601544,0.881546,4.0,0.482362
max,2599300.0,2023-10-24 00:00:00,100.0,7494460.0,299.99,299.99,90.0,9986.0,487.0,511.5,495.120177,1.0,268.285714,266.285714
std,610324.9,,18.253592,40073.52,11.514164,11.507021,18.606679,1125.133076,28.005183,16.127001,41.136308,0.204218,4.689921,2.481174


In [140]:
# Column-wise maxima, to spot extreme values
final_df.max()

app_id                              2599300
title                      🧠 OUT OF THE BOX
date_release            2023-10-24 00:00:00
win                                    True
mac                                    True
linux                                  True
rating                        Very Positive
positive_ratio                          100
user_reviews                        7494460
price_final                          299.99
price_original                       299.99
discount                               90.0
steam_deck                             True
days_in_market                         9986
avg_hours                             487.0
median_hours                          511.5
std_hours                        495.120177
recommendation_ratio                    1.0
avg_helpful                      268.285714
avg_funny                        266.285714
dtype: object

### Se encontró un juego que posee métricas exageradas, al compararlo con Steam nos dimos cuenta de que parece ser un error
Con base en esto, imprimimos todos los juegos que tienen un precio final mayor a 70

In [141]:
# Select the rows whose final price is above 70 for manual inspection
juegos_filtrados = final_df[final_df['price_final'] > 70]

In [142]:
# Display the high-priced rows selected above
juegos_filtrados

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
81,438450,3DF Zephyr Lite Steam Edition,2016-02-02,True,False,False,Very Positive,96,58,199.99,199.99,0.0,True,3195,243.356250,121.35,265.982864,1.000000,4.979167,0.750000
144,537770,Gal*Gun: Double Peace - 'Pheromone Z' Item,2016-10-20,True,False,False,Positive,91,12,89.99,89.99,0.0,True,2934,0.000000,0.00,0.000000,0.500000,0.000000,0.000000
190,554820,VideoPad Video Editor,2016-12-01,True,True,False,Very Positive,80,51,99.99,99.99,0.0,True,2892,221.103030,130.70,238.796225,0.787879,3.212121,0.242424
403,1076160,Command: Modern Operations,2019-11-14,True,False,False,Very Positive,84,897,79.99,79.99,0.0,True,1814,126.815050,49.00,186.176124,0.859532,12.749164,5.050167
675,1182920,Movavi Video Editor Plus 2020 - Video Editing ...,2019-11-25,True,True,False,Very Positive,80,874,74.99,74.99,0.0,True,1803,111.442762,56.80,161.577714,0.788419,1.817372,0.227171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47765,411893,DCS: F-14A/B Tomcat,2019-03-13,True,False,False,Very Positive,92,555,79.99,79.99,0.0,True,2060,0.000000,0.00,0.000000,0.500000,0.000000,0.000000
48354,1096900,RPG Maker MZ,2020-08-27,True,True,False,Very Positive,83,1070,80.00,0.00,0.0,True,1527,186.038028,72.85,249.288267,0.791080,16.969484,3.521127
49167,2199970,Substance 3D Painter 2023,2023-01-23,True,True,True,Very Positive,85,148,150.00,0.00,0.0,True,648,0.000000,0.00,0.000000,0.500000,0.000000,0.000000
49310,2070990,VEGAS Edit 20 Steam Edition,2022-11-01,True,False,False,Mixed,66,21,129.48,249.00,48.0,True,731,68.566667,5.60,113.331475,0.666667,19.333333,5.333333


Al ser solo 150 juegos, decidimos descartarlos para evitar conflictos

In [143]:
# Keep only the rows with a final price of at most 70 and renumber the index
final_df = final_df[final_df['price_final'] <= 70].reset_index(drop=True)


In [144]:
# Apply the same 70 cap to the original (pre-discount) price and renumber the index
final_df = final_df[final_df['price_original'] <= 70].reset_index(drop=True)


In [145]:
# Re-check the column-wise maxima after the price filters
final_df.max()

app_id                              2599300
title                      🧠 OUT OF THE BOX
date_release            2023-10-24 00:00:00
win                                    True
mac                                    True
linux                                  True
rating                        Very Positive
positive_ratio                          100
user_reviews                        7494460
price_final                            70.0
price_original                        69.99
discount                               90.0
steam_deck                             True
days_in_market                         9986
avg_hours                             487.0
median_hours                          511.5
std_hours                        495.120177
recommendation_ratio                    1.0
avg_helpful                      268.285714
avg_funny                        266.285714
dtype: object

### Parece ser que hay juegos con nombres repetidos; eliminamos aquellos que estén repetidos, ya que solo son 121 títulos problemáticos

In [146]:
# duplicated() flags every repeat after the first occurrence, so this counts the surplus rows
titulos_duplicados = final_df['title'].duplicated().sum()
print(f"Número de títulos duplicados: {titulos_duplicados}")
# keep=False flags every occurrence, giving the distinct titles that appear more than once
titulos_repetidos = final_df[final_df['title'].duplicated(keep=False)]['title'].unique()
print(f"Títulos duplicados: {titulos_repetidos}")

Número de títulos duplicados: 121
Títulos duplicados: ['Prison Wars' 'The Good Life' 'RUSH' 'Hide and Seek' 'Remnants'
 'Blade of Darkness' 'The Cleaner' 'Rogue' 'Lighthouse Keeper' 'Valor'
 'Last Stop' 'fishy' 'Flashback' 'The Hotel' '3D Organon Anatomy'
 'Minotaur' 'Northern Lights' 'First Snow' 'Momentum' 'Alter Ego'
 'The Backrooms' 'Locked Up' 'Get Stuffed!' 'Ascent' 'The Hunt' 'Hatch'
 'Lost' 'Warhammer Quest' 'Apollo 11 VR' 'Psych' 'Eternal Return'
 'Achievement Clicker' 'Call of Duty®' 'Causality' 'The Lost Village'
 'Resonance' 'Dead Forest' 'Lost Marbles' 'Wanderer' 'Castles' 'Grapple'
 'Chaos Theory' 'Dungeon Warriors' 'Bounce' 'Evolution' 'Zombie Survivors'
 'Ritual' 'Archery Simulator' 'Outpost' 'Shutter' 'The Line' 'The Village'
 'The Wanderer' 'Fantasy Gladiators' 'Cave Crawler' 'Cursed'
 'A Walk in the Woods' 'Dark Matter' 'Beyond the Wall' 'White Mirror'
 'Zombie Apocalypse' 'RIFT' 'STAY' 'Silent World' 'REALITY' 'Tomorrow'
 'Arachnophobia' 'Vaccine' 'Arena' 'Dog Adven

In [147]:
# Boolean mask of the rows whose title appears exactly once
# (keep=False marks every copy of a repeated title, so all copies are dropped)
titulos_unicos = ~final_df['title'].duplicated(keep=False)

# Filter the DataFrame to keep only the unique titles
final_df = final_df[titulos_unicos].reset_index(drop=True)
final_df.describe()

Unnamed: 0,app_id,date_release,positive_ratio,user_reviews,price_final,price_original,discount,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
count,50472.0,50472,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0,50472.0
mean,1053253.0,2019-03-10 18:46:25.164051456,77.107149,1827.487,8.334297,8.418423,5.593557,2062.217764,11.961956,5.609096,21.154918,0.6981,2.921674,0.494605
min,10.0,1997-06-30 00:00:00,0.0,10.0,0.0,0.0,0.0,374.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,527947.5,2017-03-17 00:00:00,67.0,19.0,0.99,0.99,0.0,1141.0,0.0,0.0,0.0,0.5,0.0,0.0
50%,984095.0,2019-08-20 00:00:00,81.0,49.0,4.99,4.99,0.0,1900.0,3.460451,1.5,4.203173,0.716655,2.176318,0.131313
75%,1521705.0,2021-09-17 00:00:00,91.0,208.0,10.99,11.99,0.0,2786.0,10.753019,5.0,21.65414,0.881903,4.0,0.48
max,2599300.0,2023-10-24 00:00:00,100.0,7494460.0,70.0,69.99,90.0,9986.0,487.0,511.5,495.120177,1.0,268.285714,266.285714
std,609561.0,,18.217559,40186.12,9.882176,9.686554,18.599731,1125.329458,27.762187,16.033585,40.894435,0.204009,4.688425,2.486727


In [148]:
# Re-run the duplicate-title report to confirm that none remain after the filter
titulos_duplicados = final_df['title'].duplicated().sum()
print(f"Número de títulos duplicados: {titulos_duplicados}")
titulos_repetidos = final_df[final_df['title'].duplicated(keep=False)]['title'].unique()
print(f"Títulos duplicados: {titulos_repetidos}")

Número de títulos duplicados: 0
Títulos duplicados: []


### Definimos las features que se usarán para el modelo

In [149]:
# Columns fed to the model as tabular features; the order here defines the column order of X
features = [
    'price_final', 'price_original', 'discount',            # pricing
    'win', 'mac', 'linux', 'steam_deck',                    # platform support flags
    'days_in_market',                                       # days since release
    'avg_hours', 'median_hours', 'std_hours',               # aggregated played hours
    'recommendation_ratio', 'avg_helpful', 'avg_funny'      # aggregated review metrics
]

### Por los valores tan diferentes que tenemos en el dataset, normalizamos los datos para que puedan ser usados por el modelo sin problemas y no haya valores extremos que hagan que los valores menores no tengan importancia

In [150]:
# Standardise the continuous columns, overwriting them in place in final_df.
# NOTE(review): the scaler is fitted on the whole table, before the train/validation/test
# split made later in the notebook, so held-out rows influence the scaling statistics.
# Consider fitting on the training rows only.
scaler = StandardScaler()
numeric_features = [
    'price_final', 'price_original', 'discount',
    'days_in_market', 'avg_hours', 'median_hours', 'std_hours',
    'recommendation_ratio', 'avg_helpful', 'avg_funny'
]
final_df[numeric_features] = scaler.fit_transform(final_df[numeric_features])

### Convertimos las columnas booleanas de compatibilidad con plataformas a enteros (0 o 1)

In [151]:
# Cast the platform-support flags to int so they can sit in the numeric feature matrix
bool_features = ['win', 'mac', 'linux', 'steam_deck']
final_df[bool_features] = final_df[bool_features].astype(int)

### Preparamos la variable dependiente (y)

In [152]:
# Encode the textual rating classes as integer class ids for the classifier.
# The encoder is fitted exactly once; the fitted object is kept so class ids can be
# mapped back to rating names (label_encoder.classes_ / inverse_transform).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(final_df['rating'])

### Preparamos las features (x)

In [153]:
# Tabular feature matrix as a NumPy array, columns ordered as in `features`
X = final_df[features].values

# Embeddings

### Limpiamos los títulos

In [154]:
import torch
from torch.nn.utils.rnn import pad_sequence

In [155]:
def clean_title(title):
    """
    Clean and normalise a game title.

    The title is lower-cased, every character that is neither a word character
    (letters, digits, underscore) nor whitespace is replaced by a space, and runs
    of whitespace are collapsed into single spaces.

    Args:
        title: Original game title. Missing values (None / NaN) yield an empty
            string; any other non-string value is converted with str() first.

    Returns:
        str: Cleaned, normalised title.
    """
    # Titles read from CSV can be missing (NaN) or parsed as numbers; handle both
    # instead of failing on .lower()
    if not isinstance(title, str):
        if title is None or pd.isna(title):
            return ''
        title = str(title)

    # Lower-case
    title = title.lower()

    # Replace special characters and symbols with spaces, keeping whitespace
    title = re.sub(r'[^\w\s]', ' ', title)

    # Collapse repeated whitespace and strip both ends
    title = ' '.join(title.split())

    return title

# Apply the cleaning function to every title and store the result in a new column
final_df['clean_title'] = final_df['title'].apply(clean_title)

### Creamos y configuramos el tokenizer (TorchText no me funciona)

In [156]:
class SimpleTokenizer:
    """
    Minimal whitespace tokenizer that maps words to integer ids.

    Index 0 is reserved for the out-of-vocabulary token; the most frequent
    words seen by fit_on_texts receive ids 1, 2, ... in descending order of
    frequency, with at most num_words entries in total (OOV included).
    """

    def __init__(self, num_words=10000, oov_token='<OOV>'):
        """Store the vocabulary limit and OOV token; mappings start empty."""
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {}
        self.index_word = {}
        self.word_counts = {}

    def fit_on_texts(self, texts):
        """
        Count whitespace-separated words in texts and rebuild the vocabulary.

        Counts accumulate in self.word_counts across calls; word_index and
        index_word are rebuilt from scratch on every call.
        """
        # Update the running word frequencies
        for text in texts:
            for token in text.split():
                if token in self.word_counts:
                    self.word_counts[token] += 1
                else:
                    self.word_counts[token] = 1

        # Rank by descending frequency (stable sort: ties keep first-seen order)
        # and keep num_words - 1 entries, leaving one slot for the OOV token
        ranked = list(self.word_counts.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        kept = ranked[:self.num_words - 1]

        # Rebuild both mappings, OOV first at index 0
        self.word_index = {self.oov_token: 0}
        self.index_word = {0: self.oov_token}
        for position, pair in enumerate(kept, 1):
            token = pair[0]
            self.word_index[token] = position
            self.index_word[position] = token

    def texts_to_sequences(self, texts):
        """Convert each text into its list of word ids; unknown words map to 0 (OOV)."""
        return [
            [self.word_index.get(token, 0) for token in text.split()]
            for text in texts
        ]

### Función para hacer padding

In [157]:
def pad_sequences(sequences, max_len=None, padding_value=0):
    """
    Pad or truncate every sequence to a common length and stack them in a tensor.

    Sequences longer than max_len are cut at the end; shorter ones are filled
    at the end with padding_value.

    Args:
        sequences: Iterable of token-id sequences (lists or any other iterables).
        max_len (int, optional): Target length. When None, the length of the
            longest sequence is used (0 if there are no sequences).
        padding_value: Value appended to short sequences.

    Returns:
        torch.Tensor: 2-D tensor with one row per input sequence and max_len
        columns. An input with no sequences, or max_len == 0, yields an empty
        tensor of dtype torch.long with that shape.

    Raises:
        ValueError: If max_len is negative.
    """
    # Materialise once so generators work and every row supports slicing/concatenation
    rows = [list(seq) for seq in sequences]

    if max_len is None:
        # default=0 keeps an empty batch from raising inside max()
        max_len = max((len(row) for row in rows), default=0)
    elif max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # Slicing truncates long rows; a negative repeat count yields [] so they get no padding
    padded_sequences = [
        row[:max_len] + [padding_value] * (max_len - len(row))
        for row in rows
    ]

    if not padded_sequences or max_len == 0:
        # With no elements torch cannot infer an integer dtype or the 2-D shape
        return torch.tensor(padded_sequences, dtype=torch.long).reshape(len(padded_sequences), max_len)

    return torch.tensor(padded_sequences)

### Aplicamos la tokenización y el padding

In [158]:
# Aplicar la tokenización y padding
tokenizer = SimpleTokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(final_df['clean_title'])
sequences = tokenizer.texts_to_sequences(final_df['clean_title'])
padded_sequences = pad_sequences(sequences, max_len=10)  # Usamos max_len=10 como en tu código original

### Ejemplo de procesamiento

In [159]:
# Quick statistics about the tokenised titles
print(f"Forma de las secuencias padding: {padded_sequences.shape}")
vocab_size = len(tokenizer.word_index)
print(f"Tamaño del vocabulario: {vocab_size}")

# Show one worked example: cleaned text, raw id sequence and padded sequence
example_idx = 0
print("\nEjemplo de tokenización:")
print(f"Texto original: {final_df['clean_title'].iloc[example_idx]}")
print(f"Secuencia tokenizada: {sequences[example_idx]}")
print(f"Secuencia con padding: {padded_sequences[example_idx].tolist()}")

# Keep the sizes that the model definition depends on
max_length = padded_sequences.shape[1]  # Length of every padded sequence
embedding_dim = 32  # Size of each word-embedding vector

print(f"\nVariables importantes:")
print(f"Máxima longitud de secuencia: {max_length}")
print(f"Dimensión del embedding: {embedding_dim}")
print(f"Tamaño del vocabulario: {vocab_size}")

Forma de las secuencias padding: torch.Size([50472, 10])
Tamaño del vocabulario: 10000

Ejemplo de tokenización:
Texto original: prince of persia warrior within
Secuencia tokenizada: [1275, 2, 2533, 243, 657]
Secuencia con padding: [1275, 2, 2533, 243, 657, 0, 0, 0, 0, 0]

Variables importantes:
Máxima longitud de secuencia: 10
Dimensión del embedding: 32
Tamaño del vocabulario: 10000


In [160]:
# Display the preprocessed table (scaled features plus the clean_title column)
final_df

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny,clean_title
0,13500,Prince of Persia: Warrior Within™,2008-11-21,1,0,0,Very Positive,84,2199,0.167546,0.162245,-0.300736,1,3.342861,0.252359,0.454731,0.502943,0.723937,0.526915,0.047249,prince of persia warrior within
1,22364,BRINK: Agents of Change,2011-08-03,1,0,0,Positive,85,21,-0.540807,-0.560414,-0.300736,1,2.467553,-0.430877,-0.349838,-0.517311,-0.971047,-0.623174,-0.198900,brink agents of change
2,113020,Monaco: What's Yours Is Mine,2013-04-24,1,1,1,Very Positive,92,3722,0.673512,0.678429,-0.300736,1,1.907711,0.304422,0.080513,0.785937,1.031535,-0.317165,-0.017784,monaco what s yours is mine
3,226560,Escape Dead Island,2014-11-18,1,0,0,Mixed,61,873,0.673512,0.678429,-0.300736,1,1.398522,-0.042696,0.155357,-0.140193,-0.353433,0.822697,0.128956,escape dead island
4,249050,Dungeon of the ENDLESS™,2014-10-27,1,1,0,Very Positive,88,8784,0.369933,0.368719,-0.300736,1,1.418072,1.032340,1.109613,0.824208,0.918921,-0.083344,0.081934,dungeon of the endless
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50467,2296380,I Expect You To Die 3: Cog in the Machine,2023-09-28,1,0,0,Very Positive,96,101,1.382877,-0.869092,-0.300736,1,-1.477109,-0.430877,-0.349838,-0.517311,-0.971047,-0.623174,-0.198900,i expect you to die 3 cog in the machine
50468,1272080,PAYDAY 3,2023-09-21,1,0,0,Mostly Negative,38,29458,3.204357,-0.869092,-0.300736,1,-1.470889,-0.430877,-0.349838,-0.517311,-0.971047,-0.623174,-0.198900,payday 3
50469,1402110,Eternights,2023-09-11,1,0,0,Very Positive,89,1128,2.192424,-0.869092,-0.300736,1,-1.462002,-0.430877,-0.349838,-0.517311,-0.971047,-0.623174,-0.198900,eternights
50470,2272250,Forgive Me Father 2,2023-10-19,1,0,0,Very Positive,95,82,0.876911,-0.869092,-0.300736,1,-1.495770,-0.430877,-0.349838,-0.517311,-0.971047,-0.623174,-0.198900,forgive me father 2


# Modelo 1

### Separamos los datos en Train 60%, Validation 20% y Test 20%

In [161]:
# Dividir en train/validation/test (60%/20%/20%)
X_temp, X_test, y_temp, y_test, seq_temp, seq_test = train_test_split(
    X, y, padded_sequences, test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val, seq_train, seq_val = train_test_split(
    X_temp, y_temp, seq_temp, test_size=0.25, random_state=42
)

print(f"Dimensiones de los datos:")
print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"X_test: {X_test.shape}")

Dimensiones de los datos:
X_train: (30282, 14)
X_val: (10095, 14)
X_test: (10095, 14)


### Convertimos a tensores de PyTorch

In [162]:
# Convertimos a tensores de PyTorch
X_train = torch.FloatTensor(X_train)
X_val = torch.FloatTensor(X_val)
X_test = torch.FloatTensor(X_test)

y_train = torch.LongTensor(y_train)
y_val = torch.LongTensor(y_val)
y_test = torch.LongTensor(y_test)

seq_train = torch.LongTensor(seq_train)
seq_val = torch.LongTensor(seq_val)
seq_test = torch.LongTensor(seq_test)

### Definimos el dataset personalizado

In [163]:
class GameDataset(Dataset):
    """
    Dataset that pairs, per game, the tabular features, the title token sequence
    and the rating label.

    Args:
        features: Indexable collection of per-game feature vectors.
        sequences: Indexable collection of per-game token-id sequences.
        labels: Indexable collection of per-game class ids.

    Raises:
        ValueError: If the three collections do not have the same length.
    """

    def __init__(self, features, sequences, labels):
        # Fail early on misaligned inputs instead of raising an IndexError mid-epoch
        # or silently pairing the wrong rows
        sizes = (len(features), len(sequences), len(labels))
        if len(set(sizes)) != 1:
            raise ValueError(
                "features, sequences and labels must have the same length, "
                f"got {sizes[0]}, {sizes[1]} and {sizes[2]}"
            )
        self.features = features
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample idx as a dict with 'features', 'sequence' and 'label' keys."""
        return {
            'features': self.features[idx],
            'sequence': self.sequences[idx],
            'label': self.labels[idx]
        }

### Creamos los dataloaders

In [164]:
# Wrap each split in a GameDataset and build the loaders that serve mini-batches
batch_size = 32
train_dataset = GameDataset(X_train, seq_train, y_train)
val_dataset = GameDataset(X_val, seq_val, y_val)
test_dataset = GameDataset(X_test, seq_test, y_test)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### Definimos el modelo

In [165]:
class GameRatingModel(nn.Module):
    """
    Two-branch rating classifier.

    One branch embeds the title token ids and flattens the embeddings; the
    other projects the tabular features to 64 units. Both outputs are
    concatenated and passed through a small MLP that emits one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        max_length: Number of tokens per title; fixes the flattened embedding size.
        num_features: Number of tabular input features.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, max_length, num_features, num_classes):
        super().__init__()

        numeric_width = 64

        # Title branch: embedding table plus the size of its flattened output
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flat_embed_size = max_length * embedding_dim

        # Tabular branch
        self.numeric_layer = nn.Sequential(
            nn.Linear(num_features, numeric_width),
            nn.ReLU(),
            nn.BatchNorm1d(numeric_width),
        )

        # Head applied to the concatenation of both branches
        self.combined_layers = nn.Sequential(
            nn.Linear(self.flat_embed_size + numeric_width, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(128),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(64),

            nn.Linear(64, num_classes),
        )

    def forward(self, sequences, features):
        """
        Compute class logits.

        Args:
            sequences: Integer tensor of token ids, shape (batch, max_length).
            features: Float tensor of tabular features, shape (batch, num_features).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised logits.
        """
        # (batch, max_length, embedding_dim) -> (batch, max_length * embedding_dim)
        title_vec = self.embedding(sequences).flatten(start_dim=1)

        numeric_vec = self.numeric_layer(features)

        # Join both branches along the feature axis and classify
        joined = torch.cat([title_vec, numeric_vec], dim=1)
        return self.combined_layers(joined)

### Inicializamos el modelo

In [166]:
# Instantiate the model on the selected device. The sizes come from the variables
# computed during tokenisation (vocab_size, embedding_dim, max_length) rather than
# repeated literals, so the network always matches the shape of the padded sequences.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GameRatingModel(
    vocab_size=vocab_size,  # From the tokenizer vocabulary
    embedding_dim=embedding_dim,
    max_length=max_length,  # Length of every padded title sequence
    num_features=X_train.shape[1],
    num_classes=len(label_encoder.classes_)
).to(device)

### Configuramos el entrenamiento

In [167]:
# Multi-class classification loss on the raw logits (no class weighting is applied)
criterion = nn.CrossEntropyLoss()
# Adam with its default hyper-parameters over all model parameters
optimizer = optim.Adam(model.parameters())

### Función de entrenamiento


In [168]:
def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one pass over ``loader`` and return ``(mean_batch_loss, accuracy_pct)``.

    The pass trains when ``optimizer`` is given; otherwise it only evaluates,
    with gradients disabled. Each batch must be a mapping with the keys
    ``'features'``, ``'sequence'`` and ``'label'``. When the loader yields no
    samples both metrics are NaN instead of raising ``ZeroDivisionError``.
    """
    training = optimizer is not None
    model.train(training)
    batches = tqdm(loader, desc=desc) if desc is not None else loader

    loss_sum = 0.0
    correct = 0
    seen = 0
    num_batches = 0
    with torch.set_grad_enabled(training):
        for batch in batches:
            features = batch['features'].to(device)
            sequences = batch['sequence'].to(device)
            labels = batch['label'].to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(sequences, features)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            correct += outputs.argmax(dim=1).eq(labels).sum().item()
            seen += labels.size(0)
            num_batches += 1

    if num_batches == 0 or seen == 0:
        return float('nan'), float('nan')
    # Loss is the mean of the per-batch losses (not weighted by batch size).
    return loss_sum / num_batches, 100. * correct / seen


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs,
                device=None, patience=None):
    """Train ``model`` for up to ``num_epochs`` epochs, validating after each one.

    Args:
        model: Module called as ``model(sequences, features)``.
        train_loader: Iterable of batches (mappings with ``'features'``,
            ``'sequence'`` and ``'label'``) used for optimisation.
        val_loader: Iterable of batches in the same format, used for validation.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Maximum number of epochs.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function no longer depends on a notebook-global ``device``.
        patience: When ``None`` (default) all epochs run and the last-epoch
            weights are kept. When an integer, training stops after that many
            consecutive epochs without a validation-loss improvement and the
            weights of the best epoch are restored.

    Returns:
        The same ``model`` object, trained in place and left in eval mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    best_val_loss = float('inf')
    best_model_state = None
    stale_epochs = 0

    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer,
            desc=f'Epoch {epoch+1}/{num_epochs} - Training',
        )
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%')
        print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%')

        if patience is None:
            continue

        # Early stopping on validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Clone every tensor: ``state_dict().copy()`` is shallow and would
            # keep pointing at the weights that later epochs overwrite.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                print('Early stopping triggered')
                break

    # Restore the best checkpoint (only recorded when early stopping is on).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model

### Entrenando el modelo

In [169]:
print("Iniciando entrenamiento...")

# Arguments follow train_model's positional order; the returned network is
# rebound to `model` for the cells below.
model = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

Iniciando entrenamiento...


Epoch 1/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 629.90it/s]


Epoch 1/10:
Train Loss: 1.3522 | Train Acc: 48.47%
Val Loss: 1.1420 | Val Acc: 54.02%


Epoch 2/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 677.01it/s]


Epoch 2/10:
Train Loss: 1.1382 | Train Acc: 54.30%
Val Loss: 1.1174 | Val Acc: 55.29%


Epoch 3/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 702.87it/s]


Epoch 3/10:
Train Loss: 1.1161 | Train Acc: 55.60%
Val Loss: 1.0983 | Val Acc: 55.98%


Epoch 4/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 693.47it/s]


Epoch 4/10:
Train Loss: 1.0872 | Train Acc: 56.63%
Val Loss: 1.1032 | Val Acc: 54.80%


Epoch 5/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 699.11it/s]


Epoch 5/10:
Train Loss: 1.0656 | Train Acc: 57.79%
Val Loss: 1.0987 | Val Acc: 55.96%


Epoch 6/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 576.64it/s]


Epoch 6/10:
Train Loss: 1.0410 | Train Acc: 58.99%
Val Loss: 1.0726 | Val Acc: 56.92%


Epoch 7/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 577.95it/s]


Epoch 7/10:
Train Loss: 1.0212 | Train Acc: 60.04%
Val Loss: 1.1083 | Val Acc: 55.58%


Epoch 8/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 701.94it/s]


Epoch 8/10:
Train Loss: 1.0001 | Train Acc: 60.70%
Val Loss: 1.0915 | Val Acc: 56.64%


Epoch 9/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 691.66it/s]


Epoch 9/10:
Train Loss: 0.9821 | Train Acc: 62.07%
Val Loss: 1.0860 | Val Acc: 57.31%


Epoch 10/10 - Training: 100%|██████████| 947/947 [00:01<00:00, 746.96it/s]


Epoch 10/10:
Train Loss: 0.9509 | Train Acc: 62.88%
Val Loss: 1.0922 | Val Acc: 57.23%


### Evaluamos el modelo

In [171]:
# Score the trained network on the test loader.
model.eval()  # switch the module to evaluation mode
test_total = 0
test_correct = 0

with torch.no_grad():  # no gradients are needed for scoring
    for batch in test_loader:
        sequences = batch['sequence'].to(device)
        features = batch['features'].to(device)
        labels = batch['label'].to(device)

        outputs = model(sequences, features)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.max(dim=1).indices
        test_correct += (predicted == labels).sum().item()
        test_total += labels.shape[0]

# Accuracy as a percentage of correctly classified test samples.
test_acc = 100.0 * test_correct / test_total
print(f'\nTest Accuracy: {test_acc:.2f}%')


Test Accuracy: 58.14%


In [174]:
# Persist only the learned weights (state_dict), not the whole module object.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here.
import os

model_path = 'models/first-model.pth'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)