# Deep Learning Model
## Luis Arturo
### A01703572

# Conexión a GPU local

In [33]:
import setuptools
import tensorflow as tf
import pandas as pd
import os

In [34]:
def verify_gpu_setup():
    """Comprehensive verification of TensorFlow GPU setup"""
    # Print TensorFlow version
    print(f"TensorFlow version: {tf.__version__}")
    
    # Check CUDA availability
    print("\nCUDA available:", tf.test.is_built_with_cuda())
    
    # List all available GPUs
    physical_devices = tf.config.list_physical_devices()
    print("\nAvailable devices:")
    for device in physical_devices:
        print(f"  {device.device_type}: {device.name}")
    
    # Check if GPU is available and perform a simple computation
    if tf.test.is_built_with_cuda():
        try:
            # Create some random data
            with tf.device('/GPU:0'):
                a = tf.random.normal([1000, 1000])
                b = tf.random.normal([1000, 1000])
                # Perform matrix multiplication
                c = tf.matmul(a, b)
                # Force execution
                result = c.numpy()
                print("\n✅ Successfully performed computation on GPU!")
                print(f"Matrix multiplication shape: {result.shape}")
        except Exception as e:
            print("\n❌ Error during GPU computation:")
            print(e)
    else:
        print("\n❌ CUDA is not available in this TensorFlow installation")

    # Print GPU memory info
    try:
        gpu = tf.config.list_physical_devices('GPU')[0]
        print("\nGPU Device:", gpu)
    except IndexError:
        print("\n❌ No GPU devices found")

verify_gpu_setup()

TensorFlow version: 2.18.0

CUDA available: True

Available devices:
  CPU: /physical_device:CPU:0

✅ Successfully performed computation on GPU!
Matrix multiplication shape: (1000, 1000)

❌ No GPU devices found


In [35]:
# Set TF logging level to avoid unnecessary messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# ETL

## Carga de datos

In [36]:
# Configuración para mostrar todas las columnas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [37]:
# 1. EXTRACT - Carga de datos
print("=== CARGANDO DATASETS ===")
games_df = pd.read_csv('data/games.csv')
users_df = pd.read_csv('data/users.csv')
recommendations_df = pd.read_csv('data/recommendations.csv')

print("=== DATASETS CARGADOS ===")

=== CARGANDO DATASETS ===
=== DATASETS CARGADOS ===


## Información general del dataset

In [38]:
print("\nDimensiones del games_df:", games_df.shape)
print("\nColumnas disponibles del games_df:")
for col in games_df.columns:
    print(f"- {col}")


Dimensiones del games_df: (50872, 13)

Columnas disponibles del games_df:
- app_id
- title
- date_release
- win
- mac
- linux
- rating
- positive_ratio
- user_reviews
- price_final
- price_original
- discount
- steam_deck


In [39]:
print("\nDimensiones del users_df:", users_df.shape)
print("\nColumnas disponibles del users_df:")
for col in users_df.columns:
    print(f"- {col}")


Dimensiones del users_df: (14306064, 3)

Columnas disponibles del users_df:
- user_id
- products
- reviews


In [40]:
print("\nDimensiones del recommendations_df:", recommendations_df.shape)
print("\nColumnas disponibles del recommendations_df:")
for col in recommendations_df.columns:
    print(f"- {col}")


Dimensiones del recommendations_df: (41154794, 8)

Columnas disponibles del recommendations_df:
- app_id
- helpful
- funny
- date
- is_recommended
- hours
- user_id
- review_id


### Análisis inicial de games

In [41]:
print("\nPrimeras 5 filas de games_df:")
print(games_df.head())
print("\nInformación del dataset de juegos:")
print(games_df.info())
print("\nEstadísticas descriptivas de games_df:")
print(games_df.describe())
print("\nValores nulos en games_df:")
print(games_df.isnull().sum())


Primeras 5 filas de games_df:
   app_id                              title date_release   win    mac  linux  \
0   13500  Prince of Persia: Warrior Within™   2008-11-21  True  False  False   
1   22364            BRINK: Agents of Change   2011-08-03  True  False  False   
2  113020       Monaco: What's Yours Is Mine   2013-04-24  True   True   True   
3  226560                 Escape Dead Island   2014-11-18  True  False  False   
4  249050            Dungeon of the ENDLESS™   2014-10-27  True   True  False   

          rating  positive_ratio  user_reviews  price_final  price_original  \
0  Very Positive              84          2199         9.99            9.99   
1       Positive              85            21         2.99            2.99   
2  Very Positive              92          3722        14.99           14.99   
3          Mixed              61           873        14.99           14.99   
4  Very Positive              88          8784        11.99           11.99   

   disc

### Análisis inicial de users

In [42]:
print("\nPrimeras 5 filas de users_df:")
print(users_df.head())
print("\nInformación del dataset de usuarios:")
print(users_df.info())
print("\nEstadísticas descriptivas de users_df:")
print(users_df.describe())
print("\nValores nulos en users_df:")
print(users_df.isnull().sum())


Primeras 5 filas de users_df:
    user_id  products  reviews
0   7360263       359        0
1  14020781       156        1
2   8762579       329        4
3   4820647       176        4
4   5167327        98        2

Información del dataset de usuarios:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14306064 entries, 0 to 14306063
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   products  int64
 2   reviews   int64
dtypes: int64(3)
memory usage: 327.4 MB
None

Estadísticas descriptivas de users_df:
            user_id      products       reviews
count  1.430606e+07  1.430606e+07  1.430606e+07
mean   7.153032e+06  1.163734e+02  2.876738e+00
std    4.129805e+06  2.438515e+02  7.987421e+00
min    0.000000e+00  0.000000e+00  0.000000e+00
25%    3.576516e+06  2.300000e+01  1.000000e+00
50%    7.153032e+06  5.500000e+01  1.000000e+00
75%    1.072955e+07  1.270000e+02  3.000000e+00
max    1.430606e+07  3.221400e+04  6.045000e+03

Valores 

### Análisis inicial de recommendations

In [43]:
print("\nPrimeras 5 filas de recommendations_df:")
print(recommendations_df.head())
print("\nInformación del dataset de recomendaciones:")
print(recommendations_df.info())
print("\nEstadísticas descriptivas de recommendations_df:")
print(recommendations_df.describe())
print("\nValores nulos en recommendations_df:")
print(recommendations_df.isnull().sum())


Primeras 5 filas de recommendations_df:
    app_id  helpful  funny        date  is_recommended  hours  user_id  \
0   975370        0      0  2022-12-12            True   36.3    51580   
1   304390        4      0  2017-02-17           False   11.5     2586   
2  1085660        2      0  2019-11-17            True  336.5   253880   
3   703080        0      0  2022-09-23            True   27.4   259432   
4   526870        0      0  2021-01-10            True    7.9    23869   

   review_id  
0          0  
1          1  
2          2  
3          3  
4          4  

Información del dataset de recomendaciones:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41154794 entries, 0 to 41154793
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7  

### Análisis expecífico de features importantes

In [44]:
# Análisis de ratings
print("\nDistribución de ratings en juegos:")
print(games_df['rating'].value_counts())


Distribución de ratings en juegos:
rating
Positive                   13502
Very Positive              13139
Mixed                      12157
Mostly Positive             8738
Mostly Negative             1849
Overwhelmingly Positive     1110
Negative                     303
Very Negative                 60
Overwhelmingly Negative       14
Name: count, dtype: int64


In [45]:
# Análisis de precios
print("\nEstadísticas de precios:")
print(games_df[['price_final', 'price_original']].describe())


Estadísticas de precios:
        price_final  price_original
count  50872.000000    50872.000000
mean       8.620325        8.726788
std       11.514164       11.507021
min        0.000000        0.000000
25%        0.990000        0.990000
50%        4.990000        4.990000
75%       10.990000       11.990000
max      299.990000      299.990000


In [46]:
# Análisis de recomendaciones
print("\nDistribución de recomendaciones:")
print(recommendations_df['is_recommended'].value_counts(normalize=True))


Distribución de recomendaciones:
is_recommended
True     0.857844
False    0.142156
Name: proportion, dtype: float64


In [47]:
# Análisis de horas jugadas
print("\nEstadísticas de horas jugadas:")
print(recommendations_df['hours'].describe())


Estadísticas de horas jugadas:
count    4.115479e+07
mean     1.006022e+02
std      1.761675e+02
min      0.000000e+00
25%      7.800000e+00
50%      2.730000e+01
75%      9.920000e+01
max      1.000000e+03
Name: hours, dtype: float64


### Verificamos la integridad de referencias entre los datasets
Nos aseguramos de que los juegos y los usuarios referenciados en reseñas existen en el dataset de juegos y en el de usuarios

In [48]:
# Verificar si todos los app_id en recommendations existen en games
games_apps = set(games_df['app_id'])
recommendations_apps = set(recommendations_df['app_id'])
print("\nJuegos en recommendations pero no en games:", len(recommendations_apps - games_apps))


Juegos en recommendations pero no en games: 0


In [49]:
# Verificar si todos los user_id en recommendations existen en users
users_ids = set(users_df['user_id'])
recommendations_users = set(recommendations_df['user_id'])
print("Usuarios en recommendations pero no en users:", len(recommendations_users - users_ids))

Usuarios en recommendations pero no en users: 0


## Se va a crear un modelo capaz de predecir el rating que tendrá un juego ("Very Positive", "Mixed", etc.) 
## Son 9 clases 
- Positive                   13502
- Very Positive              13139
- Mixed                      12157
- Mostly Positive             8738
- Mostly Negative             1849
- Overwhelmingly Positive     1110
- Negative                     303
- Very Negative                 60
- Overwhelmingly Negative       14

## Features de entrada
- Precio
- Plataformas soportadas (win, mac, linux)
- Tiempo en el mercado
- Descuentos
- Métricas agregadas de reviews (horas promedio jugadas, ratio de recomendaciones)

## Preprocesamiendo de los datos

In [50]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

### Creamos features en base a las recomendaciones de los juegos

In [51]:
game_metrics = recommendations_df.groupby('app_id').agg({
    'hours': ['mean', 'median', 'std'],
    'is_recommended': 'mean',
    'helpful': 'mean',
    'funny': 'mean'
}).reset_index()

# Aplanar los nombres de las columnas
game_metrics.columns = ['app_id', 'avg_hours', 'median_hours', 'std_hours', 
                        'recommendation_ratio', 'avg_helpful', 'avg_funny']

### Procesamos las fechas para calcular el tiempo que el juego ha estado en el mercado

In [52]:
games_df['date_release'] = pd.to_datetime(games_df['date_release'])
reference_date = pd.to_datetime('2024-11-01')  # Fecha de referencia
games_df['days_in_market'] = (reference_date - games_df['date_release']).dt.days

### Unimos los nuevos features en un dataset final usando como base el de games

In [53]:
final_df = games_df.merge(game_metrics, on='app_id', how='left')
    
# 4. Rellenar valores nulos
final_df = final_df.fillna({
    'avg_hours': 0,
    'median_hours': 0,
    'std_hours': 0,
    'recommendation_ratio': 0.5,
    'avg_helpful': 0,
    'avg_funny': 0
})

In [54]:
final_df.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,5824,18.967912,12.9,41.722269,0.845789,5.392052,0.6121
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,4839,0.0,0.0,0.0,0.5,0.0,0.0
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,4209,20.413294,6.9,53.295053,0.908541,1.434682,0.450382
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,3636,10.776625,8.1,15.421851,0.625998,6.778791,0.815279
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,3658,40.621691,23.4,54.860085,0.885567,2.530928,0.698351


In [55]:
final_df.describe()

Unnamed: 0,app_id,date_release,positive_ratio,user_reviews,price_final,price_original,discount,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
count,50872.0,50872,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0,50872.0
mean,1055224.0,2019-03-13 03:53:57.112753664,77.052033,1824.425,8.620325,8.726788,5.592212,2059.837533,12.017546,5.630656,21.209406,0.697721,2.927259,0.496619
min,10.0,1997-06-30 00:00:00,0.0,10.0,0.0,0.0,0.0,374.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,528737.5,2017-03-21 00:00:00,67.0,19.0,0.99,0.99,0.0,1137.0,0.0,0.0,0.0,0.5,0.0,0.0
50%,986085.0,2019-08-23 00:00:00,81.0,49.0,4.99,4.99,0.0,1897.0,3.436826,1.5,4.186043,0.714286,2.178243,0.131972
75%,1524895.0,2021-09-21 00:00:00,91.0,206.0,10.99,11.99,0.0,2782.0,10.730804,5.0,21.601544,0.881546,4.0,0.482362
max,2599300.0,2023-10-24 00:00:00,100.0,7494460.0,299.99,299.99,90.0,9986.0,487.0,511.5,495.120177,1.0,268.285714,266.285714
std,610324.9,,18.253592,40073.52,11.514164,11.507021,18.606679,1125.133076,28.005183,16.127001,41.136308,0.204218,4.689921,2.481174


In [56]:
final_df.max()

app_id                              2599300
title                      🧠 OUT OF THE BOX
date_release            2023-10-24 00:00:00
win                                    True
mac                                    True
linux                                  True
rating                        Very Positive
positive_ratio                          100
user_reviews                        7494460
price_final                          299.99
price_original                       299.99
discount                               90.0
steam_deck                             True
days_in_market                         9986
avg_hours                             487.0
median_hours                          511.5
std_hours                        495.120177
recommendation_ratio                    1.0
avg_helpful                      268.285714
avg_funny                        266.285714
dtype: object

### Se encontró un juego que posee métricas exageradas, al compararlo con Steam nos dimos cuenta de que parece ser un error
En base a esto imprimimos todos los juegos que tienen un precion final mayor a 70

In [73]:
juegos_filtrados = final_df[final_df['price_final'] > 70]

In [74]:
juegos_filtrados

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
81,438450,3DF Zephyr Lite Steam Edition,2016-02-02,True,False,False,Very Positive,96,58,199.99,199.99,0.0,True,3195,243.356250,121.35,265.982864,1.000000,4.979167,0.750000
144,537770,Gal*Gun: Double Peace - 'Pheromone Z' Item,2016-10-20,True,False,False,Positive,91,12,89.99,89.99,0.0,True,2934,0.000000,0.00,0.000000,0.500000,0.000000,0.000000
190,554820,VideoPad Video Editor,2016-12-01,True,True,False,Very Positive,80,51,99.99,99.99,0.0,True,2892,221.103030,130.70,238.796225,0.787879,3.212121,0.242424
403,1076160,Command: Modern Operations,2019-11-14,True,False,False,Very Positive,84,897,79.99,79.99,0.0,True,1814,126.815050,49.00,186.176124,0.859532,12.749164,5.050167
675,1182920,Movavi Video Editor Plus 2020 - Video Editing ...,2019-11-25,True,True,False,Very Positive,80,874,74.99,74.99,0.0,True,1803,111.442762,56.80,161.577714,0.788419,1.817372,0.227171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47761,411893,DCS: F-14A/B Tomcat,2019-03-13,True,False,False,Very Positive,92,555,79.99,79.99,0.0,True,2060,0.000000,0.00,0.000000,0.500000,0.000000,0.000000
48350,1096900,RPG Maker MZ,2020-08-27,True,True,False,Very Positive,83,1070,80.00,0.00,0.0,True,1527,186.038028,72.85,249.288267,0.791080,16.969484,3.521127
49163,2199970,Substance 3D Painter 2023,2023-01-23,True,True,True,Very Positive,85,148,150.00,0.00,0.0,True,648,0.000000,0.00,0.000000,0.500000,0.000000,0.000000
49306,2070990,VEGAS Edit 20 Steam Edition,2022-11-01,True,False,False,Mixed,66,21,129.48,249.00,48.0,True,731,68.566667,5.60,113.331475,0.666667,19.333333,5.333333


Al ser solo 150 juegos, decidimos tirarlos para evitar conflictor

In [75]:
final_df = final_df[final_df['price_final'] <= 70].reset_index(drop=True)


In [77]:
final_df = final_df[final_df['price_original'] <= 70].reset_index(drop=True)


In [78]:
final_df.max()

app_id                               2585630
title                   🟥🟧🟨[SUDOKOLORFUL]🟩🟦🟪
date_release             2023-10-24 00:00:00
win                                     True
mac                                     True
linux                                   True
rating                         Very Positive
positive_ratio                           100
user_reviews                         7494460
price_final                             70.0
price_original                         69.99
discount                                90.0
steam_deck                              True
days_in_market                          9986
avg_hours                              487.0
median_hours                           511.5
std_hours                         495.120177
recommendation_ratio                     1.0
avg_helpful                       268.285714
avg_funny                         266.285714
dtype: object

### Parece ser que hay juegos con nombres repetidos, eliminamos aquellos que esten repetidos ya que solo son 121 títulos problemáticos

In [80]:
titulos_duplicados = final_df['title'].duplicated().sum()
print(f"Número de títulos duplicados: {titulos_duplicados}")
titulos_repetidos = final_df[final_df['title'].duplicated(keep=False)]['title'].unique()
print(f"Títulos duplicados: {titulos_repetidos}")

Número de títulos duplicados: 121
Títulos duplicados: ['Prison Wars' 'The Good Life' 'RUSH' 'Hide and Seek' 'Remnants'
 'Blade of Darkness' 'The Cleaner' 'Rogue' 'Lighthouse Keeper' 'Valor'
 'Last Stop' 'fishy' 'Flashback' 'The Hotel' '3D Organon Anatomy'
 'Minotaur' 'Northern Lights' 'First Snow' 'Momentum' 'Alter Ego'
 'The Backrooms' 'Locked Up' 'Get Stuffed!' 'Ascent' 'The Hunt' 'Hatch'
 'Lost' 'Warhammer Quest' 'Apollo 11 VR' 'Psych' 'Eternal Return'
 'Achievement Clicker' 'Call of Duty®' 'Causality' 'The Lost Village'
 'Resonance' 'Dead Forest' 'Lost Marbles' 'Wanderer' 'Castles' 'Grapple'
 'Chaos Theory' 'Dungeon Warriors' 'Bounce' 'Evolution' 'Zombie Survivors'
 'Ritual' 'Archery Simulator' 'Outpost' 'Shutter' 'The Line' 'The Village'
 'The Wanderer' 'Fantasy Gladiators' 'Cave Crawler' 'Cursed'
 'A Walk in the Woods' 'Dark Matter' 'Beyond the Wall' 'White Mirror'
 'Zombie Apocalypse' 'RIFT' 'STAY' 'Silent World' 'REALITY' 'Tomorrow'
 'Arachnophobia' 'Vaccine' 'Arena' 'Dog Adven

In [81]:
titulos_unicos = ~final_df['title'].duplicated(keep=False)

# Filtrar el DataFrame para conservar solo los títulos únicos
final_df = final_df[titulos_unicos].reset_index(drop=True)
final_df.describe()

Unnamed: 0,app_id,date_release,positive_ratio,user_reviews,price_final,price_original,discount,days_in_market,avg_hours,median_hours,std_hours,recommendation_ratio,avg_helpful,avg_funny
count,50468.0,50468,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0,50468.0
mean,1053170.0,2019-03-10 16:29:02.220813312,77.107296,1827.623,8.334205,8.418794,5.594,2062.313169,11.962724,5.609404,21.156393,0.69811,2.921866,0.494643
min,10.0,1997-06-30 00:00:00,0.0,10.0,0.0,0.0,0.0,374.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,527935.0,2017-03-17 00:00:00,67.0,19.0,0.99,0.99,0.0,1141.0,0.0,0.0,0.0,0.5,0.0,0.0
50%,984050.0,2019-08-20 00:00:00,81.0,49.0,4.99,4.99,0.0,1900.0,3.46122,1.5,4.203444,0.716667,2.176471,0.131432
75%,1521585.0,2021-09-17 00:00:00,91.0,208.0,10.99,11.99,0.0,2786.0,10.755992,5.0,21.65572,0.881944,4.0,0.48
max,2585630.0,2023-10-24 00:00:00,100.0,7494460.0,70.0,69.99,90.0,9986.0,487.0,511.5,495.120177,1.0,268.285714,266.285714
std,609465.9,,18.217463,40187.71,9.882409,9.686676,18.600401,1125.298733,27.763131,16.034161,40.895701,0.204011,4.688555,2.486822


In [82]:
titulos_duplicados = final_df['title'].duplicated().sum()
print(f"Número de títulos duplicados: {titulos_duplicados}")
titulos_repetidos = final_df[final_df['title'].duplicated(keep=False)]['title'].unique()
print(f"Títulos duplicados: {titulos_repetidos}")

Número de títulos duplicados: 0
Títulos duplicados: []


### Definimos las features que se usarán para el modelo

In [83]:
features = [
    'price_final', 'price_original', 'discount',
    'win', 'mac', 'linux', 'steam_deck',
    'days_in_market',
    'avg_hours', 'median_hours', 'std_hours',
    'recommendation_ratio', 'avg_helpful', 'avg_funny'
]

### Por los valores tan diferentes que tenemos en el dataset, normalizamos los datos para que puedan ser usados por el modelo sin problemas y no haya valores extremos que hagan que los valores menores no tengan importancia

In [84]:
scaler = StandardScaler()
numeric_features = [
    'price_final', 'price_original', 'discount',
    'days_in_market', 'avg_hours', 'median_hours', 'std_hours',
    'recommendation_ratio', 'avg_helpful', 'avg_funny'
]
final_df[numeric_features] = scaler.fit_transform(final_df[numeric_features])

### Pasamos la compatibilidad con diferentes plataformas a que sean booleanas, 0 o 1

In [85]:
bool_features = ['win', 'mac', 'linux', 'steam_deck']
final_df[bool_features] = final_df[bool_features].astype(int)

### Preparamos la variable dependiente (y)

In [88]:
from tensorflow.keras.utils import to_categorical

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(final_df['rating'])
y = to_categorical(y)  # One-hot encoding para clasificación multiclase

### Preparamos las features (x)

In [89]:
X = final_df[features].values

# Embeddings

### Limpiamos los títulos

In [90]:
def clean_title(title):
    """
    Limpia y normaliza el título del juego.
    
    Args:
        title (str): Título original del juego
        
    Returns:
        str: Título limpio y normalizado
    """
    # Convertir a minúsculas
    title = title.lower()
    
    # Eliminar caracteres especiales y símbolos, manteniendo espacios
    title = re.sub(r'[^\w\s]', ' ', title)
    
    # Eliminar espacios múltiples
    title = ' '.join(title.split())
    
    return title

final_df['clean_title'] = final_df['title'].apply(clean_title)

### Creamos y configuramos el tokenizer

In [92]:
vocab_size = 10000  # Tamaño máximo del vocabulario
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(final_df['clean_title'])

### Convertimos los títulos a secuencias numéricas

In [93]:
title_sequences = tokenizer.texts_to_sequences(final_df['clean_title'])

### Determinamos el padding según la secuencia máxima que poseemos

In [95]:
title_lengths = [len(x) for x in title_sequences]
max_length = max(title_lengths)
avg_length = sum(title_lengths) / len(title_lengths)

print(f"\nEstadísticas de longitud de títulos:")
print(f"- Longitud máxima: {max_length} palabras")
print(f"- Longitud promedio: {avg_length:.2f} palabras")


Estadísticas de longitud de títulos:
- Longitud máxima: 28 palabras
- Longitud promedio: 3.56 palabras
