In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw video-game sales table.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = 'D:/Archivos Personales/Courses/Data Science/Projects and Competitions/Video Game Sales/vgsales.csv'
X = pd.read_csv(DATA_PATH)

# Discard every row whose target value (NA_Sales) is missing.
X = X.dropna(axis=0, subset=['NA_Sales'])

# The prediction target is the NA_Sales column.
# NOTE(review): X keeps the NA_Sales column after this assignment, so the
# target must be excluded from the feature set before a model is fitted.
Y = X['NA_Sales']

# 80% training / 20% validation split; random_state=0 makes it repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [97]:
# Count the missing values in every column to judge whether dropping an
# entire column is a reasonable option.
nanValuesPerColumn = X.isna().sum()
nanValuesPerColumn

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [None]:
# Collect the names of the object-dtype (categorical) columns of the training
# split, then show how many distinct values each of them holds.
categorical_columns = X_train_full.select_dtypes(include='object').columns.tolist()
X_train_full[categorical_columns].nunique()

Name         9677
Platform       30
Genre          12
Publisher     522
dtype: int64

In [None]:
# Candidate encoding for the high-cardinality 'Name' column: bag-of-words
# counts restricted to the 50 most frequent title tokens, with English stop
# words removed.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=50, stop_words='english')

# Fit on the training split only, so the vocabulary is not learned from
# validation rows. Missing names are treated as empty strings.
# (To encode the validation split, reuse this fitted vectorizer with
# `vectorizer.transform(...)` rather than fitting again.)
name_features = vectorizer.fit_transform(X_train_full['Name'].fillna(''))

# Dense DataFrame with one column per vocabulary token. The training index is
# reused so that a later join/concat with X_train_full lines up row for row;
# a default RangeIndex would not match the shuffled split index.
name_df = pd.DataFrame(
    name_features.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=X_train_full.index,
)

# Collapse the high-cardinality 'Name' column into a small 'franchise'
# category: titles that mention a known franchise get that franchise's name,
# every other title gets 'Other'.
import re


def extract_franchise(names, franchises, other_label='Other'):
    """Label each game title with a known franchise mentioned in it.

    Matching is a case-insensitive substring search. When a title mentions
    several franchises, the leftmost match in the title wins; for matches
    starting at the same position, the franchise listed first wins.

    Parameters
    ----------
    names : pandas.Series of str
        Game titles. Missing titles are labelled ``other_label``.
    franchises : list of str
        Franchise names, spelled the way they should appear in the result.
    other_label : str, default 'Other'
        Label for titles that match no franchise.

    Returns
    -------
    pandas.Series
        Same index as ``names``; each value is one of ``franchises`` (in its
        listed spelling) or ``other_label``.
    """
    # Lower-cased lookup so any casing of a match maps back to the listed
    # spelling. str.title() is avoided on purpose: it would turn "FIFA" into
    # "Fifa" and "Assassin's Creed" into "Assassin'S Creed".
    canonical = {franchise.lower(): franchise for franchise in franchises}
    pattern = '|'.join(re.escape(franchise) for franchise in franchises)
    matched = names.str.extract(f'({pattern})', flags=re.IGNORECASE, expand=False)
    return matched.str.lower().map(canonical).fillna(other_label)


# First candidate list (15 franchises).
franquicias_top = [
    "Mario", "Tetris", "Pokemon", "Call of Duty", "Grand Theft Auto",
    "FIFA", "Wii", "The Sims", "Need for Speed", "Final Fantasy",
    "Lego", "Zelda", "Assassin's Creed", "Madden NFL", "Minecraft"
]

# Second candidate list (12 franchises).
franquicias_alt = [
    "Mario", "FIFA", "Lego", "Zelda", "Final Fantasy", "Pokemon",
    "Castlevania", "Gran Turismo", "NBA", "Sonic", "Star Wars", "Call of Duty"
]

# Combined regex for the first list, kept available under its original name.
patron = '|'.join([re.escape(franq) for franq in franquicias_top])

# Previously both lists were applied one after the other to the same column,
# so only the second one had any effect. That choice is now explicit: pass
# `franquicias_top` instead to switch lists.
# The column is added to both splits so they can be encoded identically.
for split in (X_train_full, X_valid_full):
    split['franchise'] = extract_franchise(split['Name'], franquicias_alt)

# Next step: encode 'franchise' with one-hot encoding or target encoding.


In [None]:
# Number of training rows per platform, most frequent platform first.
platform_counts = X_train_full['Platform'].value_counts()
platform_counts

Platform
PS2     1758
DS      1730
Wii     1060
PS3     1053
X360    1026
PSP      962
PS       952
PC       775
XB       663
GBA      649
GC       448
3DS      410
PSV      320
PS4      264
N64      259
SNES     185
XOne     167
SAT      137
WiiU     113
2600     107
NES       80
GB        76
DC        40
GEN       22
NG        10
SCD        4
WS         4
3DO        2
PCFX       1
GG         1
Name: count, dtype: int64

In [None]:
# Names of the 14 platforms with the most rows in the training split.
platform_frequency = X_train_full['Platform'].value_counts()
platform_frequency.nlargest(14).index

Index(['PS2', 'DS', 'Wii', 'PS3', 'X360', 'PSP', 'PS', 'PC', 'XB', 'GBA', 'GC',
       '3DS', 'PSV', 'PS4'],
      dtype='object', name='Platform')

In [None]:
# Reduce 'Platform' to 15 categories: the 14 most frequent platforms in the
# training split keep their own name, every other platform becomes 'Other'.
topPlatforms = list(X_train_full['Platform'].value_counts().nlargest(14).index)

# The top list is learned from the training split only and then applied to
# both splits, so the validation data gets the same column with the same
# categories. `where` keeps the value where the condition holds and
# substitutes 'Other' elsewhere (vectorized; no per-row lambda).
for split in (X_train_full, X_valid_full):
    split['PlatformReduced'] = split['Platform'].where(
        split['Platform'].isin(topPlatforms), 'Other'
    )

X_train_full['PlatformReduced'].value_counts()

PlatformReduced
PS2      1758
DS       1730
Other    1208
Wii      1060
PS3      1053
X360     1026
PSP       962
PS        952
PC        775
XB        663
GBA       649
GC        448
3DS       410
PSV       320
PS4       264
Name: count, dtype: int64

In [None]:
# Same reduction as for 'Platform', applied to 'Publisher': keep the 20 most
# frequent publishers of the training split and label every other publisher
# (including missing ones, which value_counts ignores) as 'Other'.
top_publishers = X_train_full['Publisher'].value_counts().nlargest(20).index

# Learned on the training split, applied to both splits so they share the
# same column and categories.
for split in (X_train_full, X_valid_full):
    split['Publisher_reduced'] = split['Publisher'].where(
        split['Publisher'].isin(top_publishers), 'Other'
    )

In [42]:
# Number of training rows per publisher, most frequent publisher first.
publisher_counts = X_train_full['Publisher'].value_counts()
publisher_counts

Publisher
Electronic Arts                 1082
Activision                       788
Namco Bandai Games               748
Ubisoft                          735
Konami Digital Entertainment     671
                                ... 
Men-A-Vision                       1
Tetris Online                      1
fonfun                             1
FuRyu Corporation                  1
Yamasa Entertainment               1
Name: count, Length: 522, dtype: int64