# Lendo dados

In [34]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA

# Load the games dataset from the CSV file, relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="dataset/games.csv")

# List the available column names.
print(df.columns)

Index(['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform',
       'Requires Special Device', 'Developer', 'Publisher', 'Release Year',
       'Genre', 'Multiplayer', 'Game Length (Hours)', 'Graphics Quality',
       'Soundtrack Quality', 'Story Quality', 'User Review Text', 'Game Mode',
       'Min Number of Players'],
      dtype='object')


# Pré-processamento dos dados

## Limpeza dos dados

- Remover colunas irrelevantes

In [29]:
# Columns treated as irrelevant for this analysis and removed from `df`.
cols_to_remove = [
    'Graphics Quality',
    'Soundtrack Quality',
    'Story Quality',
    'User Review Text',
    'Game Title',
    'Developer',
    'Publisher'
]

# Rebind `df` instead of mutating it in place, and only drop the columns that
# are still present: the cell can then be re-executed without raising KeyError
# once the columns have already been removed.
df = df.drop(columns=[col for col in cols_to_remove if col in df.columns])
df.head(8)

Unnamed: 0,User Rating,Age Group Targeted,Price,Platform,Requires Special Device,Release Year,Genre,Multiplayer,Game Length (Hours),Game Mode,Min Number of Players
0,36.4,All Ages,41.41,PC,No,2015,Adventure,No,55.3,Offline,1
1,38.3,Adults,57.56,PC,No,2015,Shooter,Yes,34.6,Offline,3
2,26.8,Teens,44.93,PC,Yes,2012,Adventure,Yes,13.9,Offline,5
3,38.4,All Ages,48.29,Mobile,Yes,2015,Sports,No,41.9,Online,4
4,30.1,Adults,55.49,PlayStation,Yes,2022,RPG,Yes,13.2,Offline,1
5,38.6,Adults,51.73,Xbox,No,2017,RPG,Yes,48.8,Offline,4
6,33.1,Adults,46.44,Mobile,No,2020,Simulation,No,36.9,Online,3
7,32.3,Teens,36.92,Nintendo Switch,No,2012,Strategy,No,52.1,Offline,3


- Remover linhas com valores ausentes

In [30]:
# Row count before removing incomplete rows.
before = df.shape[0]

# Drop, in place, every row that has at least one missing value.
df.dropna(inplace=True)

# Row count after the removal.
after = df.shape[0]

print(f"Linhas antes: {before} | depois: {after}")

df.head(8)

Linhas antes: 47774 | depois: 47774


Unnamed: 0,User Rating,Age Group Targeted,Price,Platform,Requires Special Device,Release Year,Genre,Multiplayer,Game Length (Hours),Game Mode,Min Number of Players
0,36.4,All Ages,41.41,PC,No,2015,Adventure,No,55.3,Offline,1
1,38.3,Adults,57.56,PC,No,2015,Shooter,Yes,34.6,Offline,3
2,26.8,Teens,44.93,PC,Yes,2012,Adventure,Yes,13.9,Offline,5
3,38.4,All Ages,48.29,Mobile,Yes,2015,Sports,No,41.9,Online,4
4,30.1,Adults,55.49,PlayStation,Yes,2022,RPG,Yes,13.2,Offline,1
5,38.6,Adults,51.73,Xbox,No,2017,RPG,Yes,48.8,Offline,4
6,33.1,Adults,46.44,Mobile,No,2020,Simulation,No,36.9,Online,3
7,32.3,Teens,36.92,Nintendo Switch,No,2012,Strategy,No,52.1,Offline,3


- Remover linhas duplicadas

In [31]:
# Row count before removing duplicated rows.
before = len(df)

# Drop, in place, rows that are exact duplicates of an earlier row.
df.drop_duplicates(inplace=True)

# Row count after deduplication. This must be bound to `after` (the original
# assigned to a misspelled name), otherwise the report below shows the stale
# value left over from the missing-values cell.
after = len(df)

print(f"Linhas antes: {before} | depois: {after}")
df.head(8)

Linhas antes: 47774 | depois: 47774


Unnamed: 0,User Rating,Age Group Targeted,Price,Platform,Requires Special Device,Release Year,Genre,Multiplayer,Game Length (Hours),Game Mode,Min Number of Players
0,36.4,All Ages,41.41,PC,No,2015,Adventure,No,55.3,Offline,1
1,38.3,Adults,57.56,PC,No,2015,Shooter,Yes,34.6,Offline,3
2,26.8,Teens,44.93,PC,Yes,2012,Adventure,Yes,13.9,Offline,5
3,38.4,All Ages,48.29,Mobile,Yes,2015,Sports,No,41.9,Online,4
4,30.1,Adults,55.49,PlayStation,Yes,2022,RPG,Yes,13.2,Offline,1
5,38.6,Adults,51.73,Xbox,No,2017,RPG,Yes,48.8,Offline,4
6,33.1,Adults,46.44,Mobile,No,2020,Simulation,No,36.9,Online,3
7,32.3,Teens,36.92,Nintendo Switch,No,2012,Strategy,No,52.1,Offline,3


## Transformação dos dados 


In [32]:
# Normalise 'Release Year' to a plain numeric year (not a datetime column):
# values are parsed with the '%Y' format and only the year part is kept.
# errors='coerce' turns anything that is not a valid year into a missing value.
df['Release Year'] = pd.to_datetime(df['Release Year'], format='%Y', errors='coerce').dt.year

# Coercion can reintroduce missing values after the dropna step. Fail here with
# an explicit message rather than letting NaN flow into the later stages.
if df['Release Year'].isna().any():
    raise ValueError(
        f"'Release Year' contains {int(df['Release Year'].isna().sum())} "
        "value(s) that could not be parsed as a year"
    )

# Names of all numeric columns.
numeric_features = df.select_dtypes(include=['number']).columns.tolist()
# Names of all text (categorical) columns. 'string' is listed next to 'object'
# so that columns stored with pandas' dedicated string dtype are encoded too
# instead of being passed through untouched.
categorical_features = df.select_dtypes(include=['object', 'string']).columns.tolist()

# Numeric part of the data.
df_numeric = df[numeric_features]

# Standardise every numeric column to zero mean and unit standard deviation.
scaler = StandardScaler()

# Scaled numeric columns, keeping the original column names and row index so
# the frames line up when concatenated below.
df_numeric_scaled = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    columns=numeric_features,
    index=df.index,
)

# Categorical part of the data.
df_categorical = df[categorical_features]

# One-hot encode the categorical columns (one indicator column per category).
df_categorical_encoded = pd.get_dummies(df_categorical)

# Whatever is neither numeric nor categorical is carried over unchanged.
df_temp = df.drop(columns=numeric_features + categorical_features)

# Assemble the final feature matrix column-wise.
df_processed = pd.concat([df_temp, df_numeric_scaled, df_categorical_encoded], axis=1)

df_processed.head(8)

Unnamed: 0,User Rating,Price,Release Year,Game Length (Hours),Min Number of Players,Age Group Targeted_Adults,Age Group Targeted_All Ages,Age Group Targeted_Kids,Age Group Targeted_Teens,Platform_Mobile,...,Genre_Puzzle,Genre_RPG,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy,Multiplayer_No,Multiplayer_Yes,Game Mode_Offline,Game Mode_Online
0,0.884851,0.126615,-0.367734,1.437616,-1.486467,False,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,1.136505,1.528497,-0.367734,0.13346,-0.764313,True,False,False,False,False,...,False,False,True,False,False,False,False,True,True,False
2,-0.386663,0.432164,-1.112662,-1.170695,-0.042159,False,False,False,True,False,...,False,False,False,False,False,False,False,True,True,False
3,1.14975,0.723825,-0.367734,0.59338,-0.403236,False,True,False,False,True,...,False,False,False,False,True,False,True,False,False,True
4,0.05042,1.348813,1.370431,-1.214797,-1.486467,True,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
5,1.17624,1.022431,0.128884,1.028098,-0.403236,True,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
6,0.447768,0.563238,0.873813,0.278366,-0.764313,True,False,False,False,True,...,False,False,False,True,False,False,True,False,False,True
7,0.341808,-0.263135,-1.112662,1.236007,-0.764313,False,False,False,True,False,...,False,False,False,False,False,True,True,False,True,False


## Redução de dimensionalidade

In [37]:
# Project the processed features onto their first two principal components.
# random_state is fixed so that the result is reproducible when scikit-learn
# selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)

# Fit the PCA and transform the processed feature matrix.
df_pca = pca.fit_transform(df_processed)

print("Variância explicada por cada componente:")
print(pca.explained_variance_ratio_)

# Sum of the per-component ratios: share of the total variance that is kept.
total_variancia = pca.explained_variance_ratio_.sum()
print(f"\nVariância total explicada pelos 2 componentes: {total_variancia:.2%}")

# Final frame with the two components. It is indexed by the matrix that was
# actually fed to the PCA so rows stay aligned with their source rows.
df_pca_final = pd.DataFrame(data=df_pca,
                            columns=['Componente Principal 1', 'Componente Principal 2'],
                            index=df_processed.index)

print("-" * 30)
print(f"Formato depois do PCA: {df_pca_final.shape}")
# The message now matches the number of rows displayed by head(8).
print("\n8 primeiras linhas do DataFrame com PCA:")
df_pca_final.head(8)

Variância explicada por cada componente:
[0.2218657  0.11311449]

Variância total explicada pelos 2 componentes: 33.50%
------------------------------
Formato depois do PCA: (47774, 2)

5 primeiras linhas do DataFrame com PCA:


Unnamed: 0,Componente Principal 1,Componente Principal 2
0,1.342991,-1.053754
1,1.697451,-0.932647
2,-0.558121,-1.048326
3,1.481816,-0.528728
4,0.223336,-0.313872
5,1.850386,-0.135485
6,0.744463,0.098864
7,0.654731,-1.105572
