In [None]:
# Bibliotecas utilizadas
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [23]:
# Locations of the two source workbooks.
file_path1 = 'games_data0.xlsx'
file_path2 = 'games_data.xlsx'

# Read the "Games" sheet from each workbook.
games_df0 = pd.read_excel(file_path1, sheet_name='Games')
games_df = pd.read_excel(file_path2, sheet_name='Games')

# Stack both tables row-wise (a concatenation, not a key-based merge) and
# renumber the index so it runs continuously over the combined rows.
frames = [games_df0, games_df]
merged_games_df = pd.concat(frames, ignore_index=True)

# Persist the combined table to a new workbook, without the index column.
merged_games_df.to_excel('merged_games_data.xlsx', sheet_name='Games', index=False)

In [24]:
# Target column and the feature columns used to predict it.
target = 'popularity'
categorical_features = ['category', 'genres', 'game_modes', 'platforms', 'player_perspectives', 'themes']
numerical_features = ['total_rating', 'total_rating_count']

# Drop rows missing any numeric feature OR the target. Filtering only on the
# numeric features would let a row with a missing target reach `fit`, where
# the regressors reject NaN in y.
# NOTE(review): rows with missing categorical values are kept; confirm that the
# encoder's treatment of missing categories is what is intended.
games_cleaned = merged_games_df.dropna(subset=numerical_features + [target])

# Split the cleaned table into the feature matrix (X) and the target vector (y).
X = games_cleaned[categorical_features + numerical_features]
y = games_cleaned[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. Categories first seen at predict time are ignored
# (encoded as all zeros) instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [25]:
# Each pipeline gets its own clone of the (unfitted) preprocessor. Pipeline
# does not copy its steps, so passing the same `preprocessor` object to all
# three would make them share one fitted transformer: refitting any single
# pipeline would silently change the preprocessing used by the other two.

# Linear Regression pipeline
linear_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                  ('model', LinearRegression())])

# Decision Tree pipeline
tree_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                 ('model', DecisionTreeRegressor(random_state=24))])

# Random Forest pipeline
random_forest_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                         ('model', RandomForestRegressor(random_state=24, n_estimators=100))])

In [26]:
# Fit each pipeline on the training split and predict on the held-out split.

# Linear Regression
linear_pipeline.fit(X_train, y_train)
y_pred_linear = linear_pipeline.predict(X_test)

# Decision Tree
tree_pipeline.fit(X_train, y_train)
y_pred_tree = tree_pipeline.predict(X_test)

# Random Forest
random_forest_pipeline.fit(X_train, y_train)
y_pred_forest = random_forest_pipeline.predict(X_test)

# Evaluate the models on the test split.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, while the explicit
# square root works on every version.
linear_r2 = r2_score(y_test, y_pred_linear)
linear_rmse = mean_squared_error(y_test, y_pred_linear) ** 0.5

tree_r2 = r2_score(y_test, y_pred_tree)
tree_rmse = mean_squared_error(y_test, y_pred_tree) ** 0.5

forest_r2 = r2_score(y_test, y_pred_forest)
forest_rmse = mean_squared_error(y_test, y_pred_forest) ** 0.5

# Report the results. RMSE is printed in scientific notation because the
# errors are small enough that four fixed decimals round them to 0.0000,
# making the models indistinguishable.
print("Regressão Linear:")
print(f"R²: {linear_r2:.4f}, RMSE: {linear_rmse:.4e}")

print("\nÁrvore de Decisão:")
print(f"R²: {tree_r2:.4f}, RMSE: {tree_rmse:.4e}")

print("\nRandom Forest:")
print(f"R²: {forest_r2:.4f}, RMSE: {forest_rmse:.4e}")


Regressão Linear:
R²: 0.4764, RMSE: 0.0001

Árvore de Decisão:
R²: 0.7708, RMSE: 0.0000

Random Forest:
R²: 0.8408, RMSE: 0.0000




In [28]:
# Pull the fitted Random Forest estimator out of its pipeline.
rf_model = random_forest_pipeline.named_steps['model']

# Per-column importances, one value per column the model was trained on.
importances = rf_model.feature_importances_

# Recover the column names produced by the one-hot encoder. The encoder is
# looked up by its transformer name ('cat') rather than by position, so the
# lookup keeps working if transformers are added or reordered.
fitted_preprocessor = random_forest_pipeline.named_steps['preprocessor']
cat_encoder = fitted_preprocessor.named_transformers_['cat']
cat_columns = cat_encoder.get_feature_names_out(categorical_features)

# The preprocessor emits the numeric columns first, then the encoded
# categorical ones, so the labels are concatenated in that same order.
features = numerical_features + list(cat_columns)

# Guard against silently mislabeling importances if the preprocessor layout
# ever stops matching the label list built above.
assert len(features) == len(importances), (
    f"feature/importance length mismatch: {len(features)} names vs {len(importances)} importances"
)

# Pair each feature name with its importance.
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})

# Sort from most to least important.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Show the 10 most important features.
print("\nVariáveis Mais Importantes para a Popularidade (Random Forest):")
print(importance_df.head(10))


Top 10 Variáveis Mais Importantes para a Popularidade (Random Forest):
                                               Feature  Importance
1                                   total_rating_count    0.756502
0                                         total_rating    0.028771
423                         themes_Warfare, Historical    0.016395
324             themes_Action, Science fiction, Horror    0.015572
298                   player_perspectives_Third person    0.015108
200  platforms_console, operating_system, console, ...    0.011553
181  platforms_computer, operating_system, operatin...    0.009767
166  game_modes_Single player, Multiplayer, Co-oper...    0.009471
87                genres_Role-playing (RPG), Adventure    0.005433
165              game_modes_Single player, Multiplayer    0.004963
