In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Folder that holds the input files, relative to the notebook's working directory.
data_dir = "../data/"

# Read the book details table into the working DataFrame.
df = pd.read_csv(f"{data_dir}books_details.csv")

In [3]:
# Count how many rows each category has.
contagem_categorias = df['categoria'].value_counts()

# Categories appearing fewer times than this threshold are lumped together.
limite_frequencia = 10
categorias_a_manter = contagem_categorias[contagem_categorias >= limite_frequencia].index

# Keep frequent categories unchanged; every other value becomes 'Outras'.
# Missing categories also end up as 'Outras', because value_counts() skips
# NaN and therefore isin() never matches them.
df['categoria_agrupada'] = np.where(
    df['categoria'].isin(categorias_a_manter),
    df['categoria'],
    'Outras'
)

# One-hot encode the grouped column. drop_first=True omits the first
# category level, which then acts as the implicit baseline.
df_dummies = pd.get_dummies(df['categoria_agrupada'], prefix='cat', drop_first=True)

# Attach the indicator columns to the main frame. Any columns with the same
# names left over from a previous execution of this cell are dropped first,
# so re-running the cell does not append duplicate 'cat_*' columns.
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

In [4]:
# Columns used for the exploratory correlation matrix.
features = [
    # Numeric book attributes (the price is the later prediction target).
    'preço',
    'rating',
    'disponibilidade',
    # One-hot category indicators (prefix 'cat_').
    'cat_Business',
    'cat_Childrens',
    'cat_Classics',
    'cat_Default',
    'cat_Fantasy',
    'cat_Fiction',
    'cat_Food and Drink',
    'cat_Historical Fiction',
    'cat_History',
    'cat_Horror',
    'cat_Humor',
    'cat_Music',
    'cat_Mystery',
    'cat_Nonfiction',
    'cat_Outras',
    'cat_Philosophy',
    'cat_Poetry',
    'cat_Romance',
    'cat_Science',
    'cat_Science Fiction',
    'cat_Sequential Art',
    'cat_Thriller',
    'cat_Travel',
    'cat_Womens Fiction',
    'cat_Young Adult',
]

In [5]:
# Pairwise correlation (Pearson, the pandas default) between the selected columns.
df.loc[:, features].corr()

Unnamed: 0,preço,rating,disponibilidade,cat_Business,cat_Childrens,cat_Classics,cat_Default,cat_Fantasy,cat_Fiction,cat_Food and Drink,...,cat_Philosophy,cat_Poetry,cat_Romance,cat_Science,cat_Science Fiction,cat_Sequential Art,cat_Thriller,cat_Travel,cat_Womens Fiction,cat_Young Adult
preço,1.0,0.028166,-0.010914,-0.019923,-0.029108,0.014215,-0.019869,0.070345,0.018192,-0.044524,...,-0.011045,0.008711,-0.014991,-0.016354,-0.011196,-0.009822,-0.026562,0.034504,0.015672,0.006266
rating,0.028166,1.0,0.016166,-0.000487,-0.036427,-0.043598,-0.025821,0.025102,0.048094,-0.00282,...,-0.041131,0.058541,-0.039095,0.000463,-0.059835,0.009993,-0.014392,-0.014392,0.017847,0.062184
disponibilidade,-0.010914,0.016166,1.0,0.048717,-0.021051,-0.140116,0.019752,-0.033174,0.021513,0.063737,...,-0.021094,0.093162,-0.030303,-0.022875,-0.017429,0.028298,0.041671,-0.00922,-0.075204,0.000321
cat_Business,-0.019923,-0.000487,0.048717,1.0,-0.019046,-0.015337,-0.046659,-0.024747,-0.029058,-0.019381,...,-0.011623,-0.015337,-0.020989,-0.013132,-0.014053,-0.031381,-0.011623,-0.011623,-0.014493,-0.026331
cat_Childrens,-0.029108,-0.036427,-0.021051,-0.019046,1.0,-0.024051,-0.073167,-0.038805,-0.045566,-0.030392,...,-0.018226,-0.024051,-0.032912,-0.020593,-0.022037,-0.04921,-0.018226,-0.018226,-0.022727,-0.04129
cat_Classics,0.014215,-0.043598,-0.140116,-0.015337,-0.024051,1.0,-0.05892,-0.03125,-0.036694,-0.024475,...,-0.014677,-0.019368,-0.026504,-0.016583,-0.017746,-0.039628,-0.014677,-0.014677,-0.018302,-0.03325
cat_Default,-0.019869,-0.025821,0.019752,-0.046659,-0.073167,-0.05892,1.0,-0.095066,-0.111628,-0.074456,...,-0.04465,-0.05892,-0.08063,-0.050449,-0.053987,-0.120555,-0.04465,-0.04465,-0.055676,-0.101152
cat_Fantasy,0.070345,0.025102,-0.033174,-0.024747,-0.038805,-0.03125,-0.095066,1.0,-0.059204,-0.039489,...,-0.023681,-0.03125,-0.042763,-0.026756,-0.028633,-0.063938,-0.023681,-0.023681,-0.029529,-0.053648
cat_Fiction,0.018192,0.048094,0.021513,-0.029058,-0.045566,-0.036694,-0.111628,-0.059204,1.0,-0.046369,...,-0.027807,-0.036694,-0.050214,-0.031418,-0.033621,-0.075078,-0.027807,-0.027807,-0.034674,-0.062994
cat_Food and Drink,-0.044524,-0.00282,0.063737,-0.019381,-0.030392,-0.024475,-0.074456,-0.039489,-0.046369,1.0,...,-0.018547,-0.024475,-0.033492,-0.020956,-0.022425,-0.050077,-0.018547,-0.018547,-0.023127,-0.042017


In [11]:
# Column the regression models are trained to predict.
TARGET = 'preço'

# Model inputs: every column listed in `features` except the target.
# Deriving the list (instead of repeating all the column names by hand)
# keeps the exploratory and the modelling column sets from drifting apart.
FEATURES = [coluna for coluna in features if coluna != TARGET]

In [12]:
# Parse the target as numeric *before* filtering: values that cannot be parsed
# become NaN and are dropped together with rows whose target was already
# missing. Rows without a usable target are removed rather than mean-filled,
# since an imputed target would be a fabricated label for training/evaluation.
# The trailing copy() gives later cells an independent frame to assign
# columns on.
df = (
    df.assign(**{TARGET: pd.to_numeric(df[TARGET], errors='coerce')})
    .dropna(subset=[TARGET])
    .copy()
)

In [13]:
# Force the two numeric predictors to numbers; entries that are missing or
# cannot be parsed are replaced with 0.
df = df.assign(**{
    coluna: pd.to_numeric(df[coluna], errors='coerce').fillna(0)
    for coluna in ('rating', 'disponibilidade')
})


In [14]:
# Restrict the frame to the model inputs plus the target, then discard any
# row that still contains a missing value after the conversions above.
df_model = df[[*FEATURES, TARGET]].copy().dropna()

# Predictor matrix and target vector.
X, y = df_model[FEATURES], df_model[TARGET]

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
print(f"Dados de Treino: {len(X_train)} amostras")
print(f"Dados de Teste: {len(X_test)} amostras")

Dados de Treino: 800 amostras
Dados de Teste: 200 amostras


In [16]:
def treinar_e_avaliar(modelo, X_treino, y_treino, X_teste, y_teste, nome_modelo):
    """Fit a model, predict on the test data and report RMSE and R².

    Args:
        modelo: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_treino: Training predictors.
        y_treino: Training target values.
        X_teste: Test predictors.
        y_teste: Test target values.
        nome_modelo: Label used in the printed report and in the result.

    Returns:
        dict with the keys ``'Modelo'`` (the given label), ``'RMSE'`` and
        ``'R2'``, both computed on the test data.
    """
    # Fit on the training data, then predict the test rows.
    modelo.fit(X_treino, y_treino)
    previsoes = modelo.predict(X_teste)

    # Test-set metrics: RMSE is the square root of the mean squared error.
    erro_quadratico_medio = mean_squared_error(y_teste, previsoes)
    rmse = np.sqrt(erro_quadratico_medio)
    r2 = r2_score(y_teste, previsoes)

    # Assemble the summary record before printing the report.
    resumo = dict(Modelo=nome_modelo, RMSE=rmse, R2=r2)

    print(f"\n--- Resultados para {nome_modelo} ---")
    print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
    print(f"R² (Coeficiente de Determinação): {r2:.4f}")

    return resumo


In [17]:
# Candidate regressors, listed in the order they are evaluated.
# 5.1. Plain linear regression.
lr = LinearRegression()
# 5.2. Random forest: an ensemble method.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# 5.3. Gradient boosting: the third algorithm, also an ensemble method.
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train and score each model on the same split, collecting one record per model.
resultados = [
    treinar_e_avaliar(modelo, X_train, y_train, X_test, y_test, nome)
    for modelo, nome in (
        (lr, "Regressão Linear"),
        (rf, "Random Forest"),
        (gbr, "Gradient Boosting"),
    )
]

# Individual records, kept under their own names as well.
res_lr, res_rf, res_gbr = resultados


--- Resultados para Regressão Linear ---
RMSE (Root Mean Squared Error): 14.9104
R² (Coeficiente de Determinação): -0.0583

--- Resultados para Random Forest ---
RMSE (Root Mean Squared Error): 17.2032
R² (Coeficiente de Determinação): -0.4089

--- Resultados para Gradient Boosting ---
RMSE (Root Mean Squared Error): 15.5454
R² (Coeficiente de Determinação): -0.1504


In [18]:
# One row per model, with its RMSE and R².
df_resultados = pd.DataFrame(resultados)

# Print the models ranked from highest to lowest R².
# NOTE(review): in the recorded run every model has a negative R², i.e. each
# one scores worse on the test set than always predicting the mean price.
print("\n==============================================")
print("             COMPARAÇÃO FINAL DE MODELOS")
print("==============================================")
print(df_resultados.sort_values('R2', ascending=False))


             COMPARAÇÃO FINAL DE MODELOS
              Modelo       RMSE        R2
0   Regressão Linear  14.910421 -0.058349
2  Gradient Boosting  15.545442 -0.150417
1      Random Forest  17.203203 -0.408860
