# PROJETO 2


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier



In [2]:
# Load the recommendation log dataset and preview its first five rows.
df_recomendacao = pd.read_csv("recommendation_logs.csv")
df_recomendacao.head(n=5)

Unnamed: 0,recommendation_id,user_id,movie_id,recommendation_date,recommendation_type,recommendation_score,was_clicked,position_in_list,device_type,time_of_day,algorithm_version
0,rec_000001,user_06326,movie_0771,2025-07-03,new_releases,,False,10,Tablet,evening,v1.4
1,rec_000002,user_02180,movie_0985,2024-11-07,genre_based,0.916,False,9,Mobile,evening,v1.4
2,rec_000003,user_03535,movie_0834,2024-08-22,personalized,0.816,False,2,Tablet,evening,v1.4
3,rec_000004,user_05025,movie_0718,2024-04-12,trending,0.771,False,16,Mobile,evening,v1.2
4,rec_000005,user_06794,movie_0718,2025-10-30,similar_users,0.251,False,9,Tablet,evening,v1.3


In [3]:
# Descriptive analysis.
# First: column dtypes and non-null counts (printed by info()).
df_recomendacao.info()

# Then: summary statistics, displayed as the cell's output.
df_recomendacao.describe(percentiles=[0.25, 0.5, 0.75])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52000 entries, 0 to 51999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   recommendation_id     52000 non-null  object 
 1   user_id               52000 non-null  object 
 2   movie_id              52000 non-null  object 
 3   recommendation_date   52000 non-null  object 
 4   recommendation_type   52000 non-null  object 
 5   recommendation_score  46784 non-null  float64
 6   was_clicked           52000 non-null  bool   
 7   position_in_list      52000 non-null  int64  
 8   device_type           52000 non-null  object 
 9   time_of_day           52000 non-null  object 
 10  algorithm_version     49380 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 4.0+ MB


Unnamed: 0,recommendation_score,position_in_list
count,46784.0,52000.0
mean,0.551509,10.519365
std,0.260019,5.7645
min,0.1,1.0
25%,0.327,6.0
50%,0.553,11.0
75%,0.776,16.0
max,1.0,20.0


### Pré-processamento & features

In [4]:
# Remove the identifier columns.
# errors='ignore' keeps this cell idempotent: the result is assigned back to the
# same name, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
df_recomendacao = df_recomendacao.drop(
    columns=['recommendation_id', 'user_id', 'movie_id'],
    errors='ignore',
)

In [5]:
# Missing values: number of nulls in each column.
df_recomendacao.isna().sum()

recommendation_date        0
recommendation_type        0
recommendation_score    5216
was_clicked                0
position_in_list           0
device_type                0
time_of_day                0
algorithm_version       2620
dtype: int64

In [6]:
# Missing-value handling: discard every row that has at least one null.
df_recomendacao = df_recomendacao.dropna(axis=0, how='any')

# Re-count nulls per column to confirm that none remain.
df_recomendacao.isna().sum()

recommendation_date     0
recommendation_type     0
recommendation_score    0
was_clicked             0
position_in_list        0
device_type             0
time_of_day             0
algorithm_version       0
dtype: int64

In [7]:
# NOTE(review): every line in this cell is commented out, so it has no effect
# at runtime. It looks like an earlier manual feature-selection / one-hot
# encoding attempt; consider deleting it or reviving it explicitly.
# # Classification

# X = df_recomendacao[['position_in_list', 'device_type', 'time_of_day', 'algorithm_version', 'recommendation_type']]
# Y = df_recomendacao[['was_clicked']]

# # One-Hot Encoding
# X = pd.get_dummies(X, columns=['device_type', 'time_of_day', 'algorithm_version', 'recommendation_type'], drop_first=True)
# X.head()    

### Coluna target e divisão treino / validação / teste

In [8]:
# Build the modelling frame and carve out train / validation / test partitions.
target_col = 'was_clicked'

# Split configuration, named so the ratios and the seed can be tuned in one place.
TEST_FRACTION = 0.20  # share of all rows held out as the final test set
VAL_FRACTION = 0.25   # share of the remaining rows: 0.25 * 0.80 = 0.20 of all rows
SPLIT_SEED = 42       # fixed seed so both splits are reproducible

# Drop rows without a label and store the target as 0/1 integers.
# Built in one chain from df_recomendacao (never mutated here), so re-running
# this cell always yields the same frame.
df = (
    df_recomendacao
    .dropna(subset=[target_col])
    .assign(**{target_col: lambda frame: frame[target_col].astype(int)})
)

# Every non-target column is used as a feature.
# NOTE(review): this keeps the raw 'recommendation_date' string and the text
# categorical columns; presumably they are encoded in a later cell — confirm.
feature_cols = [c for c in df.columns if c != target_col]

# First split: hold out the test set, stratified on the target so the class
# ratio is preserved.
train_val_df, test_df = train_test_split(
    df, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=df[target_col]
)
# Second split: carve the validation set out of what is left (60/20/20 overall).
train_df, val_df = train_test_split(
    train_val_df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=train_val_df[target_col],
)

# Sanity check: the three partitions together cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# Feature frames and target arrays (the target is already int, no re-cast needed).
X_train, y_train = train_df[feature_cols], train_df[target_col].to_numpy()
X_val,   y_val   = val_df[feature_cols],   val_df[target_col].to_numpy()
X_test,  y_test  = test_df[feature_cols],  test_df[target_col].to_numpy()

print(X_train.shape, X_val.shape, X_test.shape)

(26655, 7) (8885, 7) (8886, 7)


### Seleção dos algoritmos

In [9]:

# Seed shared by every candidate estimator below.
RANDOM_STATE = 42

# Candidate models keyed by a short name. Each value is a tuple of
# (unfitted estimator, hyper-parameter grid for that estimator).
# NOTE(review): presumably consumed by a grid search in a later cell — confirm.
grids = {}

# Logistic regression: 3 x 1 x 2 = 6 combinations.
grids["LogReg"] = (
    LogisticRegression(
        solver="saga",
        max_iter=400,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
    dict(
        C=[0.1, 1.0, 10.0],
        penalty=["l2"],
        fit_intercept=[True, False],
    ),
)

# Random forest: 3 x 2 x 2 = 12 combinations.
grids["RandomForest"] = (
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    dict(
        n_estimators=[300, 600, 900],
        max_depth=[None, 20],
        min_samples_leaf=[1, 5],
    ),
)

# Histogram-based gradient boosting: 3 x 3 x 2 = 18 combinations.
grids["HistGB"] = (
    HistGradientBoostingClassifier(random_state=RANDOM_STATE),
    dict(
        learning_rate=[0.05, 0.1, 0.2],
        max_depth=[None, 8, 12],
        max_leaf_nodes=[31, 63],
    ),
)

# Classic gradient boosting: 2 x 2 x 2 = 8 combinations.
grids["GB"] = (
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    dict(
        n_estimators=[200, 400],
        learning_rate=[0.05, 0.1],
        max_depth=[3, 5],
    ),
)
