In [1]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), '..'))
from src.data.data_loader import load_raw_data, basic_data_info, save_processed_data
from sklearn.preprocessing import OrdinalEncoder


# Set plotting style
plt.style.use('default')
sns.set_palette("viridis")
plt.rcParams['figure.figsize'] = (12, 8)

## üìä Load Data and Prepare Features

In [2]:
# Load the cleaned dataset
try:
    df_clean = pd.read_csv('../../data/processed/cleaned_data.csv')
    print(f"üìä Loaded cleaned dataset: {df_clean.shape}")
except FileNotFoundError:
    print("‚ùå Cleaned dataset not found. Please run previous notebooks first.")
    raise

# Re-categorize features
cat_cols = []
num_cols = []

for col in df_clean.columns:
    if any(prefix in col for prefix in ['cd_', 'zip_', 'year_', 'flg_', 'type', 'city', 'state']):
        cat_cols.append(col)
    else:
        num_cols.append(col)

# Add priority, n_doors, and n_photos to categorical if they exist
for col in ['priority', 'n_doors', 'n_photos']:
    if col in df_clean.columns and col not in cat_cols:
        cat_cols.append(col)
        if col in num_cols:
            num_cols.remove(col)

# Removing Target Variable from features
cat_cols.remove('flg_leads') if 'flg_leads' in cat_cols else None
num_cols.remove('leads') if 'leads' in num_cols else None
target_cols = ['flg_leads', 'leads']
print(f"üìà Numerical features: {len(num_cols)}")
print(f"üè∑Ô∏è Categorical features: {len(cat_cols)}")
print(f"üéØ Target variable: {target_cols}")


üìä Loaded cleaned dataset: (48578, 48)
üìà Numerical features: 5
üè∑Ô∏è Categorical features: 41
üéØ Target variable: ['flg_leads', 'leads']


## Drop Irrelevant Features 

In [3]:
# We will drop this features based on bivariate analysis and domain knowledge, why they have more than 80% of the dominant class.
#  Also the ones that are id features

feat_to_drop = [
    "cd_type_individual",
    "cd_advertise",
    "cd_client",
    "flg_rain_sensor",
    "flg_diesel",
    "flg_eletrico",
    "flg_benzina",
    "flg_pcd",
    "flg_trade_in",
    "flg_armored",
    "flg_factory_warranty",
    "flg_all_dealership_schedule_vehicle",
    "flg_all_dealership_services",
    "flg_single_owner",
    "priority",
]

# Drop irrelevant features
df_clean.drop(columns=feat_to_drop, inplace=True, errors="ignore")

In [4]:
df_clean

Unnamed: 0,leads,views,phone_clicks,cd_vehicle_brand,cd_model_vehicle,cd_version_vehicle,year_model,zip_2dig,vl_advertise,n_photos,...,flg_electric_locks,flg_electric_windows,flg_alloy_wheels,flg_parking_sensor,city,state,flg_leads,flg_alcool,flg_gasolina,flg_gas_natural
0,1,0,0,34,769,346932,2018,75,110990.00,7,...,0,0,0,0,Itumbiara,GO,1,0,0,0
1,1,0,4,2,704,340148,1996,6,8300.00,0,...,1,1,0,0,Osasco,SP,1,1,0,0
2,4,0,11,30,1071,310713,2002,2,38800.00,4,...,1,1,1,0,S√£o Paulo,SP,1,0,1,0
3,12,0,14,10,2028,341195,1995,4,44000.00,8,...,1,1,1,0,S√£o Paulo,SP,1,0,1,0
4,8,0,11,12,805,332063,1995,5,30000.00,6,...,1,1,1,0,S√£o Paulo,SP,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48573,1,34184,0,280,3365,346359,2016,17,48000.00,8,...,1,1,1,1,Ja√∫,SP,1,0,1,0
48574,1,45048,0,4,1177,343882,2013,25,42999.99,1,...,0,0,0,0,Duque de Caxias,RJ,1,1,1,0
48575,1,71020,0,5,3128,344252,2012,13,93900.00,0,...,0,0,0,0,Jundia√≠,SP,1,0,0,0
48576,0,82940,0,2,3323,344421,2014,13,53800.00,0,...,1,1,1,0,Jundia√≠,SP,0,1,1,0


## Clustering flag features

In [5]:
from src.features.feature_engineering import FeatureEngineering, FlagClusteringTransformer

# Initialize FeatureEngineering class
fe = FeatureEngineering()

df_cluster = fe.create_flag_clustering_features(
    data=df_clean,
    target_col="leads",
    flag_cols=[
        "flg_gasolina",
        "flg_electric_locks",
        "flg_air_conditioning",
        "flg_electric_windows",
        "flg_rear_defogger",
        "flg_heater",
        "flg_alarm",
        "flg_airbag",
        "flg_abs",
    ],
)



TypeError: FeatureEngineering.__init__() missing 2 required positional arguments: 'data' and 'target_col'

In [None]:
fe_ = FlagClusteringTransformer(feature_flag_cols = [
        "flg_gasolina",
        "flg_electric_locks",
        "flg_air_conditioning",
        "flg_electric_windows",
        "flg_rear_defogger",
        "flg_heater",
        "flg_alarm",
        "flg_airbag",
        "flg_abs",
    ])

fe_.fit_transform(df_clean)


## Normalize numerical features

In [6]:
num_cols

['views', 'phone_clicks', 'vl_advertise', 'km_vehicle', 'vl_market']

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from lightgbm import LGBMRegressor

X = df_clean[num_cols].fillna(0)
y = df_clean['leads']

scalers = {
    'standard': StandardScaler(),
    'minmax': MinMaxScaler(),
    'robust': RobustScaler(),
    'power_yeojohnson': PowerTransformer(method='yeo-johnson')  
}

reg = Ridge(alpha=1.0)  
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, scaler in scalers.items():
    pipe = Pipeline([('scaler', scaler), ('model', reg)])
    # Choose ONE metric at a time or compute several:
    rmse_scores = cross_val_score(pipe, X, y, cv=cv,
                                  scoring='neg_root_mean_squared_error')
    mae_scores  = cross_val_score(pipe, X, y, cv=cv,
                                  scoring='neg_mean_absolute_error')
    r2_scores   = cross_val_score(pipe, X, y, cv=cv, scoring='r2')

    results.append({
        'scaler': name,
        'RMSE': -rmse_scores.mean(),
        'MAE': -mae_scores.mean(),
        'R2':  r2_scores.mean()
    })

pd.DataFrame(results).sort_values('RMSE')


## Encoder for cities and state

In [None]:
df_clean.columns

In [None]:
df_encoder = df_clean[["city", "state", "n_doors", "n_photos", "year_model"]].copy()

df_encoder.describe(include="all")

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


encoder = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    min_frequency=1000,
    sparse_output=False,   # <- ou sparse=False se sua vers√£o for < 1.2
)

encoded = encoder.fit_transform(df_clean[['city', 'state']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['city', 'state']))

df_final = pd.concat([df_clean[['city','state']].reset_index(drop=True), encoded_df], axis=1)

In [None]:
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold


te = TargetEncoder(cols=['city'], smoothing=0.5,min_samples_leaf=1000)  
y = df_clean['leads']
X_tr_te = te.fit_transform(df_clean[['city']], y)
# # Valida√ß√£o cruzada segura (encoder √© refit a cada fold dentro da pipeline)
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)

In [None]:
te = TargetEncoder(cols=['state'], smoothing=5,min_samples_leaf=500)  # smoothing controla o ‚Äúpull‚Äù √† m√©dia global
# fit_transform OOF com cross_val_predict OU usando pipelines com CV.
y = df_clean['leads']
X_tr_te = te.fit_transform(df_clean[['state']], y)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def analisar_overfit_risco(X, y, col):
    df_temp = pd.DataFrame({col: X[col], 'target': y})
    global_mean = y.mean()

    stats = df_temp.groupby(col)['target'].agg(['mean', 'count']).reset_index()
    stats['desvio'] = (stats['mean'] - global_mean).abs()

    # Crit√©rios de risco (ajust√°veis)
    limite_count = 50
    limite_desvio = 0.2 * global_mean if global_mean > 0 else 0.2
    stats['risco'] = np.where(
        (stats['count'] <= limite_count) & (stats['desvio'] >= limite_desvio),
        'Alto',
        'Baixo'
    )

    # Plot
    plt.figure(figsize=(8,5))
    cores = stats['risco'].map({'Alto': 'red', 'Baixo': 'blue'})
    plt.scatter(stats['count'], stats['mean'], c=cores, alpha=0.7, edgecolor='k')
    plt.axhline(global_mean, color='gray', linestyle='--', label='M√©dia Global')
    plt.xscale('log')
    plt.xlabel('N√∫mero de ocorr√™ncias (log)')
    plt.ylabel('M√©dia codificada')
    plt.title(f'Risco de Overfit - {col}')
    plt.legend()
    plt.show()

    return stats.sort_values(by='risco', ascending=False)

# Uso:
# stats_city = analisar_overfit_risco(X_train, y_train, 'city')
# stats_state = analisar_overfit_risco(X_train, y_train, 'state')
# stats_city = analisar_overfit_risco(X_tr_te, y, 'city')
stats_state = analisar_overfit_risco(X_tr_te, y, 'state')
