In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm  # Import tqdm for progress bars (notebook version)

# Carregar os Datasets:
df_present = pd.read_csv('/Users/luryan/Documents/to_bix/Dataset/air_system_present_year.csv')
df_previous = pd.read_csv('/Users/luryan/Documents/to_bix/Dataset/air_system_previous_years.csv')

In [41]:
# Converter 'class' para Binário:
df_previous['class_binary'] = df_previous['class'].apply(lambda x: 1 if x == 'pos' else 0)
df_present['class_binary'] = df_present['class'].apply(lambda x: 1 if x == 'pos' else 0)

# Substituir "na" por 0:
df_previous = df_previous.replace('na', 0)
df_present = df_present.replace('na', 0)

# Selecionar Features:
selected_columns = ['bx_000', 'bv_000', 'ci_000', 'bu_000', 'ba_000', 'az_005', 'bb_000', 'cc_000']

# Preparar os Dados para Treinamento:
X = df_previous[selected_columns]
y = df_previous['class_binary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
# Aplicar SMOTE para Balancear as Classes:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Padronizar os Dados:
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Correção: Definir y_present e aplicar o scaler em X_present:
X_present = df_present[selected_columns]
y_present = df_present['class_binary']
X_present = scaler.transform(X_present)

In [44]:
# Definir o Espaço de Hiperparâmetros para Grid Search:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
}

# Criar e Treinar o Modelo (Random Forest) com Grid Search:
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')

# Initialize tqdm to track GridSearchCV progress
with tqdm(total=len(param_grid['n_estimators']) * len(param_grid['max_depth']) * len(param_grid['min_samples_split'])) as pbar:
    grid_search.fit(X_train_resampled, y_train_resampled)
    pbar.update()

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [None]:
# Obter o Melhor Modelo e Fazer Previsões:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_present = best_model.predict(X_present)

# Avaliar o Modelo:
print("Métricas no Conjunto de Teste:")
print(f"Acurácia: {accuracy_score(y_test, y_pred)}")
print(f"Precisão: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1-Score: {f1_score(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_pred)}")

print("\nMétricas no Conjunto 'Present Year':")
print(f"Acurácia: {accuracy_score(y_present, y_pred_present)}")
print(f"Precisão: {precision_score(y_present, y_pred_present)}")
print(f"Recall: {recall_score(y_present, y_pred_present)}")
print(f"F1-Score: {f1_score(y_present, y_pred_present)}")
print(f"AUC: {roc_auc_score(y_present, y_pred_present)}")

# Simular Custos de Manutenção:
custo_total_real = df_present['class_binary'].sum() * 500 
custo_total_simulado = 0
for i in tqdm(range(len(df_present)), desc='Calculando Custos'):
    if y_pred_present[i] == 1 and y_present[i] == 1:
        custo_total_simulado += 25
    elif y_pred_present[i] == 1 and y_present[i] == 0: 
        custo_total_simulado += 10
    elif y_pred_present[i] == 0 and y_present[i] == 1:
        custo_total_simulado += 500

print("\nCusto Total Real: $", custo_total_real)
print("Custo Total Simulado com o Modelo: $", custo_total_simulado)
print("Economia Potencial: $", custo_total_real - custo_total_simulado)