In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

In [None]:
train_df = pd.read_csv('./Dataset/training_data_v10.csv', encoding="latin1")
test_df = pd.read_csv('./Dataset/test_data_v10.csv', encoding="latin1")

In [None]:
# Remover colunas que não são features
cols_to_drop = ['AVERAGE_SPEED_DIFF', 'AVERAGE_PRECIPITATION','record_date', 'city_name']
X = train_df.drop(columns=[c for c in cols_to_drop if c in train_df.columns])
y = train_df['AVERAGE_SPEED_DIFF']

In [None]:


X_submission = test_df.drop(columns=[c for c in cols_to_drop if c in test_df.columns])
# Garantir a mesma ordem das colunas
X_submission = X_submission[X.columns]

In [12]:
# ==============================================================================
# TAREFA 2: Train Test Split
# ==============================================================================
# Split de 25% para validação, com seed 2023 e estratificação
# Como os dados já estão normalizados, usamos o X diretamente
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.25, 
    random_state=2023, 
    stratify=y
)

In [13]:
# ==============================================================================
# TAREFA 2: Modelação com GridSearchCV (LightGBM)
# ==============================================================================
lgbm = LGBMClassifier(random_state=2023, verbose=-1)

# Grid de parâmetros otimizado para LightGBM
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50],       # Controla a complexidade da árvore
    'max_depth': [-1, 10, 20],    # -1 significa sem limite
    'subsample': [0.8, 1.0],      # Fração de dados usada por árvore
    'colsample_bytree': [0.8, 1.0] # Fração de features usada por árvore
}

print("A iniciar GridSearchCV para LightGBM...")
grid_searchLGBM = GridSearchCV(lgbm, param_grid, cv=5, refit=True, verbose=2, n_jobs=-1)
grid_searchLGBM.fit(X_train, y_train)

# Melhor modelo
lgbm_best = grid_searchLGBM.best_estimator_
print("\nMelhor estimador:", lgbm_best)

A iniciar GridSearchCV para LightGBM...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Melhor estimador: LGBMClassifier(colsample_bytree=0.8, learning_rate=0.01, n_estimators=300,
               num_leaves=50, random_state=2023, subsample=0.8, verbose=-1)


In [14]:
# ==============================================================================
# TAREFA 2: Avaliação
# ==============================================================================
grid_predictionLGBM = lgbm_best.predict(X_test)

print("\nValidation Accuracy: {:.4f}".format(accuracy_score(y_test, grid_predictionLGBM)))
print("-" * 30)
print("Classification Report:\n")
target_names = ['None', 'Low', 'Medium', 'High', 'Very_High']
print(classification_report(y_test, grid_predictionLGBM, target_names=target_names))

# ==============================================================================
# TAREFA 2: Previsão Final e Submissão
# ==============================================================================
print("\nA gerar previsões para submissão...")

# Previsão direta nos dados de submissão (que já devem estar normalizados)
predictionLGBM = lgbm_best.predict(X_submission)

# Mapeamento inverso
target_map_reverse = {
    0: 'None', 
    1: 'Low', 
    2: 'Medium', 
    3: 'High', 
    4: 'Very_High'
}

prediction_text = [target_map_reverse.get(p, 'None') for p in predictionLGBM]

submission = pd.DataFrame({
    'RowId': range(1, len(prediction_text) + 1),
    'Speed_Diff': prediction_text
})

submission.to_csv('submission_lightgbm_v10.csv', index=False)
print("Ficheiro criado: submission_lightgbm_v10.csv")
print(submission.head())


Validation Accuracy: 0.7980
------------------------------
Classification Report:

              precision    recall  f1-score   support

        None       0.89      0.91      0.90       550
         Low       0.69      0.69      0.69       355
      Medium       0.76      0.77      0.77       413
        High       0.77      0.76      0.76       265
   Very_High       0.89      0.77      0.83       120

    accuracy                           0.80      1703
   macro avg       0.80      0.78      0.79      1703
weighted avg       0.80      0.80      0.80      1703


A gerar previsões para submissão...
Ficheiro criado: submission_lightgbm_v10.csv
   RowId Speed_Diff
0      1       None
1      2        Low
2      3       None
3      4       High
4      5        Low
