In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

In [2]:
# 1. Carregar os dados
train_v9 = pd.read_csv('./Dataset/training_data_v11.csv', encoding="latin1")
test_v9 = pd.read_csv('./Dataset/test_data_v11.csv', encoding="latin1")

# ==============================================================================
# 1. PREPARA√á√ÉO DOS DADOS
# ==============================================================================
print("üîß A preparar os dados...")

cols_to_drop = ['AVERAGE_SPEED_DIFF', 'record_date', 'city_name']

# Prepara X e y do conjunto de TREINO
X = train_v9.drop(columns=[c for c in cols_to_drop if c in train_v9.columns])
y = train_v9['AVERAGE_SPEED_DIFF']

# Prepara X do conjunto de SUBMISS√ÉO
X_kaggle = test_v9.drop(columns=[c for c in cols_to_drop if c in test_v9.columns])
# Garante a mesma ordem de colunas
X_kaggle = X_kaggle[X.columns] 

# ==============================================================================
# TAREFA 2: Normaliza√ß√£o (MinMaxScaler)
# ==============================================================================
# A Tarefa 2 usa MinMaxScaler(feature_range=(0,1)) em vez de StandardScaler
scaler_X = MinMaxScaler(feature_range=(0,1)).fit(X)

# Transformar treino e teste (mantendo float64 original)
X_normalize = pd.DataFrame(scaler_X.transform(X), columns=X.columns)
X_kaggle_normalize = pd.DataFrame(scaler_X.transform(X_kaggle), columns=X_kaggle.columns)

# ==============================================================================
# TAREFA 2: Train Test Split
# ==============================================================================
# O notebook usa test_size=0.25 e random_state=2023 com stratify
X_train, X_test, y_train, y_test = train_test_split(
    X_normalize, y, 
    test_size=0.25, 
    random_state=2023, 
    stratify=y
)

# ==============================================================================
# TAREFA 2: Modela√ß√£o com GridSearchCV
# ==============================================================================
svc = SVC(random_state=2023)

# Par√¢metros t√≠picos encontrados no notebook da Tarefa 2 para SVM
# Podes ajustar se quiseres testar menos op√ß√µes para ser mais r√°pido
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'] # ou ['rbf', 'sigmoid']
}

print("A iniciar GridSearchCV para SVM...")
# verbose=3 para veres o progresso, n_jobs=-1 para usar todos os cores
grid_searchSVM = GridSearchCV(svc, param_grid, cv=5, refit=True, verbose=3, n_jobs=-1)
grid_searchSVM.fit(X_train, y_train)

# Melhor modelo
svm_best = grid_searchSVM.best_estimator_
print("\nMelhor estimador:", svm_best)

# ==============================================================================
# TAREFA 2: Avalia√ß√£o
# ==============================================================================
grid_predictionSVM = svm_best.predict(X_test)

print("\nValidation Accuracy: {:.4f}".format(accuracy_score(y_test, grid_predictionSVM)))
print("-" * 30)
print("Classification Report:\n")
# Mapeamento para nomes se o teu y for num√©rico (0,1,2,3,4)
target_names = ['None', 'Low', 'Medium', 'High', 'Very_High']
print(classification_report(y_test, grid_predictionSVM, target_names=target_names))

# Matriz de Confus√£o (Opcional)
# ConfusionMatrixDisplay.from_predictions(y_test, grid_predictionSVM)
# plt.show()

# ==============================================================================
# TAREFA 2: Previs√£o Final e Submiss√£o
# ==============================================================================
print("\nüöÄ A gerar previs√µes para submiss√£o...")

# Usar o melhor modelo do Grid para prever o dataset do Kaggle
predictionSVM = svm_best.predict(X_kaggle_normalize)

# Converter n√∫meros para texto (Reverse Mapping)
reverse_map = {
    0: 'None', 
    1: 'Low', 
    2: 'Medium', 
    3: 'High', 
    4: 'Very_High'
}

# Se a previs√£o vier em n√∫meros, mapeia. Se j√° vier em texto, remove esta linha.
# Assumindo que o treino foi feito com 0,1,2,3,4:
prediction_text = [reverse_map.get(p, 'None') for p in predictionSVM]

submission = pd.DataFrame({
    'RowId': range(1, len(prediction_text) + 1),
    'AVERAGE_SPEED_DIFF': prediction_text
})

submission.to_csv('submission_svm_task2_style.csv', index=False)
print("üíæ Ficheiro 'submission_svm_task2_style.csv' guardado com sucesso!")

üîß A preparar os dados...
A iniciar GridSearchCV para SVM...
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Melhor estimador: SVC(C=100, gamma=0.1, random_state=2023)

Validation Accuracy: 0.7692
------------------------------
Classification Report:

              precision    recall  f1-score   support

        None       0.81      0.94      0.87       550
         Low       0.69      0.55      0.61       355
      Medium       0.74      0.77      0.76       413
        High       0.76      0.72      0.74       265
   Very_High       0.91      0.73      0.81       120

    accuracy                           0.77      1703
   macro avg       0.78      0.74      0.76      1703
weighted avg       0.77      0.77      0.76      1703


üöÄ A gerar previs√µes para submiss√£o...
üíæ Ficheiro 'submission_svm_task2_style.csv' guardado com sucesso!
