In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool, cv
import matplotlib.pyplot as plt

# Load datasets
air_quality_df = pd.read_csv("data/datathon24_parte1/air_quality_gijon.csv")
meteo_df = pd.read_csv("data/datathon24_parte1/meteo_gijon.csv")
mobility_df = pd.read_csv("data/datathon24_parte1/movility_gijon.csv")
prediction_df = pd.read_csv("data/datathon24_parte2/prediction.csv")

# Merge datasets with the selected columns from air_quality_df
selected_columns = ["date", "SO2_GijonGlobal", "NO_GijonGlobal", "NO2_GijonGlobal", "CO_GijonGlobal", "PM10_GijonGlobal", "O3_GijonGlobal"]
filtered_air_quality_df = air_quality_df[selected_columns]
merged_df = filtered_air_quality_df.merge(meteo_df, on="date", how='outer').merge(mobility_df, on="date", how='outer').merge(prediction_df, on="date", how='outer')

# Convert 'date' to datetime, sort, and handle missing values
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df.sort_values('date', inplace=True)
merged_df.ffill(inplace=True)
merged_df.bfill(inplace=True)

# Normalize features and reintroduce cyclical features before normalization
scaler = StandardScaler()
exclude_cols = ["date", "ICA_PM25_target"]
features_to_scale = [col for col in merged_df.columns if col not in exclude_cols]
merged_df[features_to_scale] = scaler.fit_transform(merged_df[features_to_scale])

# Preparing x_final_prediction for December 2023
x_final_prediction_with_date = merged_df[
    (merged_df["date"] >= "2023-12-01") & (merged_df["date"] <= "2023-12-31")
]

# Ensure features are prepared similarly to the training data
x_final_prediction = x_final_prediction_with_date.drop(columns=['date', 'ICA_PM25_target'])

# Apply the same scaler transformation to x_final_prediction
x_final_prediction[features_to_scale] = scaler.transform(x_final_prediction[features_to_scale])

In [7]:
for lag in [1, 2, 3]:
    merged_df[f"PM10_GijonGlobal_lag{lag}"] = merged_df["PM10_GijonGlobal"].shift(
        periods=lag
    )
merged_df.fillna(method="bfill", inplace=True)  # Handling any introduced NaNs

# Splitting the dataset
X = merged_df.drop(["date", "ICA_PM25_target"], axis=1)
y = merged_df["ICA_PM25_target"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Hyperparameter tuning using CatBoost's CV
params = {
    "iterations": 1000,
    "learning_rate": 0.1,
    "depth": 6,
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "early_stopping_rounds": 100,
    "verbose": 200,
}
cv_dataset = Pool(data=X_train, label=y_train)
cv_results = cv(pool=cv_dataset, params=params, fold_count=5, plot=True)

# Training CatBoostClassifier with the best parameters
model = CatBoostClassifier(**params)
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, plot=True)

  df.fillna(method='bfill', inplace=True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CatBoostError: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/json_helper.h:41: Can't parse parameter "learning_rate" with value: [0.01,0.05,0.1]

In [None]:
# Preparing for predictions for December 2023
x_final = merged_df[merged_df['date'] >= '2023-12-01'].drop(['date', 'ICA_PM25_target'], axis=1)
x_final_scaled = scaler.transform(x_final)  # Apply the same scaling
final_predictions = model.predict(x_final_scaled)

# Saving and printing predictions
predictions_string = ','.join(map(str, final_predictions))
print(predictions_string)

# Saving the predictions with date
x_final_prediction_with_date = merged_df[merged_df['date'] >= '2023-12-01'][['date']].copy()
x_final_prediction_with_date['final_predictions'] = final_predictions
x_final_prediction_with_date.to_csv('final_predictions_december_2023.csv', index=False)

0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
