In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool, cv
import matplotlib.pyplot as plt

# Load datasets
air_quality_df = pd.read_csv("data/datathon24_parte1/air_quality_gijon.csv")
meteo_df = pd.read_csv("data/datathon24_parte1/meteo_gijon.csv")
mobility_df = pd.read_csv("data/datathon24_parte1/movility_gijon.csv")
prediction_df = pd.read_csv("data/datathon24_parte2/prediction.csv")

# Merge datasets with the selected columns from air_quality_df
selected_columns = ["date", "SO2_GijonGlobal", "NO_GijonGlobal", "NO2_GijonGlobal", "CO_GijonGlobal", "PM10_GijonGlobal", "O3_GijonGlobal"]
filtered_air_quality_df = air_quality_df[selected_columns]
merged_df = filtered_air_quality_df.merge(meteo_df, on="date", how='outer').merge(mobility_df, on="date", how='outer').merge(prediction_df, on="date", how='outer')

# Convert 'date' to datetime, sort, and handle missing values
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df.sort_values('date', inplace=True)
merged_df.ffill(inplace=True)
merged_df.bfill(inplace=True)

# Normalize features and reintroduce cyclical features before normalization
scaler = StandardScaler()
exclude_cols = ["date", "ICA_PM25_target"]
features_to_scale = [col for col in merged_df.columns if col not in exclude_cols]
merged_df[features_to_scale] = scaler.fit_transform(merged_df[features_to_scale])

# Preparing x_final_prediction for December 2023
x_final_prediction_with_date = merged_df[
    (merged_df["date"] >= "2023-12-01") & (merged_df["date"] <= "2023-12-31")
]

# Ensure features are prepared similarly to the training data
x_final_prediction = x_final_prediction_with_date.drop(columns=['date', 'ICA_PM25_target'])

# Apply the same scaler transformation to x_final_prediction
x_final_prediction[features_to_scale] = scaler.transform(x_final_prediction[features_to_scale])

In [41]:
# Splitting the dataset into features and target
X = merged_df.drop(["date", "ICA_PM25_target"], axis=1)
y = merged_df["ICA_PM25_target"]

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.1, random_state=42, stratify=y
)

# Initialize CatBoostClassifier
model = CatBoostClassifier(
    iterations=1000, learning_rate=0.1, depth=6, eval_metric="Accuracy", verbose=200
)

# Train the CatBoostClassifier
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8258362	test: 0.7826087	best: 0.7826087 (0)	total: 8.48ms	remaining: 8.47s
200:	learn: 0.9930796	test: 0.8586957	best: 0.8913043 (18)	total: 1.09s	remaining: 4.32s
400:	learn: 1.0000000	test: 0.8586957	best: 0.8913043 (18)	total: 2.19s	remaining: 3.27s
600:	learn: 1.0000000	test: 0.8478261	best: 0.8913043 (18)	total: 3.21s	remaining: 2.13s
800:	learn: 1.0000000	test: 0.8478261	best: 0.8913043 (18)	total: 4.3s	remaining: 1.07s
999:	learn: 1.0000000	test: 0.8478261	best: 0.8913043 (18)	total: 5.28s	remaining: 0us

bestTest = 0.8913043478
bestIteration = 18

Shrink model to first 19 iterations.


<catboost.core.CatBoostClassifier at 0x2561e1e91d0>

In [40]:
# Prepare the dataset for December 2023 predictions
x_final_prediction = merged_df[merged_df['date'] >= '2023-12-01']
x_final_prediction_scaled = scaler.transform(x_final_prediction.drop(['date', 'ICA_PM25_target'], axis=1))

# Making predictions for December 2023
final_predictions = model.predict(x_final_prediction_scaled)

# Optionally, convert final predictions to a comma-separated string
predictions_string = ','.join(map(str, final_predictions))
print(predictions_string)

# Saving the predictions
x_final_prediction['final_predictions'] = final_predictions
x_final_prediction[['date', 'final_predictions']].to_csv('final_predictions_december_2023.csv', index=False)

0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_final_prediction['final_predictions'] = final_predictions
