In [None]:
# Step 1: Mount Google Drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Step 2: Change directory to Google Drive

%cd /content/drive/MyDrive/Fall24/Deep_Learning/NitroSolveProject/Data

/content/drive/MyDrive/Fall24/Deep_Learning/NitroSolveProject/Data


In [None]:
# Step 3: Access and work with the dataset
import os
data_dir = "/content/drive/MyDrive/Fall24/Deep_Learning/NitroSolveProject/Data"
files = os.listdir(data_dir)
print(files)

['Test.csv', 'Train.csv', 'Test_cleaned.csv', 'Train_cleaned.csv', 'submission_xgboost_supervised.csv', 'submission_xgboost_supervised1.csv', 'submission_cnn.csv', 'submission_cnn_full.csv']


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import numpy as np

def PreProcessing(train_data, test_data):
    #columns_to_drop = ['ID_Zindi', 'NO2_trop',"LAT","LON"]
    columns_to_drop = ['NO2_trop',"LAT","LON"]
    train_data = train_data.drop(columns=columns_to_drop)
    test_data = test_data.drop(columns=columns_to_drop)

    train_data = train_data.dropna(subset=['GT_NO2'])
    train_data = train_data.dropna(thresh=len(train_data.columns) - 3)
    test_data = test_data.dropna(thresh=len(train_data.columns) - 3)

    train_data['Date'] = pd.to_datetime(train_data['Date'])
    test_data['Date'] = pd.to_datetime(test_data['Date'])

    daily_lst_filled = LST_predict(train_data)
    train_data['LST'] = train_data.apply(lambda row: daily_lst_filled.get(row['Date'], row['LST']) if pd.isnull(row['LST']) else row['LST'], axis=1)
    test_data['LST'] = test_data.apply(lambda row: daily_lst_filled.get(row['Date'], row['LST']) if pd.isnull(row['LST']) else row['LST'], axis=1)

    train_data = train_data.drop_duplicates()
    test_data = test_data.drop_duplicates()

    _, train_data, test_data = scale(train_data, test_data)

    return train_data, test_data


def scale(train_data,test_data):
    numeric_columns = ["Precipitation","LST","AAI","CloudFraction","NO2_strat","NO2_total","TropopausePressure"]
    scaler = MinMaxScaler(feature_range=(-1,1))

    scaler = scaler.fit(train_data[numeric_columns])
    train_data[numeric_columns] = scaler.transform(train_data[numeric_columns])
    test_data[numeric_columns] = scaler.transform(test_data[numeric_columns])

    return scaler,train_data,test_data

def LST_predict(train_data):
    lst_data = train_data.dropna(subset=['LST'])
    daily_lst = lst_data.groupby('Date')['LST'].mean()

    daily_lst = daily_lst.asfreq('D')
    daily_lst_interpolated = daily_lst.interpolate()

    arima_model = ARIMA(daily_lst_interpolated, order=(1, 1, 1))
    arima_fit = arima_model.fit()

    daily_lst_filled = daily_lst.copy()
    daily_lst_filled[daily_lst.isnull()] = arima_fit.predict()[daily_lst.isnull()]

    daily_lst_filled = daily_lst_filled.to_dict()

    return daily_lst_filled


# main
#file_path_Train = 'GEO_AI\Train.csv'
#file_path_Test = 'GEO_AI\Test.csv'
#train_data = pd.read_csv(file_path_Train)
#test_data = pd.read_csv(file_path_Test)

train_data = pd.read_csv(data_dir + "/Train.csv")
test_data = pd.read_csv(data_dir + '/Test.csv')

train_data_cleaned, test_data_cleaned = PreProcessing(train_data, test_data)

print(train_data_cleaned.head())
print(test_data_cleaned.head())

train_data_cleaned.to_csv("Train_cleaned.csv")
test_data_cleaned.to_csv("Test_cleaned.csv")

  train_data['Date'] = pd.to_datetime(train_data['Date'])
  test_data['Date'] = pd.to_datetime(test_data['Date'])
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


    ID_Zindi       Date    ID  Precipitation       LST       AAI  \
0  ID_ENTGC7 2019-01-01  PD01      -1.000000 -0.263243  0.478834   
1  ID_8JCCXC 2019-01-01  PD04      -0.934170 -0.263243  0.395847   
2  ID_V3136Z 2019-01-01  RO01      -1.000000 -0.263243  0.422682   
3  ID_KRVZDJ 2019-01-01  RO02      -0.974067 -0.263243  0.413168   
4  ID_PR351A 2019-01-01  RO03      -0.972466 -0.263243  0.368004   

   CloudFraction  NO2_strat  NO2_total  TropopausePressure  GT_NO2  
0       0.118234  -0.649832  -0.874815           -0.264086    31.0  
1       0.738617  -0.646465  -0.865103           -0.263963    42.0  
2       0.348321  -0.643098  -0.904727           -0.264520    31.0  
3       0.840107  -0.639731  -0.868017           -0.264084    30.0  
4       0.494927  -0.639731  -0.875786           -0.264343    58.0  
    ID_Zindi       Date     ID  Precipitation       LST       AAI  \
0  ID_2MYNQS 2019-01-01   PD03      -0.929197 -0.263243  0.330621   
1  ID_P4U5WU 2019-01-01   TV03      -1.

In [None]:

import pandas as pd

# Load raw train and test data
raw_train_data = pd.read_csv("Train.csv")
raw_test_data = pd.read_csv("Test.csv")

# Combine all IDs from raw train and test data
all_ids = pd.concat([raw_train_data["ID_Zindi"], raw_test_data["ID_Zindi"]], ignore_index=True)

# Save all IDs to a file for later use
all_ids.to_csv("all_ids.csv", index=False, header=["ID_Zindi"])
print(f"All IDs from Train and Test captured and saved to 'all_ids.csv'.")


All IDs from Train and Test captured and saved to 'all_ids.csv'.


In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load cleaned data
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Convert 'Date' to datetime and create date-based features
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract date features
for df in [train_data, test_data]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

# Drop 'Date' column as it's now redundant
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

# Preserve 'ID_Zindi' column for submission purposes
test_IDs = test_data["ID_Zindi"]

# Add placeholder 'GT_NO2' column in test_data for concatenation
test_data['GT_NO2'] = None

# Concatenate train and test data for consistent lagged/rolling feature calculation
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Generate lagged features
for lag in [1, 7, 30]:
    combined_data[f'GT_NO2_lag_{lag}'] = combined_data['GT_NO2'].shift(lag)

# Generate rolling features directly on the column
combined_data['GT_NO2_roll_mean_7'] = combined_data['GT_NO2'].rolling(window=7).mean()
combined_data['GT_NO2_roll_mean_30'] = combined_data['GT_NO2'].rolling(window=30).mean()

# Separate the data back into training and test sets
train_data = combined_data.iloc[:len(train_data)].dropna(subset=["GT_NO2"])  # Drop rows with NaN in target
test_data = combined_data.iloc[len(train_data):].reset_index(drop=True)  # Reset index for clean test data

# Drop ID columns in the feature set for training and testing
train_data = train_data.drop(columns=["ID_Zindi", "ID"])
test_data = test_data.drop(columns=["ID_Zindi", "ID"])

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2"])  # Drop target from features
y = train_data["GT_NO2"]

# Ensure test data has consistent columns with training features
X_test = test_data[X.columns]

# Split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Validate model
y_pred = xgb_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"XGBoost Validation RMSE: {rmse}")

# Predict on test data
test_predictions = xgb_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    "ID_Zindi": test_IDs,
    "GT_NO2": test_predictions
})

submission.to_csv("submission_xgboost_supervised.csv", index=False)
print("XGBoost supervised learning submission file created successfully!")


  combined_data = pd.concat([train_data, test_data], ignore_index=True)


XGBoost Validation RMSE: 9.005203930500716
XGBoost supervised learning submission file created successfully!


In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load cleaned data
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Convert 'Date' to datetime and create date-based features
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract date features
for df in [train_data, test_data]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

# Drop 'Date' column as it's now redundant
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

# Preserve 'ID_Zindi' column for submission purposes
test_IDs = test_data["ID_Zindi"]

# Add placeholder 'GT_NO2' column in test_data for concatenation
test_data['GT_NO2'] = None

# Concatenate train and test data for consistent lagged/rolling feature calculation
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Generate lagged features for numeric columns (specifically 'GT_NO2')
for lag in [1, 7, 30]:
    combined_data[f'GT_NO2_lag_{lag}'] = combined_data['GT_NO2'].shift(lag)

# Generate rolling mean features directly on numeric data
combined_data['GT_NO2_roll_mean_7'] = combined_data['GT_NO2'].rolling(window=7).mean()
combined_data['GT_NO2_roll_mean_30'] = combined_data['GT_NO2'].rolling(window=30).mean()

# Separate the data back into training and test sets without dropping any rows in test data
train_data = combined_data.iloc[:len(train_data)].dropna(subset=["GT_NO2"])  # Drop rows with NaN in target in train data
test_data = combined_data.iloc[len(train_data):].reset_index(drop=True)  # Retain all rows in test data

# Fill missing values in numeric columns in test data only
numeric_cols = test_data.select_dtypes(include=[float, int]).columns
test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())

# Drop ID columns in the feature set for training and testing
train_data = train_data.drop(columns=["ID_Zindi", "ID"])
test_data = test_data.drop(columns=["ID_Zindi", "ID"])

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2"])  # Drop target from features
y = train_data["GT_NO2"]

# Ensure test data has consistent columns with training features
X_test = test_data[X.columns]

# Split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Validate model
y_pred = xgb_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"XGBoost Validation RMSE: {rmse}")

# Predict on test data
test_predictions = xgb_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    "ID_Zindi": test_IDs,
    "GT_NO2": test_predictions
})

submission.to_csv("submission_xgboost_supervised1.csv", index=False)
print("XGBoost supervised learning submission file created successfully!")


  combined_data = pd.concat([train_data, test_data], ignore_index=True)


XGBoost Validation RMSE: 9.005203930500716
XGBoost supervised learning submission file created successfully!


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load processed data with lagged and rolling features
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Separate 'ID_Zindi' for submission
test_IDs = test_data["ID_Zindi"]

# Drop unnecessary columns
train_data = train_data.drop(columns=["ID_Zindi", "ID"], errors='ignore')
test_data = test_data.drop(columns=["ID_Zindi", "ID"], errors='ignore')

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2"])
y = train_data["GT_NO2"]

# Select only numeric columns for scaling
numeric_features = X.select_dtypes(include=[np.number]).columns
X_numeric = X[numeric_features]
X_test_numeric = test_data[numeric_features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

# Reshape data for CNN (samples, timesteps, features)
X_reshaped = X_scaled.reshape(-1, X_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Define CNN model
cnn_model = models.Sequential([
    layers.Conv1D(64, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)),
    layers.Conv1D(32, kernel_size=3, activation="relu"),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(1)
])

# Compile model
cnn_model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Train the model
cnn_model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Evaluate on validation data
y_pred = cnn_model.predict(X_val).flatten()
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"CNN Validation RMSE: {rmse}")

# Predict on test data
test_predictions = cnn_model.predict(X_test_reshaped).flatten()

# Create submission file
submission = pd.DataFrame({
    "ID_Zindi": test_IDs,
    "GT_NO2": test_predictions
})

submission.to_csv("submission_cnn.csv", index=False)
print("CNN submission file 'submission_cnn.csv' created successfully!")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 207.5304 - root_mean_squared_error: 14.0901 - val_loss: 123.7027 - val_root_mean_squared_error: 11.1222
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 125.1338 - root_mean_squared_error: 11.1859 - val_loss: 123.0407 - val_root_mean_squared_error: 11.0924
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 122.7334 - root_mean_squared_error: 11.0753 - val_loss: 120.8861 - val_root_mean_squared_error: 10.9948
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 121.1691 - root_mean_squared_error: 11.0069 - val_loss: 122.1310 - val_root_mean_squared_error: 11.0513
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 124.7695 - root_mean_squared_error: 11.1695 - val_loss: 119.5223 - val_root_mean_squared_error: 10.9326
Epoch 6/10



[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
CNN submission file 'submission_cnn.csv' created successfully!


In [None]:
# Load all captured IDs from Step 1
all_ids = pd.read_csv("all_ids.csv")

# Load test data and predictions from the model
test_data = pd.read_csv("Test_cleaned.csv")
submission_predictions = pd.read_csv("submission_cnn.csv")  # Or use predictions directly

# Merge all IDs with model predictions
submission_full = pd.merge(all_ids, submission_predictions, on="ID_Zindi", how="left")

# Fill missing target values with a placeholder (e.g., mean of predictions)
mean_prediction = submission_full["GT_NO2"].mean()
submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)

# Save the final submission file
submission_full.to_csv("submission_final.csv", index=False)
print("Final submission file with all IDs created successfully as 'submission_final.csv'.")


Final submission file with all IDs created successfully as 'submission_final.csv'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load processed data with lagged and rolling features
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Preserve all IDs for submission purposes
test_IDs = test_data[["ID_Zindi"]]

# Drop unnecessary columns in both train and test sets
train_data = train_data.drop(columns=["ID_Zindi", "ID"], errors='ignore')
test_data = test_data.drop(columns=["ID"], errors='ignore')  # Keep ID_Zindi for submission

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2"])
y = train_data["GT_NO2"]

# Select only numeric columns for scaling, ensuring no rows are dropped
numeric_features = X.select_dtypes(include=[np.number]).columns
X_numeric = X[numeric_features]
X_test_numeric = test_data[numeric_features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

# Reshape data for CNN (samples, timesteps, features)
X_reshaped = X_scaled.reshape(-1, X_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Define CNN model
cnn_model = models.Sequential([
    layers.Conv1D(64, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)),
    layers.Conv1D(32, kernel_size=3, activation="relu"),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(1)
])

# Compile model
cnn_model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Train the model
cnn_model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Evaluate on validation data
y_pred = cnn_model.predict(X_val).flatten()
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"CNN Validation RMSE: {rmse}")

# Predict on test data
test_predictions = cnn_model.predict(X_test_reshaped).flatten()

# Ensure all original IDs are preserved in the submission file
submission = pd.DataFrame({
    "ID_Zindi": test_IDs["ID_Zindi"],
    "GT_NO2": test_predictions
})

# Save submission file
submission.to_csv("submission_cnn.csv", index=False)
print("CNN submission file 'submission_cnn.csv' created successfully!")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - loss: 203.6441 - root_mean_squared_error: 13.9931 - val_loss: 124.2527 - val_root_mean_squared_error: 11.1469
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 125.9684 - root_mean_squared_error: 11.2230 - val_loss: 121.7496 - val_root_mean_squared_error: 11.0340
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 126.1584 - root_mean_squared_error: 11.2309 - val_loss: 131.3956 - val_root_mean_squared_error: 11.4628
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 124.4785 - root_mean_squared_error: 11.1566 - val_loss: 119.7261 - val_root_mean_squared_error: 10.9419
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 121.3746 - root_mean_squared_error: 11.0162 - val_loss: 122.1663 - val_root_mean_squared_error: 11.052



[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
CNN submission file 'submission_cnn.csv' created successfully!


In [None]:
num_train_ids = train_data["ID_Zindi"].nunique() if "ID_Zindi" in train_data.columns else len(train_data)
num_test_ids = test_data["ID_Zindi"].nunique() if "ID_Zindi" in test_data.columns else len(test_data)
num_submission_ids = submission["ID_Zindi"].nunique()

# Print the counts
print(f"Number of unique IDs in Train.csv: {num_train_ids}")
print(f"Number of unique IDs in Test.csv: {num_test_ids}")
print(f"Number of unique IDs in submission file: {num_submission_ids}")

Number of unique IDs in Train.csv: 69933
Number of unique IDs in Test.csv: 5708
Number of unique IDs in submission file: 5708


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load original data with all IDs
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Preserve 'ID_Zindi' for the final submission
all_data = pd.concat([train_data, test_data], ignore_index=True)
all_IDs = all_data["ID_Zindi"]

# Drop unnecessary columns
train_data = train_data.drop(columns=["ID_Zindi", "ID"], errors='ignore')
test_data = test_data.drop(columns=["ID_Zindi", "ID"], errors='ignore')

# Separate features and target variable in the training data
X = train_data.drop(columns=["GT_NO2"])
y = train_data["GT_NO2"]

# Combine train and test data for feature scaling and model prediction
X_all = all_data.drop(columns=["GT_NO2", "ID_Zindi", "ID"], errors='ignore')

# Select only numeric columns for scaling
numeric_features = X_all.select_dtypes(include=[np.number]).columns
X_numeric_all = X_all[numeric_features]

# Standardize the combined data
scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(X_numeric_all)

# Reshape data for CNN (samples, timesteps, features)
X_reshaped_all = X_scaled_all.reshape(-1, X_scaled_all.shape[1], 1)

# Split original training data for CNN training and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale training and validation data separately for training purposes
X_train_scaled = scaler.transform(X_train[numeric_features])
X_train_reshaped = X_train_scaled.reshape(-1, X_train_scaled.shape[1], 1)
X_val_scaled = scaler.transform(X_val[numeric_features])
X_val_reshaped = X_val_scaled.reshape(-1, X_val_scaled.shape[1], 1)

# Define CNN model
cnn_model = models.Sequential([
    layers.Conv1D(64, kernel_size=3, activation="relu", input_shape=(X_train_reshaped.shape[1], 1)),
    layers.Conv1D(32, kernel_size=3, activation="relu"),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(1)
])

# Compile the model
cnn_model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Train the model
cnn_model.fit(X_train_reshaped, y_train, epochs=10, validation_data=(X_val_reshaped, y_val), batch_size=32)

# Evaluate on validation data
y_val_pred = cnn_model.predict(X_val_reshaped).flatten()
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"CNN Validation RMSE: {rmse}")

# Make predictions on the entire dataset
all_predictions = cnn_model.predict(X_reshaped_all).flatten()

# Fill in GT_NO2 values from the original training data where available, else use predictions
all_data["GT_NO2_Predicted"] = all_predictions
all_data["GT_NO2_Final"] = all_data["GT_NO2"].combine_first(all_data["GT_NO2_Predicted"])

# Prepare final submission
submission = pd.DataFrame({
    "ID_Zindi": all_IDs,
    "GT_NO2": all_data["GT_NO2_Final"]
})

# Save the submission file
submission.to_csv("submission_cnn_full.csv", index=False)
print("CNN submission file 'submission_cnn_full.csv' created successfully!")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss: 205.5337 - root_mean_squared_error: 14.0443 - val_loss: 124.1147 - val_root_mean_squared_error: 11.1407
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 126.1994 - root_mean_squared_error: 11.2333 - val_loss: 122.4310 - val_root_mean_squared_error: 11.0649
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 125.4602 - root_mean_squared_error: 11.2002 - val_loss: 120.8644 - val_root_mean_squared_error: 10.9938
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 122.9068 - root_mean_squared_error: 11.0854 - val_loss: 120.0137 - val_root_mean_squared_error: 10.9551
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 121.6372 - root_mean_squared_error: 11.0281 - val_loss: 122.2423 - val_root_mean_squared_error: 11.0563
Epoch 6/10



[1m2364/2364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
CNN submission file 'submission_cnn_full.csv' created successfully!


In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load cleaned data
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Load all captured IDs from Step 1
all_ids = pd.read_csv("all_ids.csv")

# Convert 'Date' to datetime and create date-based features
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract date features
for df in [train_data, test_data]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

# Drop 'Date' column as it's now redundant
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

# Preserve 'ID_Zindi' column for submission purposes
test_IDs = test_data["ID_Zindi"]

# Add placeholder 'GT_NO2' column in test_data for concatenation
test_data['GT_NO2'] = None

# Concatenate train and test data for consistent lagged/rolling feature calculation
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Generate lagged features
for lag in [1, 7, 30]:
    combined_data[f'GT_NO2_lag_{lag}'] = combined_data['GT_NO2'].shift(lag)

# Generate rolling features directly on the column
combined_data['GT_NO2_roll_mean_7'] = combined_data['GT_NO2'].rolling(window=7).mean()
combined_data['GT_NO2_roll_mean_30'] = combined_data['GT_NO2'].rolling(window=30).mean()

# Separate the data back into training and test sets
train_data = combined_data.iloc[:len(train_data)].dropna(subset=["GT_NO2"])  # Drop rows with NaN in target
test_data = combined_data.iloc[len(train_data):].reset_index(drop=True)  # Reset index for clean test data

# Drop ID columns in the feature set for training and testing
train_data = train_data.drop(columns=["ID_Zindi", "ID"])
test_data = test_data.drop(columns=["ID_Zindi", "ID"])

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2"])  # Drop target from features
y = train_data["GT_NO2"]

# Ensure test data has consistent columns with training features
X_test = test_data[X.columns]

# Split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Validate model
y_pred = xgb_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"XGBoost Validation RMSE: {rmse}")

# Predict on test data
test_predictions = xgb_model.predict(X_test)

# Create an initial submission file with predicted test IDs
submission_predictions = pd.DataFrame({
    "ID_Zindi": test_IDs,
    "GT_NO2": test_predictions
})

# Merge with all IDs to ensure all IDs are in the submission
submission_full = pd.merge(all_ids, submission_predictions, on="ID_Zindi", how="left")

# Fill missing GT_NO2 values with the mean prediction
mean_prediction = submission_full["GT_NO2"].mean()
submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)

# Save the final submission file
submission_full.to_csv("submission_xgboost_final.csv", index=False)
print("Final XGBoost submission file created as 'submission_xgboost_final.csv'.")


  combined_data = pd.concat([train_data, test_data], ignore_index=True)


XGBoost Validation RMSE: 9.005203930500716
Final XGBoost submission file created as 'submission_xgboost_final.csv'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Step 1: Load Data
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Load all captured IDs
all_ids = pd.read_csv("all_ids.csv")

# Convert 'Date' to datetime and create date-based features
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract date features
for df in [train_data, test_data]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

# Drop 'Date' column as it's now redundant
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

# Preserve 'ID_Zindi' column for submission purposes
test_IDs = test_data["ID_Zindi"]

# Add placeholder 'GT_NO2' column in test_data for concatenation
test_data['GT_NO2'] = None

# Concatenate train and test data for consistent lagged/rolling feature calculation
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Generate lagged features
for lag in [1, 7, 30]:
    combined_data[f'GT_NO2_lag_{lag}'] = combined_data['GT_NO2'].shift(lag)

# Generate rolling features directly on the column
combined_data['GT_NO2_roll_mean_7'] = combined_data['GT_NO2'].rolling(window=7).mean()
combined_data['GT_NO2_roll_mean_30'] = combined_data['GT_NO2'].rolling(window=30).mean()

# Separate the data back into training and test sets
train_data = combined_data.iloc[:len(train_data)].dropna(subset=["GT_NO2"])  # Drop rows with NaN in target
test_data = combined_data.iloc[len(train_data):].reset_index(drop=True)  # Reset index for clean test data

# Drop ID columns in the feature set for training and testing
train_data = train_data.drop(columns=["ID_Zindi", "ID"])
test_data = test_data.drop(columns=["ID_Zindi", "ID"])

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2"])  # Drop target from features
y = train_data["GT_NO2"]

# Ensure test data has consistent columns with training features
X_test = test_data[X.columns]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM (samples, timesteps, features)
X_reshaped = X_scaled.reshape(-1, X_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Define LSTM model
lstm_model = models.Sequential([
    layers.LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    layers.LSTM(32),
    layers.Dense(16, activation="relu"),
    layers.Dense(1)
])

# Compile the model
lstm_model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Train the model
lstm_model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Validate the model
y_val_pred = lstm_model.predict(X_val).flatten()

# Deal with NaN in y_val or y_val_pred
if np.isnan(y_val).any() or np.isnan(y_val_pred).any():
    print("NaN values detected in y_val or y_val_pred. Cleaning up...")

    # Find valid indices (non-NaN)
    valid_indices = ~np.isnan(y_val) & ~np.isnan(y_val_pred)

    # Filter y_val and y_val_pred
    y_val_cleaned = y_val[valid_indices]
    y_val_pred_cleaned = y_val_pred[valid_indices]
else:
    # Use as is if no NaN values
    y_val_cleaned = y_val
    y_val_pred_cleaned = y_val_pred

# Calculate RMSE using the cleaned arrays
rmse = mean_squared_error(y_val_cleaned, y_val_pred_cleaned, squared=False)
print(f"LSTM Validation RMSE: {rmse}")

# Predict on test data
test_predictions = lstm_model.predict(X_test_reshaped).flatten()

# Create an initial submission file with predicted test IDs
submission_predictions = pd.DataFrame({
    "ID_Zindi": test_IDs,
    "GT_NO2": test_predictions
})

# Merge with all IDs to ensure all IDs are in the submission
submission_full = pd.merge(all_ids, submission_predictions, on="ID_Zindi", how="left")

# Fill missing GT_NO2 values with the mean prediction
mean_prediction = submission_full["GT_NO2"].mean()
submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)

# Save the final submission file
submission_full.to_csv("submission_lstm_final.csv", index=False)
print("Final LSTM submission file created as 'submission_lstm_final.csv'.")


  combined_data = pd.concat([train_data, test_data], ignore_index=True)
  super().__init__(**kwargs)


Epoch 1/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - loss: nan - root_mean_squared_error: nan - val_loss: nan - val_root_mean_squared_error: nan
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: nan - root_mean_squared_error: nan - val_loss: nan - val_root_mean_squared_error: nan
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: nan - root_mean_squared_error: nan - val_loss: nan - val_root_mean_squared_error: nan
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: nan - root_mean_squared_error: nan - val_loss: nan - val_root_mean_squared_error: nan
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - loss: nan - root_mean_squared_error: nan - val_loss: nan - val_root_mean_squared_error: nan
Epoch 6/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/s



ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load and preprocess data
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Convert 'Date' to datetime and create date-based features
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract date features
for df in [train_data, test_data]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

# Drop 'Date' column as it's now redundant
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

# Remove any rows in train_data with NaN in the target variable
train_data = train_data.dropna(subset=["GT_NO2"])

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2", "ID_Zindi", "ID"], errors="ignore")
y = train_data["GT_NO2"]

# Prepare test features, dropping any non-numeric columns
X_test = test_data.drop(columns=["ID_Zindi", "ID"], errors="ignore")

# Check for NaNs in X, y, and X_test
print(f"NaN values in X: {np.isnan(X).sum().sum()}")
print(f"NaN values in y: {np.isnan(y).sum()}")
print(f"NaN values in X_test: {np.isnan(X_test).sum().sum()}")

# Ensure X and X_test contain only numeric columns
numeric_features = X.select_dtypes(include=[np.number]).columns
X = X[numeric_features]
X_test = X_test[numeric_features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM (samples, timesteps, features)
X_reshaped = X_scaled.reshape(-1, X_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Define LSTM model
lstm_model = models.Sequential([
    layers.LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    layers.LSTM(32),
    layers.Dense(16, activation="relu"),
    layers.Dense(1)
])

# Compile the model
lstm_model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Train the model with validation
history = lstm_model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Validate the model with cleaned y_val and y_val_pred (as handled previously)
y_val_pred = lstm_model.predict(X_val).flatten()
valid_indices = ~np.isnan(y_val) & ~np.isnan(y_val_pred)
y_val_cleaned = y_val[valid_indices]
y_val_pred_cleaned = y_val_pred[valid_indices]
rmse = mean_squared_error(y_val_cleaned, y_val_pred_cleaned, squared=False)
print(f"LSTM Validation RMSE: {rmse}")


NaN values in X: 0
NaN values in y: 0
NaN values in X_test: 0


  super().__init__(**kwargs)


Epoch 1/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - loss: 329.6425 - root_mean_squared_error: 18.0433 - val_loss: 200.3697 - val_root_mean_squared_error: 14.1552
Epoch 2/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 194.7017 - root_mean_squared_error: 13.9464 - val_loss: 185.2168 - val_root_mean_squared_error: 13.6094
Epoch 3/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - loss: 154.2992 - root_mean_squared_error: 12.4203 - val_loss: 135.7366 - val_root_mean_squared_error: 11.6506
Epoch 4/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - loss: 133.6549 - root_mean_squared_error: 11.5606 - val_loss: 129.0984 - val_root_mean_squared_error: 11.3621
Epoch 5/10
[1m1749/1749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - loss: 124.7368 - root_mean_squared_error: 11.1654 - val_loss: 122.6263 - val_root_mean_squared_error: 11.



In [None]:
# Predict on test data
test_predictions = lstm_model.predict(X_test_reshaped).flatten()

# Create an initial submission file with predicted test IDs
submission_predictions = pd.DataFrame({
    "ID_Zindi": test_data["ID_Zindi"],
    "GT_NO2": test_predictions
})

# Load all captured IDs
all_ids = pd.read_csv("all_ids.csv")

# Merge with all IDs to ensure all IDs are in the submission
submission_full = pd.merge(all_ids, submission_predictions, on="ID_Zindi", how="left")

# Fill missing GT_NO2 values with the mean prediction
mean_prediction = submission_full["GT_NO2"].mean()
submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)

# Save the final submission file
submission_full.to_csv("submission_lstm_final.csv", index=False)
print("Final LSTM submission file created as 'submission_lstm_final.csv'.")


[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission_full["GT_NO2"].fillna(mean_prediction, inplace=True)


Final LSTM submission file created as 'submission_lstm_final.csv'.


In [None]:
# 3D CNN
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load and preprocess data
train_data = pd.read_csv("Train_cleaned.csv")
test_data = pd.read_csv("Test_cleaned.csv")

# Convert 'Date' to datetime and create date-based features
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract date features
for df in [train_data, test_data]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

# Drop 'Date' column as it's now redundant
train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

# Remove any rows in train_data with NaN in the target variable
train_data = train_data.dropna(subset=["GT_NO2"])

# Separate features and target variable
X = train_data.drop(columns=["GT_NO2", "ID_Zindi", "ID"], errors="ignore")
y = train_data["GT_NO2"]

# Prepare test features, dropping any non-numeric columns
X_test = test_data.drop(columns=["ID_Zindi", "ID"], errors="ignore")

# Ensure X and X_test contain only numeric columns
numeric_features = X.select_dtypes(include=[np.number]).columns
X = X[numeric_features]
X_test = X_test[numeric_features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Reshape data for 3D CNN (samples, timesteps, features)
timesteps = 10  # Example: use 10 timesteps
X_reshaped = X_scaled.reshape(-1, timesteps, X_scaled.shape[1] // timesteps)
X_test_reshaped = X_test_scaled.reshape(-1, timesteps, X_test_scaled.shape[1] // timesteps, 1)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Define 3D CNN model
cnn_3d_model = models.Sequential([
    layers.Conv3D(32, kernel_size=(3, 3, 3), activation="relu", input_shape=(X_train.shape[1], X_train.shape[2], 1)),
    layers.MaxPooling3D(pool_size=(2, 2, 2)),
    layers.Conv3D(64, kernel_size=(3, 3, 3), activation="relu"),
    layers.MaxPooling3D(pool_size=(2, 2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation="relu"),
    layers.Dense(1)
])

# Compile the model
cnn_3d_model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Train the model
history = cnn_3d_model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Validate the model
y_val_pred = cnn_3d_model.predict(X_val).flatten()
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"3D CNN Validation RMSE: {rmse}")

# Predict on test data
test_predictions = cnn_3d_model.predict(X_test_reshaped).flatten()

# Prepare submission file
test_IDs = test_data["ID_Zindi"]
submission_predictions = pd.DataFrame({
    "ID_Zindi": test_IDs,
    "GT_NO2": test_predictions
})

# Load all captured IDs and merge
all_ids = pd.read_csv("all_ids.csv")
submission_full = pd.merge(all_ids, submission_predictions, on="ID_Zindi", how="left")
submission_full["GT_NO2"].fillna(submission_full["GT_NO2"].mean(), inplace=True)

# Save the final submission file
submission_full.to_csv("submission_3dcnn.csv", index=False)
print("Final 3D CNN submission file created as 'submission_3dcnn.csv'.")


ValueError: cannot reshape array of size 909129 into shape (10,1)