In [None]:
import pandas as pd

# Location of the CSV file to inspect
csv_path = r"Your file path to the data file..."

# Parse the file into a DataFrame
df = pd.read_csv(csv_path)

# Leave the first five rows as the cell's final expression so they are displayed
df.head()


In [None]:
import pandas as pd
import os

# Read the table whose gaps are to be filled
df = pd.read_csv(r"Your file path to the data file...")

# Every column except the 'geology_id' identifier takes part in the filling
is_value_column = df.columns != 'geology_id'
columns_to_interpolate = df.columns[is_value_column]

# Step 1: linear interpolation along each row (axis=1), filling in both directions
row_filled = df[columns_to_interpolate].interpolate(
    method='linear',
    axis=1,
    limit_direction='both',
)
df[columns_to_interpolate] = row_filled

# Step 2: whatever is still NaN after step 1 gets the mean of its column
column_means = df[columns_to_interpolate].mean()
df[columns_to_interpolate] = df[columns_to_interpolate].fillna(column_means)

# Destination: <save_folder>/filled_train.csv (the folder is created if missing)
save_folder = r"Your file path to the data file..."
save_path = os.path.join(save_folder, "filled_train.csv")
os.makedirs(save_folder, exist_ok=True)

# Write the filled table without the pandas index column
df.to_csv(save_path, index=False)

# Show the first rows of the filled table as the cell output
df.head()


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pickle
from sklearn.preprocessing import MinMaxScaler

# === Load training data ===
train_path = r"Your file path to the data file..."
df = pd.read_csv(train_path)

# === Extract only columns named '-299' to '0' ===
# Column labels are compared as strings; labels missing from the file are skipped.
input_cols = [str(i) for i in range(-299, 1) if str(i) in df.columns]

# Targets are all columns except 'geology_id' and inputs.
# The exclusion set is built once instead of re-creating a list for every column.
non_target_cols = set(input_cols) | {'geology_id'}
target_cols = [col for col in df.columns if col not in non_target_cols]

print(f"Number of input columns: {len(input_cols)}")
print(f"Number of target columns: {len(target_cols)}")

# MinMaxScaler lets NaN values pass through unchanged, and a single NaN in a batch
# turns the loss (and then the weights) into NaN. Fail early with a clear message.
nan_count = int(df[input_cols + target_cols].isna().sum().sum())
if nan_count:
    raise ValueError(
        f"Training data contains {nan_count} missing values; "
        "fill them (e.g. with the interpolation cell) before training."
    )

# Save target_cols list for inference use.
# The file name matches the one the inference cell loads (target_cols.pkl) and
# follows the same "<folder>\<file>" pattern as the scaler paths below.
target_cols_path = r"Your file path to the data file...\target_cols.pkl"
with open(target_cols_path, "wb") as f:
    pickle.dump(target_cols, f)
print(f"Saved target columns list to: {target_cols_path}")

# === Normalize inputs and targets ===
# Separate scalers are fitted for inputs and targets; both are persisted so that
# inference can apply the same input scaling and undo the target scaling.
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train = scaler_x.fit_transform(df[input_cols].values.astype(np.float32))
Y_train = scaler_y.fit_transform(df[target_cols].values.astype(np.float32))

# Save scalers
with open(r"Your file path to the data file...\scaler_x.pkl", "wb") as f:
    pickle.dump(scaler_x, f)
with open(r"Your file path to the data file...\scaler_y.pkl", "wb") as f:
    pickle.dump(scaler_y, f)

# === Define Improved Autoencoder ===
class Autoencoder(nn.Module):
    """Fully connected encoder/decoder network.

    The encoder maps ``input_dim`` features through 2048 and 1024 units to
    ``bottleneck_dim`` units; the decoder maps them back through 1024 and 2048
    units to ``output_dim`` values. A ReLU follows every linear layer except
    the final one.

    Args:
        input_dim: Width of the input vectors.
        bottleneck_dim: Width of the encoder output.
        output_dim: Width of the final layer. A falsy value (``None`` or 0)
            means "use ``input_dim``".
    """

    def __init__(self, input_dim, bottleneck_dim=300, output_dim=None):
        super().__init__()
        if not output_dim:
            output_dim = input_dim

        # The attribute names and the position of each layer inside the
        # Sequential containers define the state_dict keys, so both are kept
        # stable for saved weights to remain loadable.
        encoder_layers = [
            nn.Linear(input_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, bottleneck_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(bottleneck_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 2048),
            nn.ReLU(),
            nn.Linear(2048, output_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Pass ``x`` through the encoder, then the decoder, and return the result."""
        return self.decoder(self.encoder(x))

# === Training setup ===
# Seed torch's global RNG so weight initialisation and batch shuffling start from
# the same state on every run (the value itself is arbitrary).
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Input width comes from the scaled inputs, output width from the scaled targets.
model = Autoencoder(input_dim=X_train.shape[1], output_dim=Y_train.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.SmoothL1Loss()
# factor=0.5 halves the learning rate once the monitored loss has stopped improving
# for longer than `patience` epochs. The `verbose` keyword is deliberately not passed:
# it is deprecated in recent PyTorch releases and may not be accepted by later ones.
# Learning-rate changes are reported from the training loop instead.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(Y_train).float())
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# === Training loop ===
epochs = 300
model.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # `loss` is a per-batch mean; weight it by the batch size so that a short
        # final batch does not count as much as a full one.
        epoch_loss += loss.item() * xb.shape[0]

    # Mean loss per sample over the whole epoch
    avg_loss = epoch_loss / len(train_dataset)

    # Step the scheduler on the epoch loss and report any learning-rate change
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(avg_loss)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"Epoch {epoch+1}: reducing learning rate from {lr_before:.3e} to {lr_after:.3e}")
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.6f}")

# === Save model ===
# Only the parameters (state_dict) are stored, so loading requires the class definition.
save_path = r"Your file path to the data file...\autoencoder_model.pth"
torch.save(model.state_dict(), save_path)
print(f"Model saved to: {save_path}")


In [None]:
import pandas as pd
import os

# Read the test table whose gaps are to be filled
df = pd.read_csv(r"Your file path to the data file...")

# All columns other than the 'geology_id' identifier are candidates for filling
keep_mask = df.columns != 'geology_id'
columns_to_interpolate = df.columns[keep_mask]

# First pass: interpolate linearly along each row (axis=1), in both directions
interpolated = df[columns_to_interpolate].interpolate(
    method='linear',
    axis=1,
    limit_direction='both',
)
df[columns_to_interpolate] = interpolated

# Second pass: any value that is still NaN is replaced by the mean of its column
per_column_mean = df[columns_to_interpolate].mean()
df[columns_to_interpolate] = df[columns_to_interpolate].fillna(per_column_mean)

# Output goes to <save_folder>/filled_test.csv; the folder is created when absent
save_folder = r"Your file path to the data file..."
save_path = os.path.join(save_folder, "filled_test.csv")
os.makedirs(save_folder, exist_ok=True)

# Persist the filled table, leaving out the pandas index
df.to_csv(save_path, index=False)

# Display the first rows of the result as the cell output
df.head()


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import pickle

# === Define Improved Autoencoder (matches training) ===
class Autoencoder(nn.Module):
    """Encoder/decoder stack of linear layers with ReLU activations.

    Encoder: ``input_dim`` -> 2048 -> 1024 -> ``bottleneck_dim``, each linear
    layer followed by a ReLU.
    Decoder: ``bottleneck_dim`` -> 1024 -> 2048 -> ``output_dim``, with a ReLU
    after the first two linear layers and none after the last.

    Args:
        input_dim: Number of input features.
        bottleneck_dim: Number of units produced by the encoder.
        output_dim: Number of output values; when falsy (``None`` or 0) the
            value of ``input_dim`` is used.
    """

    def __init__(self, input_dim, bottleneck_dim=300, output_dim=None):
        super().__init__()
        final_width = output_dim if output_dim else input_dim
        wide, narrow = 2048, 1024

        # Attribute names and layer positions determine the state_dict keys
        # ("encoder.0.weight", ...), so they are kept exactly as they are.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, wide), nn.ReLU(),
            nn.Linear(wide, narrow), nn.ReLU(),
            nn.Linear(narrow, bottleneck_dim), nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, narrow), nn.ReLU(),
            nn.Linear(narrow, wide), nn.ReLU(),
            nn.Linear(wide, final_width),
        )

    def forward(self, x):
        """Return the decoder output for the encoded representation of ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent)

# === Paths ===
model_path = r"Your file path to the data file...\autoencoder_model.pth"
target_cols_path = r"Your file path to the data file...\target_cols.pkl"
scaler_x_path = r"Your file path to the data file...\scaler_x.pkl"
scaler_y_path = r"Your file path to the data file...\scaler_y.pkl"
test_filled_path = r"Your file path to the data file...\filled_test.csv"
submission_path = r"Your file path to the data file...\submission.csv"

# === Load test data ===
df_test = pd.read_csv(test_filled_path)
# Identifier column, re-indexed from 0 so it lines up with the prediction frame below
geology_ids = df_test["geology_id"].reset_index(drop=True)
# Input columns are those labelled '-299' .. '0' that are present in the file
input_cols = [str(i) for i in range(-299, 1) if str(i) in df_test.columns]

# === Load saved column list and scalers ===
# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load artifacts that were written by a trusted training run.
with open(target_cols_path, "rb") as f:
    target_cols = pickle.load(f)

with open(scaler_x_path, "rb") as f:
    scaler_x = pickle.load(f)

with open(scaler_y_path, "rb") as f:
    scaler_y = pickle.load(f)

# The input scaler was fitted on a fixed number of columns; a test file with a
# different number of matching columns cannot be scaled or fed to the model.
expected_inputs = scaler_x.scale_.shape[0]
if len(input_cols) != expected_inputs:
    raise ValueError(
        f"Test data provides {len(input_cols)} input columns, "
        f"but the saved input scaler was fitted on {expected_inputs}."
    )

# === Normalize input using same scaler from training ===
X_test = df_test[input_cols].values.astype('float32')
X_test_scaled = scaler_x.transform(X_test)

# === Model setup ===
input_dim = len(input_cols)
output_dim = len(target_cols)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Autoencoder(input_dim=input_dim, output_dim=output_dim).to(device)
# The file is expected to hold a state_dict, which load_state_dict copies into the model
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# === Predict reconstructed output ===
# Cast explicitly to float32 (as the training cell does) so the forward pass does
# not depend on the dtype returned by the scaler.
X_test_tensor = torch.tensor(X_test_scaled).float().to(device)
with torch.no_grad():
    reconstructed = model(X_test_tensor).cpu().numpy()

# === Inverse transform predictions to original scale ===
reconstructed_original = scaler_y.inverse_transform(reconstructed)

# === Create submission dataframe ===
# One row per test sample: the identifier followed by one column per target
df_reconstructed = pd.DataFrame(reconstructed_original, columns=target_cols)
df_submission = pd.concat([geology_ids, df_reconstructed], axis=1)

# === Save submission ===
df_submission.to_csv(submission_path, index=False)
print(f"Submission file saved to: {submission_path}")
