In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the dataset archive stored
# there can be read by the following cells.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import os

# Where the zipped dataset lives on the mounted Drive, and where its
# contents should be unpacked. Adjust both to match your own layout.
zip_file_path = '/content/drive/My Drive/Datasets/ShellAiData.zip'
extracted_path = '/content'

# The destination may already exist (e.g. on a re-run); that is fine.
os.makedirs(extracted_path, exist_ok=True)

# Unpack every archive member into the destination directory.
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extracted_path)

print(f'Dataset extracted to {extracted_path}')

Dataset extracted to /content


In [None]:
!pip install lightgbm catboost --quiet

In [None]:
import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# Load data

In [None]:
# Read the training table and separate it into model inputs and targets.
# Every column whose name contains 'Blend' is treated as a target; all
# remaining columns are treated as input features.
df = pd.read_csv('/content/train.csv')
target_cols = [name for name in df.columns if 'Blend' in name]
feature_mask = ~df.columns.isin(target_cols)
X = df.loc[:, feature_mask]
y = df.loc[:, target_cols]

# Feature Engineering

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` extended with derived numeric features.

    For every input column whose name contains ``'Property'`` or
    ``'fraction'`` two columns are appended, in this order:

    * ``log_<col>``  -- ``sign(x) * log1p(|x|)``
    * ``sqrt_<col>`` -- ``sign(x) * sqrt(|x|)``

    For non-negative inputs these equal plain ``log1p(x)`` / ``sqrt(x)``.
    The signed form keeps negative inputs finite instead of turning them
    into NaN / -inf, which would otherwise flow into the scaler and models.

    Two row-wise summaries, ``prop_mean`` and ``prop_std``, are appended last.
    They are computed over the *input* ``'Property'`` columns only; the
    ``log_`` / ``sqrt_`` columns derived here are deliberately excluded even
    though their names also contain ``'Property'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the derived ones.
    """
    # Decide which columns to transform before anything is added, so derived
    # columns can never be picked up as inputs.
    base_cols = [c for c in df.columns if 'Property' in c or 'fraction' in c]
    prop_cols = [c for c in base_cols if 'Property' in c]

    # Collect all new columns first and attach them in a single concat; adding
    # them one by one fragments the frame and triggers PerformanceWarning.
    derived = {}
    for col in base_cols:
        values = df[col]
        sign = np.sign(values)
        magnitude = values.abs()
        derived[f'log_{col}'] = sign * np.log1p(magnitude)
        derived[f'sqrt_{col}'] = sign * np.sqrt(magnitude)

    derived['prop_mean'] = df[prop_cols].mean(axis=1)
    derived['prop_std'] = df[prop_cols].std(axis=1)

    return pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

# Expand the raw inputs with engineered columns, then standardise them.
# The fitted `scaler` is kept so the same transformation can be applied
# to other data later on.
X_fe = engineer_features(X)
scaler = StandardScaler().fit(X_fe)
X_scaled = scaler.transform(X_fe)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical across runs.
# NOTE(review): X_scaled comes from a scaler fitted on all rows, so the
# validation rows influenced the scaling statistics -- confirm this is acceptable.
split_parts = train_test_split(X_scaled, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts


# Torch Dataset

In [None]:
class FuelDataset(Dataset):
    """Torch dataset pairing a feature matrix with a target table.

    ``X`` is converted with ``torch.tensor`` and ``y`` through its
    ``.values`` attribute; both are stored as float32 tensors.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        """Number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, targets)`` pair stored at ``index``."""
        features = self.X[index]
        targets = self.y[index]
        return features, targets

# Wrap both partitions as datasets and expose them through mini-batch loaders.
# Only the training loader shuffles; validation keeps the stored row order.
train_ds, val_ds = FuelDataset(X_train, y_train), FuelDataset(X_val, y_val)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=32)
val_loader = DataLoader(val_ds, batch_size=32)

# MLP Model

In [None]:
class MLP(nn.Module):
    """Feed-forward regressor with two hidden layers (128 and 64 units).

    Each hidden layer is Linear -> ReLU -> BatchNorm1d; a final Linear layer
    maps the 64 hidden units to ``out_dim`` outputs.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        hidden_wide, hidden_narrow = 128, 64
        # Layers are created in network order and unpacked into a Sequential
        # stored as `self.net`.
        layers = [
            nn.Linear(in_dim, hidden_wide),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_wide),
            nn.Linear(hidden_wide, hidden_narrow),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_narrow),
            nn.Linear(hidden_narrow, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw outputs."""
        return self.net(x)

def mape_loss(pred, target, eps=1e-8):
    """Mean absolute percentage error between ``pred`` and ``target``.

    Computes ``mean(|target - pred| / max(|target|, eps))``.

    The denominator is the target's magnitude clamped from below by ``eps``.
    Adding ``eps`` to the signed target instead can cancel to exactly zero
    for a small negative target and produce an infinite loss.

    Parameters
    ----------
    pred : torch.Tensor
        Model predictions.
    target : torch.Tensor
        Ground-truth values, broadcastable against ``pred``.
    eps : float, optional
        Lower bound applied to ``|target|`` to avoid division by zero.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the mean relative error.
    """
    denom = torch.clamp(torch.abs(target), min=eps)
    return torch.mean(torch.abs(target - pred) / denom)

# Seed torch's global RNG so weight initialisation and the training loader's
# shuffling are repeatable from one run to the next.
torch.manual_seed(42)

model = MLP(X_scaled.shape[1], y.shape[1])
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

# The validation split never changes, so convert it to tensors once instead
# of rebuilding them on every epoch.
X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_val_t = torch.tensor(y_val.values, dtype=torch.float32)

for epoch in range(50):
    # Training pass: one optimiser step per mini-batch.
    model.train()
    for xb, yb in train_loader:
        opt.zero_grad()
        pred = model(xb)
        loss = mape_loss(pred, yb)
        loss.backward()
        opt.step()

    # Validation pass: eval mode, no gradient tracking, whole split at once.
    model.eval()
    with torch.no_grad():
        val_loss = mape_loss(model(X_val_t), y_val_t).item()
    print(f"Epoch {epoch+1}, Val MAPE: {val_loss:.4f}")

# LGBM

In [None]:
# LGBMRegressor fits a single 1-D target, but `y` is a DataFrame with one
# column per target. MultiOutputRegressor trains one LightGBM model per
# target column and returns predictions as an (n_samples, n_targets) array,
# the same layout as the network's output.
lgb_model = MultiOutputRegressor(LGBMRegressor(n_estimators=300, learning_rate=0.05))
lgb_model.fit(X_scaled, y)
lgb_preds = lgb_model.predict(X_scaled)

# CatBoost

In [None]:
# `y` is a DataFrame of target columns. CatBoost's default RMSE objective only
# accepts a one-dimensional target, so use the MultiRMSE objective, which fits
# all target columns jointly in one model.
cat_model = CatBoostRegressor(
    verbose=0,
    iterations=300,
    learning_rate=0.05,
    loss_function='MultiRMSE',
)
cat_model.fit(X_scaled, y)
cat_preds = cat_model.predict(X_scaled)

# Neural Net prediction

In [None]:
# Inference only: switch to eval mode and disable gradient tracking so no
# autograd graph is built for the forward pass over the full feature matrix.
model.eval()
with torch.no_grad():
    nn_preds = model(torch.tensor(X_scaled, dtype=torch.float32)).numpy()

# Blend all three

In [None]:
# Equal-weight ensemble: element-wise average of the three models'
# predictions on the scaled training features.
ensemble_total = nn_preds + lgb_preds + cat_preds
final_preds = ensemble_total / 3

# Predict on test.csv

In [None]:
# Apply the same feature pipeline to the test file: engineer features, keep
# exactly the columns (and order) the scaler was fitted on, then scale.
test_df = pd.read_csv('/content/test.csv')
X_test_fe = engineer_features(test_df)[X_fe.columns]
X_test_scaled = scaler.transform(X_test_fe)

# Set eval mode explicitly so this cell does not depend on an earlier cell
# having done it, and skip gradient tracking for inference.
model.eval()
with torch.no_grad():
    nn_test_preds = model(torch.tensor(X_test_scaled, dtype=torch.float32)).numpy()
lgb_test_preds = lgb_model.predict(X_test_scaled)
cat_test_preds = cat_model.predict(X_test_scaled)

# Equal-weight average of the three models.
test_preds = (nn_test_preds + lgb_test_preds + cat_test_preds) / 3

# Save submission

In [None]:
# Write the blended predictions to disk and download that same file.
# A single path variable is used for both steps so the download can never
# point at a file that was not written.
submission_path = "/content/final-solution.csv"

submission = pd.DataFrame(test_preds, columns=y.columns)
# NOTE(review): index=True writes the positional row index as an unnamed
# first column -- confirm this matches the expected submission format.
submission.to_csv(submission_path, index=True)

from google.colab import files
files.download(submission_path)