In [1]:
import os

import torch


def print_config(config):
    """Print each configuration attribute as ``key: value``, one per line.

    Instance attributes are listed first. They are followed by the attributes
    defined directly on the instance's class, skipping names that start with a
    double underscore and names already present on the instance.

    Args:
        config: any object with a ``__dict__``.
    """
    merged = dict(config.__dict__)

    # Class-level entries only fill in names the instance has not overridden.
    class_level = {
        name: val
        for name, val in config.__class__.__dict__.items()
        if not (name.startswith('__') or name in merged)
    }
    merged.update(class_level)

    for name, val in merged.items():
        print(f"{name}: {val}")


class Config:
    """Static settings for this experiment, stored as plain class attributes.

    The notebook reads ``seed`` for seeding, the train/eval split, and the
    Ridge estimator. Several other fields (optimizer, scheduler,
    regularization, tracking) are not referenced by the visible cells.
    """
    # Model
    model_name = "ridge.month1.only"
    input_dim = 6
    target_dim = 1

    # Device & reproducibility
    # Evaluated once, when the class body runs.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seed = 5274

    # Optimization
    optimizer_name = "AdamW"
    lr = 1e-4
    weight_decay = 1e-3

    grad_accum = False
    grad_accum_steps = 1
    batch_size = 32  # // grad_accum_steps

    # Training schedule
    num_epochs = 150
    scheduler_name = "default"

    # Regularization
    dropout = 0.0
    drop_path_rate = 0.0
    label_smoothing = 0.0

    # Experiment tracking
    # The API token is a credential, so it is read from the environment rather
    # than embedded in the notebook. It is an empty string when the variable
    # is unset.
    # NOTE(review): a token was previously hardcoded on this line; treat it as
    # leaked and revoke it.
    neptune_token = os.environ.get("NEPTUNE_API_TOKEN", "")
    with_id = ""
    resume = False

config = Config()

In [2]:
import random
import numpy as np
import torch

def setup_reproducibility(config):
    """Seed every RNG the notebook uses and request deterministic torch kernels.

    Seeds Python's ``random``, NumPy's legacy global RNG, and torch (CPU, plus
    all CUDA devices when CUDA is available), then configures cuDNN and torch
    for deterministic execution.

    Args:
        config: object exposing a ``seed`` attribute.
    """
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Turn deterministic algorithms ON. Passing False here disabled them and
    # made warn_only meaningless. With warn_only=True, ops that have no
    # deterministic implementation emit a warning instead of raising.
    torch.use_deterministic_algorithms(True, warn_only=True)
    torch.set_float32_matmul_precision("high")

# Apply the seeding and determinism settings using the shared config object.
setup_reproducibility(config)

In [3]:
import pandas as pd

# Location of the dataset on disk.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = "/Users/arbaaz/Downloads/air_pol/ds"
train_path = path+"/Train.csv"
test_path = path+"/Test.csv"

# Load the training data, discard the identifier column, and keep only rows
# with no missing values.
train_df = (
    pd.read_csv(train_path)
    .drop(columns=['id'])
    .dropna()
)

In [4]:
# Preview the first rows of the cleaned training frame.
train_df.head()

Unnamed: 0,latitude,longitude,day_of_year,day_of_week,hour,month,pollution_value
0,51.491,-0.172,301,6,3,10,14.5
1,39.386,-121.158,254,3,21,9,34.5
2,51.459,0.596,301,6,3,10,10.5
3,35.299,-120.613,145,2,14,5,15.5
4,29.927,120.527,221,0,14,8,54.5


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Every column except the last is treated as a feature; the last one is the target.
input_cols, target_col = train_df.columns[:-1], train_df.columns[-1]
print(input_cols)
print(target_col)

# Standardise the features and shape the target as a single column, both as float32.
# NOTE(review): the scaler is fitted on all rows here, before the train/eval
# split in the next cell, so evaluation rows influence the scaling statistics.
inputs = scaler.fit_transform(train_df[input_cols]).astype(np.float32)
targets = train_df[target_col].to_numpy().reshape(-1, 1).astype(np.float32)

inputs.shape, targets.shape

Index(['latitude', 'longitude', 'day_of_year', 'day_of_week', 'hour', 'month'], dtype='object')
pollution_value


((7636, 6), (7636, 1))

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the split is seeded from the config.
train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(
    inputs, targets, train_size=0.8, random_state=config.seed
)

train_inputs.shape, train_targets.shape, eval_inputs.shape, eval_targets.shape

((6108, 6), (6108, 1), (1528, 6), (1528, 1))

In [7]:
from sklearn.linear_model import Ridge

# Fit the ridge regressor on the training split (fit returns the estimator itself).
model = Ridge(alpha=1.0, random_state=config.seed).fit(train_inputs, train_targets)

# Predict on the held-out split and force a single-column layout so the shape
# lines up with eval_targets.
preds = model.predict(eval_inputs).reshape((-1, 1))
preds.shape

(1528, 1)

In [8]:
from sklearn.metrics import root_mean_squared_error


def metric_fn_np(preds, targets):
    """Score predictions as ``exp(-RMSE / 100)``.

    Args:
        preds: predicted values.
        targets: ground-truth values.

    Returns:
        ``exp(-rmse / 100)``, which equals 1.0 when the RMSE is zero and
        decreases towards 0 as the RMSE grows.
    """
    # scikit-learn documents the order as (y_true, y_pred). Passing by keyword
    # puts each array in its documented slot; the original call had them
    # swapped positionally. The numeric result is unchanged.
    rmse = root_mean_squared_error(y_true=targets, y_pred=preds)
    return np.exp(-rmse / 100)

metric_fn_np(preds, eval_targets)

np.float64(0.544894455758783)

In [9]:
# Load the test set and keep its ids aside for the submission file.
test_df = pd.read_csv(test_path)
ids = test_df["id"]
# Drop the id column so only the feature columns are passed to the scaler.
test_df = test_df.drop(columns="id")
# Reuse the scaler fitted earlier (transform only, no refit on the test rows).
test_data = scaler.transform(test_df)
test_data

array([[-4.17046527,  1.17857751, -2.15887654,  1.35983274,  0.63096159,
        -2.31938321],
       [ 0.2906513 ,  0.97443741, -2.19641603, -0.31279658,  0.15007301,
        -2.31938321],
       [ 0.96783123, -1.58774267, -2.37160034, -0.31279658, -1.45288894,
        -2.31938321],
       ...,
       [ 0.91355345, -0.15500964, -2.15887654,  1.35983274, -0.01022319,
        -2.31938321],
       [-3.32842924,  1.23696898, -2.15887654,  1.35983274,  0.15007301,
        -2.31938321],
       [ 0.29863003,  0.97059304, -2.19641603, -0.31279658,  0.15007301,
        -2.31938321]], shape=(2739, 6))

In [10]:
# Predict on the scaled test features and cast to float64.
# Note: this rebinds `preds`, replacing the evaluation-split predictions.
preds = model.predict(test_data).astype(np.float64)
preds.shape

(2739,)

In [11]:
# Build the submission frame: start from the test ids, then attach the predictions.
pred_df = pd.DataFrame(ids)
pred_df["pollution_value"] = preds

In [12]:
# Preview the first rows of the submission frame.
pred_df.head()

Unnamed: 0,id,pollution_value
0,0,38.724467
1,1,24.468424
2,2,9.863706
3,3,20.260076
4,4,23.684753


In [13]:
# Write the submission CSV; index=False leaves the row index out of the file.
# NOTE(review): absolute, machine-specific output path — consider making it configurable.
pred_df.to_csv("/Users/arbaaz/Downloads/ridge.csv", index=False)