In [1]:
import os

import torch


def print_config(config, redact_markers=("token", "secret", "password", "api_key")):
    """Print every setting of ``config`` as ``key: value`` lines.

    Instance attributes are listed first, followed by class-level attributes
    that the instance does not override. Dunder names are skipped.

    Args:
        config: Object whose ``__dict__`` and class ``__dict__`` are printed.
        redact_markers: Case-insensitive substrings; any non-empty value whose
            key contains one of them is masked so credentials do not end up
            in saved notebook output. Pass ``()`` to print everything verbatim.
    """
    attrs = dict(config.__dict__)

    # Fall back to class-level defaults for anything not set on the instance.
    for key, value in config.__class__.__dict__.items():
        if not key.startswith('__') and key not in attrs:
            attrs[key] = value

    for key, value in attrs.items():
        lowered = key.lower()
        # Empty values are left visible so an unset credential is easy to spot.
        if value and any(marker in lowered for marker in redact_markers):
            value = "***REDACTED***"
        print(f"{key}: {value}")


class Config:
    """Experiment settings, stored as class attributes.

    Values can be read from the class itself or from an instance.
    """

    # Model
    model_name = "ridge.month1.only"
    input_dim = 6
    target_dim = 1

    # Device & reproducibility
    # Resolved once, when the class body is executed.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seed = 5274

    # Optimization
    optimizer_name = "AdamW"
    lr = 1e-4
    weight_decay = 1e-3

    grad_accum = False
    grad_accum_steps = 1
    batch_size = 32  # // grad_accum_steps

    # Training schedule
    num_epochs = 150
    scheduler_name = "default"

    # Regularization
    dropout = 0.0
    drop_path_rate = 0.0
    label_smoothing = 0.0

    # Experiment tracking
    # The API token is read from the environment instead of being stored in
    # the notebook; an empty string means no token is configured.
    # NOTE(review): the token that was previously hard-coded here was saved in
    # plain text and should be revoked.
    neptune_token = os.environ.get("NEPTUNE_API_TOKEN", "")
    with_id = ""
    resume = False


# Settings object shared by the cells below.
config = Config()

In [2]:
import random
import numpy as np
import torch

def setup_reproducibility(config):
    """Seed every RNG used by the notebook and request deterministic torch ops.

    Args:
        config: Object exposing an integer ``seed`` attribute.
    """
    seed = config.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and turn off its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Enable deterministic algorithms. This used to be called with False,
    # which switched the mode off and made warn_only irrelevant.
    # warn_only=True makes ops without a deterministic implementation emit
    # a warning instead of raising.
    torch.use_deterministic_algorithms(True, warn_only=True)

    # "high" lets float32 matmuls use faster, reduced-precision internals
    # where the backend supports them.
    torch.set_float32_matmul_precision("high")

# Seed all RNGs up front, using the seed stored on the shared config object.
setup_reproducibility(config)

In [3]:
import pandas as pd

# Dataset directory. Override with the AIR_POL_DATA_DIR environment variable;
# the default keeps the location this notebook was originally run from.
path = os.environ.get("AIR_POL_DATA_DIR", "/Users/arbaaz/Downloads/air_pol/ds")
train_path = path + "/Train.csv"
test_path = path + "/Test.csv"

# Build the training frame in one chained expression (no in-place mutation),
# so re-running this cell always starts again from the CSV:
#   1. drop the id column,
#   2. drop rows with any missing value,
#   3. keep only rows whose month equals 1.
train_df = (
    pd.read_csv(train_path)
    .drop(columns=["id"])
    .dropna()
    .loc[lambda frame: frame["month"] == 1]
)

In [4]:
# Preview the first rows of the filtered training frame.
train_df.head()

Unnamed: 0,latitude,longitude,day_of_year,day_of_week,hour,month,pollution_value
11,50.432,7.479,29,0,22,1,26.123001
47,39.708,2.658,29,0,21,1,13.5
49,43.607,122.269,23,3,5,1,39.5
110,42.446,-83.418,19,4,17,1,19.3
271,48.098,16.952,29,0,21,1,15.8


In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns and pull out the target as a column vector.
# NOTE(review): the scaler is fit on every row here, before the train/eval
# split done in a later cell, so held-out rows contribute to its statistics.
scaler = StandardScaler()

# Every column but the last is a feature; the last column is the target.
input_cols, target_col = train_df.columns[:-1], train_df.columns[-1]
print(input_cols)
print(target_col)

# Features: fit and apply the scaler, then store as float32.
inputs = scaler.fit_transform(train_df[input_cols]).astype(np.float32)
# Target: reshape to (n_rows, 1) and store as float32.
targets = train_df[target_col].to_numpy().reshape(-1, 1).astype(np.float32)

inputs.shape, targets.shape

Index(['latitude', 'longitude', 'day_of_year', 'day_of_week', 'hour', 'month'], dtype='object')
pollution_value


((334, 6), (334, 1))

In [6]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the split size is scikit-learn's
# default and the seed from config makes the partition repeatable.
train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(
    inputs, targets, random_state=config.seed
)

# Show the shapes of all four pieces as a sanity check.
(train_inputs.shape, train_targets.shape, eval_inputs.shape, eval_targets.shape)

((250, 6), (250, 1), (84, 6), (84, 1))

In [7]:
from sklearn.linear_model import Ridge

# Fit the ridge regressor on the training split (fit() returns the estimator).
model = Ridge(alpha=1.0, random_state=config.seed).fit(train_inputs, train_targets)

# Predictions on the held-out split, forced to a single-column 2-D array.
preds = model.predict(eval_inputs).reshape(-1, 1)
preds.shape

(84, 1)

In [None]:
from sklearn.metrics import root_mean_squared_error


def metric_fn_np(preds, targets):
    """Turn RMSE into a score via ``exp(-RMSE / 100)``.

    A perfect prediction scores 1.0; the score decays towards 0 as the error
    grows.

    Args:
        preds: Predicted values (array-like).
        targets: Ground-truth values, same shape as ``preds``.

    Returns:
        The score as a NumPy float.
    """
    # scikit-learn's signature is (y_true, y_pred). RMSE is symmetric, so the
    # value is unchanged, but the correct order keeps this safe if the metric
    # is ever swapped for an asymmetric one.
    rmse = root_mean_squared_error(targets, preds)
    return np.exp(-(rmse / 100))


# Score the held-out predictions.
metric_fn_np(preds, eval_targets)

np.float64(0.6660068353848443)

In [13]:
# Load the test set; keep the ids aside for the output file.
test_df = pd.read_csv(test_path)
ids = test_df["id"]
test_df = test_df.drop(columns="id")

# Select the feature columns explicitly, in the order the scaler was fit on,
# rather than relying on the CSV's column order or on it having no extras.
test_data = scaler.transform(test_df[input_cols])
test_data

array([[-4.56533938,  2.51524876,  1.04430853,  1.47609363, -0.1016816 ,
         0.        ],
       [-0.17424082,  2.18474615,  0.77343171, -0.1857556 , -0.52444078,
         0.        ],
       [ 0.49231064, -1.9634207 , -0.4906601 , -0.1857556 , -1.93363806,
         0.        ],
       ...,
       [ 0.43888476,  0.35617256,  1.04430853,  1.47609363, -0.66536051,
         0.        ],
       [-3.73651925,  2.60978448,  1.04430853,  1.47609363, -0.52444078,
         0.        ],
       [-0.16638733,  2.17852213,  0.77343171, -0.1857556 , -0.52444078,
         0.        ]], shape=(2739, 6))

In [16]:
# Predict on the scaled test features.
# The model is fit on 2-D (n, 1) targets, in which case predict() should return
# an (n, 1) array; ravel() guarantees a flat (n,) vector either way, so the
# shape below does not depend on how the model happened to be fit.
# NOTE(review): this overwrites the held-out `preds` from the evaluation step.
preds = model.predict(test_data).astype(np.float64).ravel()
preds.shape

(2739,)

In [17]:
# Pair each test id with its predicted pollution value.
pred_df = pd.DataFrame(ids).assign(pollution_value=preds)

In [19]:
# Preview the first rows of the id/prediction frame.
pred_df.head()

Unnamed: 0,id,pollution_value
0,0,97.301518
1,1,57.268826
2,2,33.553589
3,3,73.030611
4,4,79.278471


In [21]:
# Write the id/prediction table to CSV, omitting the DataFrame index.
# NOTE(review): absolute, user-specific output path — consider making it configurable.
pred_df.to_csv("/Users/arbaaz/Downloads/ridge.month1.csv", index=False)