In [1]:
import torch
import delu
import delu.data

# settings
device = torch.device('cuda')
delu.improve_reproducibility(base_seed=1024)

# wandb config
config = {
    "learning_rate": 0.0003,
    "epochs": 1000,
    "batch_size": 32,
    "dropout": 0.2,
    "train_split": 0.8,
    "weight_decay": 0.0,
    "patience": 20
}

In [2]:
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.metrics
import torch
from torch.utils.data import Dataset

import delu
delu.improve_reproducibility(base_seed=1024)

class drdo_data(Dataset):
    def __init__(self, dataset, dep_variable, train_split, device,
                 all_pp: bool=False, no_pp: bool=False, target_pp: bool=False, feature_pp: bool=False) -> None:
        self.train_split = train_split
        self.device = device
        self.dataset = dataset
        self.dep_variable = dep_variable
        self.df = pd.read_csv(dataset)
        self.X = {}
        self.y = {}

        y_all = self.df[dep_variable].astype('float32').to_numpy()
        X_all = self.df.drop(dep_variable, axis=1).astype('float32').to_numpy()

        self.X['train'], self.X['test'], self.y['train'], self.y['test'] = sklearn.model_selection.train_test_split(
            X_all, y_all, train_size=train_split, random_state=1024)

        self.X['train'], self.X['val'], self.y['train'], self.y['val'] = sklearn.model_selection.train_test_split(
            self.X['train'], self.y['train'], train_size=train_split, random_state=1024)
        
        if all_pp == True:
            preprocess = sklearn.preprocessing.QuantileTransformer()
            preprocess.fit(self.X['train'])
            self.X = {k: torch.tensor(preprocess.transform(v), device=device) for k, v in self.X.items()}
            self.y = {k: torch.tensor(v, device=device) for k, v in self.y.items()}
            self.y_mean = self.y['train'].mean().item()
            self.y_std = self.y['train'].std().item()
            self.y = {k: (v - self.y_mean) / self.y_std for k, v in self.y.items()}
        else:
            pass

        if no_pp == True:
            self.X = {k: torch.tensor(v, device=device) for k, v in self.X.items()}
            self.y = {k: torch.tensor(v, device=device) for k, v in self.y.items()}
        else:
            pass

        if target_pp == True:
            self.X = {k: torch.tensor(v, device=device) for k, v in self.X.items()}
            self.y = {k: torch.tensor(v, device=device) for k, v in self.y.items()}
            self.y_mean = self.y['train'].mean().item()
            self.y_std = self.y['train'].std().item()
            self.y = {k: (v - self.y_mean) / self.y_std for k, v in self.y.items()}
        else:
            pass

        if feature_pp == True:
            preprocess = sklearn.preprocessing.QuantileTransformer()
            preprocess.fit(self.X['train'])
            self.X = {k: torch.tensor(preprocess.transform(v), device=device) for k, v in self.X.items()}
            self.y = {k: torch.tensor(v, device=device) for k, v in self.y.items()}
        else:
            pass

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        return self.X['train'][index], self.y['train'][index]


In [3]:
import pandas as pd
df = pd.read_csv('../../data/SIDI_Full.csv')
df.head(3)

Unnamed: 0,LD,Velocity (km/s),a (degrees),$ (degrees),DI
0,1,0.25,90,75,7e-05
1,1,0.25,90,60,0.0058
2,1,0.25,90,45,0.01259


In [4]:
all_pp = drdo_data(
    dataset='../../data/SIDI_Full.csv',
    dep_variable='DI',
    train_split=0.8,
    device=device,
    all_pp=True
)
no_pp = drdo_data(
    dataset='../../data/SIDI_Full.csv',
    dep_variable='DI',
    train_split=0.8,
    device=device,
    no_pp=True
)
target_pp = drdo_data(
    dataset='../../data/SIDI_Full.csv',
    dep_variable='DI',
    train_split=0.8,
    device=device,
    target_pp=True
)
feature_pp = drdo_data(
    dataset='../../data/SIDI_Full.csv',
    dep_variable='DI',
    train_split=0.8,
    device=device,
    feature_pp=True
)
print(f"all_pp:\n{all_pp.X['train']}")
print(f"no_pp:\n{no_pp.X['train']}")
print(f"target_pp:\n{target_pp.X['train']}")
print(f"feature_pp:\n{feature_pp.X['train']}")

all_pp:
tensor([[0.4940, 0.6381, 0.8103, 0.2457],
        [0.4940, 0.2257, 0.3589, 0.7447],
        [0.0000, 0.3699, 0.4339, 0.4154],
        ...,
        [0.0000, 0.3699, 0.1216, 0.0000],
        [0.4940, 0.0000, 0.7352, 0.2457],
        [0.0000, 0.0000, 0.2032, 0.0000]], device='cuda:0')
no_pp:
tensor([[  2.0000,   1.5000,  60.0000,  15.0000],
        [  2.0000,   0.5000, -30.0000,  60.0000],
        [  1.0000,   1.0000, -15.0000,  30.0000],
        ...,
        [  1.0000,   1.0000, -75.0000,   0.0000],
        [  2.0000,   0.2500,  45.0000,  15.0000],
        [  1.0000,   0.2500, -60.0000,   0.0000]], device='cuda:0')
target_pp:
tensor([[  2.0000,   1.5000,  60.0000,  15.0000],
        [  2.0000,   0.5000, -30.0000,  60.0000],
        [  1.0000,   1.0000, -15.0000,  30.0000],
        ...,
        [  1.0000,   1.0000, -75.0000,   0.0000],
        [  2.0000,   0.2500,  45.0000,  15.0000],
        [  1.0000,   0.2500, -60.0000,   0.0000]], device='cuda:0')
feature_pp:
tensor([[0.4940, 

In [12]:
losses = {'all_pp': [], 'no_pp': [], 'target_pp': [], 'feature_pp': []}

In [13]:
# all_pp
import math
import rtdl

data = all_pp
model = rtdl.MLP.make_baseline(
    d_in=data.X['train'].shape[1],
    d_layers=[64,64],
    dropout=config['dropout'],
    d_out=1,
)

model.to(device)
optimizer = (
    torch.optim.AdamW(model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'])
)
loss_fn = torch.nn.MSELoss()

@torch.no_grad()
def evaluate(part):
    model.eval()
    pred = model(data.X[part]).squeeze(1)
    target = data.y[part]
    score = loss_fn(pred, target)
    return {
        'score': score,
        'pred': pred,
        'target': target
    }

# Create a dataloader for batches of indices
batch_size = config['batch_size']
train_loader = delu.data.make_index_dataloader(len(data.X['train']), config['batch_size'])

# Create a progress tracker for early stopping
progress = delu.ProgressTracker(config['patience'])
print(f'Test score before training: {evaluate("test")["score"]:.4f}')

n_epochs = config['epochs']
for epoch in range(n_epochs):
    for iteration, batch_idx in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        x_batch = data.X['train'][batch_idx]
        y_batch = data.y['train'][batch_idx]
        loss = loss_fn(model(x_batch).squeeze(1), y_batch)
        loss.backward()
        optimizer.step()

    val_score = evaluate('val')
    test_score = evaluate('test')

    valid_rmse = math.sqrt(val_score['score'].cpu().numpy())
    test_rmse = math.sqrt(test_score['score'].cpu().numpy())

    losses['all_pp'].append(test_rmse)

    print(f'Epoch {epoch:03d} | Validation score: {val_score["score"]:.4f} | Test score: {test_score["score"]:.4f}', end='')
    progress.update((-1) * val_score["score"])
    if progress.success:
        print(' <<< BEST VALIDATION EPOCH', end='')
    print()
    if progress.fail:
        break


Test score before training: 0.8074


TypeError: 'float' object is not subscriptable

In [15]:
test_rmse['score']

TypeError: 'float' object is not subscriptable

In [None]:
print(test_score['pred'].shape)
print(test_score['target'].shape)

tmp = pd.DataFrame()
tmp['predictions'] = test_score['pred'].cpu().numpy()
tmp['targets'] = test_score['target'].cpu().numpy()

'''(target - pred)/(target)'''

tmp['error'] = (((test_score['target'].cpu().numpy() - test_score['pred'].cpu().numpy()) / test_score['target'].cpu().numpy()))

In [None]:
tmp.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Define the window size for the moving average
window_size = 33

# Calculate the moving average using np.convolve
smoothed_predictions = np.convolve(tmp['predictions'], np.ones(window_size)/window_size, mode='same')
smoothed_targets = np.convolve(tmp['targets'], np.ones(window_size)/window_size, mode='same')

plt.figure(figsize=(16,6))
plt.plot(smoothed_predictions, label='predictions')
plt.plot(smoothed_targets, label='targets')
# plt.plot(tmp['error'], label='error')
plt.legend()
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline

# Assume tmp['predictions'], tmp['targets'], and tmp['error'] are defined

# Define the window size for the moving average
window_size = 33

# Calculate the moving average using np.convolve
smoothed_predictions = np.convolve(tmp['predictions'], np.ones(window_size)/window_size, mode='same')
smoothed_targets = np.convolve(tmp['targets'], np.ones(window_size)/window_size, mode='same')

# Generate smooth curves using make_interp_spline
x = np.arange(len(tmp['predictions']))
x_smooth = np.linspace(x.min(), x.max(), 1000)  # Adjust the number of points for smoother curve
curve_predictions = make_interp_spline(x, smoothed_predictions)(x_smooth)
curve_targets = make_interp_spline(x, smoothed_targets)(x_smooth)

# Plot the original and smoothed data with smooth curves
fig, ax1 = plt.subplots(figsize=(16, 6))

# ax1.plot(tmp['predictions'], label='predictions')
# ax1.plot(tmp['targets'], label='targets')
ax1.plot(x_smooth, curve_predictions, label='smoothed predictions')
ax1.plot(x_smooth, curve_targets, label='smoothed targets')

ax1.set_xlabel('Index')
ax1.set_ylabel('Value')

# Create a secondary y-axis for error
ax2 = ax1.twinx()
ax2.plot(tmp['error'], color='red', label='error')
ax2.set_ylabel('Error')

# Combine the legends from both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2)

plt.grid()
plt.show()


In [None]:
# no_pp
import math
import rtdl

data = no_pp
model = rtdl.MLP.make_baseline(
    d_in=data.X['train'].shape[1],
    d_layers=[64,64],
    dropout=config['dropout'],
    d_out=1,
)

model.to(device)
optimizer = (
    torch.optim.AdamW(model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'])
)
loss_fn = torch.nn.MSELoss()

@torch.no_grad()
def evaluate(part):
    model.eval()
    pred = model(data.X[part]).squeeze(1)
    target = data.y[part]
    score = loss_fn(pred, target)
    return score

# Create a dataloader for batches of indices
batch_size = config['batch_size']
train_loader = delu.data.make_index_dataloader(len(data.X['train']), config['batch_size'])

# Create a progress tracker for early stopping
progress = delu.ProgressTracker(config['patience'])
print(f'Test score before training: {evaluate("test"):.4f}')

n_epochs = config['epochs']
for epoch in range(n_epochs):
    for iteration, batch_idx in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        x_batch = data.X['train'][batch_idx]
        y_batch = data.y['train'][batch_idx]
        loss = loss_fn(model(x_batch).squeeze(1), y_batch)
        loss.backward()
        optimizer.step()

    val_score = evaluate('val')
    test_score = evaluate('test')

    valid_rmse = math.sqrt(val_score.cpu().numpy())
    test_rmse = math.sqrt(test_score.cpu().numpy())

    losses['no_pp'].append(test_rmse['score'])

    print(f'Epoch {epoch:03d} | Validation score: {val_score:.4f} | Test score: {test_score:.4f}', end='')
    progress.update((-1) * val_score)
    if progress.success:
        print(' <<< BEST VALIDATION EPOCH', end='')
    print()
    if progress.fail:
        break


In [None]:
# target_pp
import math
import rtdl

data = target_pp
model = rtdl.MLP.make_baseline(
    d_in=data.X['train'].shape[1],
    d_layers=[64,64],
    dropout=config['dropout'],
    d_out=1,
)

model.to(device)
optimizer = (
    torch.optim.AdamW(model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'])
)
loss_fn = torch.nn.MSELoss()

@torch.no_grad()
def evaluate(part):
    model.eval()
    pred = model(data.X[part]).squeeze(1)
    target = data.y[part]
    score = loss_fn(pred, target)
    return score

# Create a dataloader for batches of indices
batch_size = config['batch_size']
train_loader = delu.data.make_index_dataloader(len(data.X['train']), config['batch_size'])

# Create a progress tracker for early stopping
progress = delu.ProgressTracker(config['patience'])
print(f'Test score before training: {evaluate("test"):.4f}')

n_epochs = config['epochs']
for epoch in range(n_epochs):
    for iteration, batch_idx in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        x_batch = data.X['train'][batch_idx]
        y_batch = data.y['train'][batch_idx]
        loss = loss_fn(model(x_batch).squeeze(1), y_batch)
        loss.backward()
        optimizer.step()

    val_score = evaluate('val')
    test_score = evaluate('test')

    valid_rmse = math.sqrt(val_score.cpu().numpy())
    test_rmse = math.sqrt(test_score.cpu().numpy())

    losses['target_pp'].append(test_rmse['score'])

    print(f'Epoch {epoch:03d} | Validation score: {val_score:.4f} | Test score: {test_score:.4f}', end='')
    progress.update((-1) * val_score)
    if progress.success:
        print(' <<< BEST VALIDATION EPOCH', end='')
    print()
    if progress.fail:
        break


In [None]:
# feature_pp
import math
import rtdl

data = feature_pp
model = rtdl.MLP.make_baseline(
    d_in=data.X['train'].shape[1],
    d_layers=[64,64],
    dropout=config['dropout'],
    d_out=1,
)

model.to(device)
optimizer = (
    torch.optim.AdamW(model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'])
)
loss_fn = torch.nn.MSELoss()

@torch.no_grad()
def evaluate(part):
    model.eval()
    pred = model(data.X[part]).squeeze(1)
    target = data.y[part]
    score = loss_fn(pred, target)
    return score

# Create a dataloader for batches of indices
batch_size = config['batch_size']
train_loader = delu.data.make_index_dataloader(len(data.X['train']), config['batch_size'])

# Create a progress tracker for early stopping
progress = delu.ProgressTracker(config['patience'])
print(f'Test score before training: {evaluate("test"):.4f}')

n_epochs = config['epochs']
for epoch in range(n_epochs):
    for iteration, batch_idx in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        x_batch = data.X['train'][batch_idx]
        y_batch = data.y['train'][batch_idx]
        loss = loss_fn(model(x_batch).squeeze(1), y_batch)
        loss.backward()
        optimizer.step()

    val_score = evaluate('val')
    test_score = evaluate('test')

    valid_rmse = math.sqrt(val_score.cpu().numpy())
    test_rmse = math.sqrt(test_score.cpu().numpy())

    losses['feature_pp'].append(test_rmse['score'])

    print(f'Epoch {epoch:03d} | Validation score: {val_score:.4f} | Test score: {test_score:.4f}', end='')
    progress.update((-1) * val_score)
    if progress.success:
        print(' <<< BEST VALIDATION EPOCH', end='')
    print()
    if progress.fail:
        break


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(16,6))
plt.plot(losses['all_pp'], label='all_pp')
plt.plot(losses['no_pp'], label='no_pp')
plt.plot(losses['target_pp'], label='target_pp')
plt.plot(losses['feature_pp'], label='feature_pp')
plt.xlabel('number of epochs')
plt.ylabel('rmse loss')
plt.legend()
plt.grid()
plt.show()