# Auto encoder

## Read data

In [1]:
from prepare_data import listTurbines


turbines = listTurbines()
turbines

['CRKALN_001',
 'CRKALN_002',
 'CRKALN_003',
 'CRKALN_004',
 'CRKALN_005',
 'CRKALN_006',
 'CRKALN_007',
 'CRKALN_008',
 'CRKALN_009',
 'CRKALN_010']

In [2]:
from prepare_data import TurbineData


turbineData = TurbineData(turbines[1], verbose=True)
print(turbineData.data3d.shape)
print(turbineData.columns)

(297360, 720, 26)
Index(['avgwindspeed', 'avgpower', 'windspeedstandarddeviation',
       'nacelleposition', 'avgwinddirection', 'availablepowerpublished',
       'avgrotorspeed', 'avgpitchangle', 'avgreactivepower',
       'ambienttemperature', 'avghumidity', 'turbinepressure', 'density',
       'latitude', 'longitude', 'cutoutwindspeed', 'ratedwindspeed',
       'rotordiameter', 'hubheight', 'groundlevelaltitude', 'turbulent',
       'underperformanceprobability', 'overperformanceprobability',
       'turbulentvalid', 'underperformanceprobabilityvalid',
       'overperformanceprobabilityvalid'],
      dtype='object')


### Filter valid data

In [3]:
idPower = turbineData.getIdOfColumn("avgpower")
idWind = turbineData.getIdOfColumn("avgwindspeed")

idPower, idWind

(1, 0)

In [4]:
normalIndices = turbineData.getNormalIndices(
    maxConsecutiveInvalid=5,  # 1 hours of consecutive invalid data
    maxInvalidRate=0.5,
    underperformThreshold=1,  # ignore underperf threshold
)

Valid: 10995
Not underperform: 297360
Normal: 10995


## Split train test

In [5]:
import numpy as np


SEED = 17
TEST_RATIO = 0.2


# suffle data
np.random.seed(SEED)
np.random.shuffle(normalIndices)

# split data
n_test = int(len(normalIndices) * TEST_RATIO)
testIndices = normalIndices[:n_test]
trainIndices = normalIndices[n_test:]

print(len(trainIndices), len(testIndices))

8796 2199


In [None]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

from cls_dataset import trainTestTurbineDataset


targetFeats = [
    "avgwindspeed",
    "avgpower",
    "ambienttemperature",
    "avghumidity",
    "avgrotorspeed",
]
targetFeatIndices = [turbineData.getIdOfColumn(feat) for feat in targetFeats]

underPerfValidId = turbineData.getIdOfColumn("underperformanceprobabilityvalid")


sortedTrainIndices = trainIndices.copy()
sortedTrainIndices.sort()  # h5py requires sorted indices

trainData2d = turbineData.data3d[sortedTrainIndices, 0, :]
# drop invalid 
trainData2d = trainData2d[trainData2d[:, underPerfValidId] == 1]

scalerTrainData = trainData2d[:, targetFeatIndices]
print(scalerTrainData.shape)

stdScaler = StandardScaler()
stdScaler.fit(scalerTrainData)

knn = KNNImputer(n_neighbors=3)
knn.fit(scalerTrainData)

transformer = Pipeline([("scaler", stdScaler), ("knn", knn)])

trainSet, testSet = trainTestTurbineDataset(
    turbineData.data3d,
    trainIndices,
    testIndices,
    targetFeatIndices,
    transformer.transform,
    underPerfValidId,
)

print(f"Data shape: {trainSet[0][0].shape}")

(4853, 5)


In [None]:
## check for nan 
import numpy as np

np.sum(np.isnan(scalerTrainData)) / np.prod(scalerTrainData.shape)

np.float64(0.24117233893110274)

## Model

In [None]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, latentDim):
        super().__init__()
        self.latentDim = latentDim
        # 1 learnable embedding for invalid values
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, stride=2, padding=1)  # 5->3
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=2, padding=1)  # 3->2
        self.conv3 = nn.Conv2d(16, 32, kernel_size=2)  # 2->1
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(5728, 2048)
        self.fc2 = nn.Linear(2048, latentDim)

        # debug flag
        self.debug = 0  # print once
    def forward(self, x, mask):
        debugging = False
        if self.debug > 0:
            debugging = True
            self.debug -= 1

        if mask is None:
            raise ValueError("Mask is required")
        xShapeLen = len(x.shape)
        if mask.shape != x.shape[0 : xShapeLen - 1]:
            raise ValueError(
                "Mask shape must cover til x's time steps: "
                + str(mask.shape)
                + " != "
                + str(x.shape[0 : xShapeLen - 1])
            )

        maskAdd1 = mask.unsqueeze(-1).expand_as(x)
        x = x * maskAdd1 + (1 - maskAdd1) * self.maskEmbed
        x= self.conv1(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv1: {x.shape}")
            
        x = self.conv2(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv2: {x.shape}")
            
        x = self.conv3(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv3: {x.shape}")
            
        x = self.flatten(x)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x
class Decoder(nn.Module):
    def __init__(self, latentDim):
        super().__init__()
        self.latentDim = latentDim
        self.fc1 = nn.Linear(latentDim, 2048)
        self.fc2 = nn.Linear(2048, 5728)
        self.conv1 = nn.ConvTranspose2d(32, 16, kernel_size=2)
        self.conv2 = nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=1)
        self.debug = 0  # print once
    def forward(self, x):
        debugging = False
        if self.debug > 0:
            debugging = True
            self.debug -= 1
        x = self.fc1(x)
        x = torch.relu(x)
        if debugging:
            print(f"FC1: {x.shape}")
        x = self.fc2(x)
        x = torch.relu(x)
        if debugging:
            print(f"FC2: {x.shape}")
        x = x.view(-1, 32, 179, 1)
        x = self.conv1(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv1: {x.shape}")
        x = self.conv2(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv2: {x.shape}")
        x = self.conv3(x)
        if debugging:
            print(f"Conv3: {x.shape}")
        x = torch.sigmoid(x)
        return x
class Autoencoder(nn.Module):
    def __init__(self, latent_dim: int):
        super().__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)
    def forward(self, x, mask=None):
        latent = self.encoder(x, mask)
        reconstructed = self.decoder(latent)
        return reconstructed


# test if the model is working
testModel = Autoencoder(1024)
testModel.encoder.debug = 1

# pass a random tensor to the model
x = torch.randn(32, 1, 720, 5)
output = testModel(x, mask=torch.ones(x.shape[:3]))

print(f"Expected output shape: {x.shape}")
print(f"Output shape: {output.shape}")

# print(x[0])
# print(output[0])

del testModel
del x
del output

Conv1: torch.Size([32, 8, 360, 3])
Conv2: torch.Size([32, 16, 180, 2])
Conv3: torch.Size([32, 32, 179, 1])
Expected output shape: torch.Size([32, 1, 720, 5])
Output shape: torch.Size([32, 1, 717, 5])


## Training 

In [None]:
from masking import maskedMseLoss

In [None]:
from matplotlib import pyplot as plt
import torch.optim as optim
from torch.utils.data import DataLoader


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainLoader = DataLoader(trainSet, batch_size=16, shuffle=True, pin_memory=True)
testLoader = DataLoader(testSet, batch_size=16, shuffle=True, pin_memory=True)

model = Autoencoder(1024).to(device)
criterion = maskedMseLoss
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 10
outputs = []
losses = []
for epoch in range(num_epochs):
    for x, mask in trainLoader:
        x, mask = x.float().to(device), mask.float().to(device)

        # Forward pass
        reconstructs = model(x, mask)

        f = open("train.log", "w+")
        print(x, file=f)
        print("=" * 50, file=f)
        print(mask, file=f)
        print("=" * 50, file=f)
        print(reconstructs, file=f)
        exit(0)

        # match reconstructed shape with inputs
        n_timestepsReconstructed = reconstructs.shape[-2]
        x = x[:, :, :n_timestepsReconstructed, :]
        mask = mask[:, :, :n_timestepsReconstructed]

        loss = criterion(reconstructs, x, mask)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    outputs.append((num_epochs, x.detach().cpu(), reconstructs.detach().cpu()))

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {np.mean(losses):.4f}")

# Save the model
torch.save(model.state_dict(), "autoencoder_cnn.pth")

plt.style.use("fivethirtyeight")
plt.xlabel("Iterations")
plt.ylabel("Loss")

# Plotting the last 100 values

plt.plot(losses[0:100])

NameError: name 'torch' is not defined

In [None]:
import torch 

torch.cuda.get_device_name(0)

'Quadro T2000'