# Auto encoder

## Read data

In [1]:
from prepare_data import listTurbines


turbines = listTurbines()
turbines

['CRKALN_001',
 'CRKALN_002',
 'CRKALN_003',
 'CRKALN_004',
 'CRKALN_005',
 'CRKALN_006',
 'CRKALN_007',
 'CRKALN_008',
 'CRKALN_009',
 'CRKALN_010']

In [2]:
from prepare_data import TurbineData


turbineData = TurbineData(turbines[1], verbose=True)
print(turbineData.data3d.shape)
print(turbineData.columns)

(297360, 720, 26)
Index(['avgwindspeed', 'avgpower', 'windspeedstandarddeviation',
       'nacelleposition', 'avgwinddirection', 'availablepowerpublished',
       'avgrotorspeed', 'avgpitchangle', 'avgreactivepower',
       'ambienttemperature', 'avghumidity', 'turbinepressure', 'density',
       'latitude', 'longitude', 'cutoutwindspeed', 'ratedwindspeed',
       'rotordiameter', 'hubheight', 'groundlevelaltitude', 'turbulent',
       'underperformanceprobability', 'overperformanceprobability',
       'turbulentvalid', 'underperformanceprobabilityvalid',
       'overperformanceprobabilityvalid'],
      dtype='object')


### Filter valid data

In [3]:
idPower = turbineData.getIdOfColumn("avgpower")
idWind = turbineData.getIdOfColumn("avgwindspeed")

idPower, idWind

(1, 0)

In [None]:
normalIndices = turbineData.getNormalIndices(maxConsecutiveInvalid=12)

Valid: 20628
Not underperform: 102565
Normal: 10528


## Split train test

In [11]:
import numpy as np


SEED = 17
TEST_RATIO = 0.2


# suffle data
np.random.seed(SEED)
np.random.shuffle(normalIndices)

# split data
n_test = int(len(normalIndices) * TEST_RATIO)
testIndices = normalIndices[:n_test]
trainIndices = normalIndices[n_test:]

print(len(trainIndices), len(testIndices))

4652 1162


In [12]:
from sklearn.discriminant_analysis import StandardScaler

from cls_dataset import trainTestTurbineDataset


targetFeats = [
    "avgwindspeed",
    "avgpower",
    "ambienttemperature",
    "avghumidity",
    "density",
]
targetFeatIndices = [turbineData.getIdOfColumn(feat) for feat in targetFeats]


stdScaler = StandardScaler()

sortedTrainIndices = trainIndices.copy()
sortedTrainIndices.sort()  # h5py requires sorted indices

scalerTrainData = turbineData.data3d[sortedTrainIndices, 0, :][:, targetFeatIndices]
print(scalerTrainData.shape)

stdScaler.fit(scalerTrainData)


trainSet, testSet = trainTestTurbineDataset(
    turbineData.data3d,
    trainIndices,
    testIndices,
    targetFeatIndices,
    stdScaler.transform,
)

print(f"Data shape: {trainSet[0][0].shape}")

(4652, 5)
Data shape: (1, 720, 5)


In [None]:
## check for nan 
import numpy as np
p
np.sum(np.isnan(scalerTrainData)) / np.prod(scalerTrainData.shape)

np.float64(0.7871883061049011)

## Model

In [7]:
import torch
import torch.nn as nn

from masking import MaskedConv2d


class Encoder(nn.Module):
    def __init__(self, latentDim):
        super().__init__()
        self.latentDim = latentDim
        # 1 learnable embedding for invalid values
        self.maskEmbed = nn.Parameter(torch.zeros(1, 1, 1, 1))
        self.conv1 = MaskedConv2d(1, 8, kernel_size=3, stride=2, padding=1)  # 5->3
        self.conv2 = MaskedConv2d(8, 16, kernel_size=3, stride=2, padding=1)  # 3->2
        self.conv3 = MaskedConv2d(16, 32, kernel_size=2)  # 2->1
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(5728, 2048)
        self.fc2 = nn.Linear(2048, latentDim)
        
        # debug flag
        self.debug = 0  # print once
    def forward(self, x, mask):
        if mask is None:
            raise ValueError("Mask is required")
        xShapeLen = len(x.shape)
        if mask.shape != x.shape[0 : xShapeLen - 1]:
            raise ValueError(
                "Mask shape must cover til x's time steps: "
                + str(mask.shape)
                + " != "
                + str(x.shape[0 : xShapeLen - 1])
            )

        maskAdd1 = mask.unsqueeze(-1).expand_as(x)
        x = x * maskAdd1 + (1 - maskAdd1) * self.maskEmbed
        
        debugging = False
        if self.debug > 0:
            debugging = True
            self.debug -= 1
        x, mask = self.conv1(x, mask)
        x = torch.relu(x)
        if debugging:
            print(f"Conv1: {x.shape}")
        x, mask = self.conv2(x, mask)
        x = torch.relu(x)
        if debugging:
            print(f"Conv2: {x.shape}")
        x, mask = self.conv3(x, mask)
        x = torch.relu(x)
        if debugging:
            print(f"Conv3: {x.shape}")
        x = self.flatten(x)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x
class Decoder(nn.Module):
    def __init__(self, latentDim):
        super().__init__()
        self.latentDim = latentDim
        self.fc1 = nn.Linear(latentDim, 2048)
        self.fc2 = nn.Linear(2048, 5728)
        self.conv1 = nn.ConvTranspose2d(32, 16, kernel_size=2)
        self.conv2 = nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=1)
        self.debug = 0  # print once
    def forward(self, x):
        debugging = False
        if self.debug > 0:
            debugging = True
            self.debug -= 1
        x = self.fc1(x)
        x = torch.relu(x)
        if debugging:
            print(f"FC1: {x.shape}")
        x = self.fc2(x)
        x = torch.relu(x)
        if debugging:
            print(f"FC2: {x.shape}")
        x = x.view(-1, 32, 179, 1)
        x = self.conv1(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv1: {x.shape}")
        x = self.conv2(x)
        x = torch.relu(x)
        if debugging:
            print(f"Conv2: {x.shape}")
        x = self.conv3(x)
        if debugging:
            print(f"Conv3: {x.shape}")
        x = torch.sigmoid(x)
        return x
class Autoencoder(nn.Module):
    def __init__(self, latent_dim: int):
        super().__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)
    def forward(self, x, mask=None):
        latent = self.encoder(x, mask)
        reconstructed = self.decoder(latent)
        return reconstructed



# test if the model is working
testModel = Autoencoder(1024)

# pass a random tensor to the model
x = torch.randn(32, 1, 720, 5)
import numpy as np
output = testModel(x, mask=torch.ones(x.shape[:3]))

print(f"Expected output shape: {x.shape}")
print(f"Output shape: {output.shape}")

# print(x[0])
# print(output[0])

del testModel
del x
del output

Expected output shape: torch.Size([32, 1, 720, 5])
Output shape: torch.Size([32, 1, 717, 5])


## Training 

In [8]:
from masking import maskedMseLoss

In [9]:
from matplotlib import pyplot as plt
import torch.optim as optim
from torch.utils.data import DataLoader



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainLoader = DataLoader(trainSet, batch_size=16, shuffle=True, pin_memory=True)
testLoader = DataLoader(testSet, batch_size=16, shuffle=True, pin_memory=True)

model = Autoencoder(1024).to(device)
criterion = maskedMseLoss
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 10
outputs = []
losses = []
for epoch in range(num_epochs):
    for x, mask in trainLoader:
        x, mask = x.float().to(device), mask.float().to(device)

        print(f"X nans: {torch.isnan(x).sum() / x.numel()}")

        # Forward pass
        reconstructs = model(x, mask)
        
        print(f"Reconstructs nans: {torch.isnan(reconstructs).sum() / reconstructs.numel()}")
        
        # match reconstructed shape with inputs
        n_timestepsReconstructed = reconstructs.shape[-2]
        x = x[:, :, :n_timestepsReconstructed, :]
        mask = mask[:, :, :n_timestepsReconstructed]
        
        loss = criterion(reconstructs, x, mask)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    outputs.append((num_epochs, x.detach().cpu(), reconstructs.detach().cpu()))

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {np.mean(losses):.4f}")

# Save the model
torch.save(model.state_dict(), "autoencoder_cnn.pth")

plt.style.use("fivethirtyeight")
plt.xlabel("Iterations")
plt.ylabel("Loss")

# Plotting the last 100 values
plt.plot(losses[-100:])

  return torch._C._cuda_getDeviceCount() > 0


X nans: 0.8097222447395325
Reconstructs nans: 0.8125
X nans: 0.6875
Reconstructs nans: 1.0
X nans: 0.7493923902511597
Reconstructs nans: 1.0
X nans: 0.8125
Reconstructs nans: 1.0
X nans: 0.625
Reconstructs nans: 1.0
X nans: 0.75
Reconstructs nans: 1.0
X nans: 0.62109375
Reconstructs nans: 1.0
X nans: 0.75
Reconstructs nans: 1.0
X nans: 0.875
Reconstructs nans: 1.0
X nans: 0.75
Reconstructs nans: 1.0
X nans: 0.75
Reconstructs nans: 1.0
X nans: 0.6849826574325562
Reconstructs nans: 1.0
X nans: 0.9375
Reconstructs nans: 1.0
X nans: 0.7469618320465088
Reconstructs nans: 1.0
X nans: 0.8098090291023254
Reconstructs nans: 1.0
X nans: 0.8125
Reconstructs nans: 1.0
X nans: 0.9375
Reconstructs nans: 1.0
X nans: 0.8748263716697693
Reconstructs nans: 1.0
X nans: 0.8120659589767456
Reconstructs nans: 1.0
X nans: 0.6111111044883728
Reconstructs nans: 1.0
X nans: 0.8125
Reconstructs nans: 1.0
X nans: 0.8125
Reconstructs nans: 1.0
X nans: 0.75
Reconstructs nans: 1.0
X nans: 0.875
Reconstructs nans: 1.

KeyboardInterrupt: 

In [47]:
sortedNormalIndices = normalIndices.copy()
sortedNormalIndices.sort()  
normData = turbineData.data3d[sortedNormalIndices, 0, :][:, targetFeatIndices]
np.isnan(normData).sum() , normData.size

(np.int64(23045), 29320)

In [48]:
allData = turbineData.data3d[:, 0, :][:, targetFeatIndices]
np.isnan(allData).sum(), allData.size

(np.int64(41616), 1486800)

In [27]:
x = []
for i in range(16):
    x.append(trainSet[i][0])
x = np.array(x)

print(x.shape)
print(np.isnan(x).sum())

(16, 1, 720, 5)
55715


In [34]:
# exclude transform and reshape
x = []
for i in range(1):
    i = 200
    rowIdx = trainSet.rowIndices[i]
    x.append(testSet.turbineData3d[rowIdx][:, targetFeatIndices])
x = np.array(x)

print(x.shape)
print(np.isnan(x).sum())

(1, 720, 5)
3600


In [35]:
np.savetxt("test.csv", x[0], delimiter=",")

In [None]:
turbineData.close()