In [63]:
import pandas as pd
import os
import polars as pl # memory efficient
import re


pd.options.display.max_columns = None
pd.options.plotting.backend = 'plotly'


In [64]:
class PreProcessing:
    '''
    renames columns to more memorable names. 
    drops avg,min,max, std dev etc.
    drops duplicates.
    '''
    def __init__(self,df:pd.DataFrame,featureDesc:pd.DataFrame):
        self.workingData = PreProcessing.renameColumns(df,featureDesc)
        self._cleanedData = None

    @property
    def cleanedData(self):
        if self._cleanedData is None:
            self._cleanedData = self.cleanData(self.workingData)
        return self._cleanedData
    
    def camelCase(s:str)->str:
        s = re.sub(r'[^a-zA-Z0-9]',' ',s)
        return ''.join([w.capitalize() if idx!=0 else w.lower() for idx,w in enumerate(s.split())])

    def renameColumns(df:pd.DataFrame,featureDesc:pd.DataFrame)->pd.DataFrame:
        featureDesc['description'] = featureDesc['description'].apply(PreProcessing.camelCase)
        mapNames = featureDesc.set_index('sensor_name')['description'].to_dict()

        # dropping std dev, max, min all that shite
        pattern = re.compile(r'(?P<feature>^[a-zA-Z]+(?:_[A-Za-z0-9]+)*?)(?:_(?P<stat>[a-z]{3}))?$')
        keepCols = (
            pd.DataFrame(df.columns.map(lambda x: pattern.match(x).groupdict()).values.tolist())
            .loc[lambda s: s['stat'].isna()|s['stat'].eq('avg')]
            .assign(colName = lambda df: df['feature'] + df['stat'].fillna('').apply(lambda x: f'_{x}' if x else ''))    
            .colName
            .to_list()
        )

        workingData = df[keepCols].copy()
        workingData.columns = workingData.columns.str.replace('_avg','')
        workingData.rename(columns=mapNames,inplace=True)
        assert 0==workingData.columns.duplicated().sum() # no duplicates
        return workingData
    
    def cleanData(self,workingData:pd.DataFrame):
        # dtypes
        workingData['time_stamp'] = pd.to_datetime(workingData['time_stamp'],format='%Y-%m-%d %H:%M:%S')

        # drop duplicates
        workingData.drop_duplicates(subset='time_stamp asset_id'.split(),inplace=True)
        workingData.reset_index(drop=True,inplace=True)

        # drop cols
        workingData.drop(columns='id train_test'.split(),inplace=True)
    

        return workingData

In [65]:
class MergeData:
    '''
    Merges the cleaned data with the event info
    '''
    def __init__(self,cleanedData:pd.DataFrame,eventInfo:pd.DataFrame):
        self._mergedData = self.mergeData(cleanedData,eventInfo)

    @property
    def mergedData(self):
        return self._mergedData

    def mergeData(self,cleanedData:pd.DataFrame,eventInfo:pd.DataFrame):
        eventInfo = self.explodeEventInfo(eventInfo)

        merged = eventInfo.merge(cleanedData,on=['time_stamp','asset_id'],how='right')
        merged.event_label = merged.event_label.fillna('normal')
        merged.event_description = merged.event_description.fillna('unspecified')
        merged.sort_values('asset_id time_stamp'.split(),inplace=True,ignore_index=True)        

        return merged
    
    def explodeEventInfo(self,eventInfo:pd.DataFrame)->pd.DataFrame:
        eventInfo = (
            eventInfo.drop(columns=['event_id','event_start_id','event_end_id'])
            .assign(event_start = lambda df: pd.to_datetime(df['event_start'],format='%d.%m.%Y %H:%M'),
                    event_end = lambda df: pd.to_datetime(df['event_end'],format='%d.%m.%Y %H:%M'),
                    event_range = lambda df: df.apply(lambda x: pd.date_range(x['event_start'],x['event_end'],freq='10min'),axis=1)
                )
            .drop(columns=['event_start','event_end'])
            .explode('event_range')
            .rename(columns={'event_range':'time_stamp','asset':'asset_id'})
            .drop_duplicates(subset='time_stamp asset_id'.split())
        )
        return eventInfo        

In [66]:
direc = '../../data/Care_To_Compare/Wind Farm {farm}'
datasets = os.path.join(direc,'datasets')
csv = os.path.join(datasets,'{number}.csv')

def read_csv(farm:str, number:int)->pd.DataFrame:
    return (
        pl.read_csv(
            csv.format(farm=farm, number=number),
            separator=';'
            ).to_pandas()
        .assign(farm = farm, fileNumber = number)
        .pipe(lambda df: df[['farm','fileNumber'] + df.columns.drop(['farm','fileNumber']).tolist()])
    )

def read_and_concat(farm:str)->pd.DataFrame:
    results = []
    for dataset in os.listdir(datasets.format(farm=farm)):
        number = int(dataset.split('.')[0])
        results.append(read_csv(farm, number))
        print('read file number:', number)
        
    return pd.concat(results,ignore_index=True)

def get_eventInfo(farm:str)->pd.DataFrame:
    return pd.read_csv(os.path.join(direc.format(farm=farm),'event_info.csv'),sep=';')

def get_featureDescription(farm:str)->pd.DataFrame:
    return pd.read_csv(os.path.join(direc.format(farm=farm),'feature_description.csv'),sep=';')



merged_dfs = []
for farm in ['A']:

    df = read_and_concat(farm)
    featureDesc = get_featureDescription(farm)
    eventInfo = get_eventInfo(farm)
    print('read csvs')

    preProcesor= PreProcessing(df,featureDesc)
    print('preprocessed')
    merger = MergeData(preProcesor.cleanedData,eventInfo)
    print('merged')

    mergedData = merger.mergedData
    merged_dfs.append(mergedData.copy())

    del df,featureDesc,eventInfo,preProcesor,merger


read file number: 0
read file number: 10
read file number: 13
read file number: 14
read file number: 17
read file number: 22
read file number: 24
read file number: 25
read file number: 26
read file number: 3
read file number: 38
read file number: 40
read file number: 42
read file number: 45
read file number: 51
read file number: 68
read file number: 69
read file number: 71
read file number: 72
read file number: 73
read file number: 84
read file number: 92
read csvs
preprocessed
merged


In [67]:
A = merged_dfs[0]

A


Unnamed: 0,asset_id,event_label,event_description,time_stamp,farm,fileNumber,status_type_id,ambientTemperature,windAbsoluteDirection,windRelativeDirection,windspeed,estimatedWindspeed,pitchAngle,temperatureInTheHubController,temperatureInTheTopNacelleController,temperatureInTheChokeCoilsOnTheVcsSection,temperatureOnTheVcpBoard,temperatureInTheVcsCoolingWater,temperatureInGearboxBearingOnHighSpeedShaft,temperatureOilInGearbox,temperatureInGeneratorBearing2DriveEnd,temperatureInGeneratorBearing1NonDriveEnd,temperatureInsideGeneratorInStatorWindingsPhase1,temperatureInsideGeneratorInStatorWindingsPhase2,temperatureInsideGeneratorInStatorWindingsPhase3,generatorRpmInLatestPeriod,temperatureInTheSplitRingChamber,temperatureInTheBusbarSection,temperatureMeasuredByTheIgbtDriverOnTheGridSideInverter,actualPhaseDisplacement,averagedCurrentInPhase1,averagedCurrentInPhase2,averagedCurrentInPhase3,gridFrequency,possibleGridCapacitiveReactivePower,possibleGridInductiveReactivePower,possibleGridActivePower,gridPower,gridReactivePower,averagedVoltageInPhase1,averagedVoltageInPhase2,averagedVoltageInPhase3,temperatureMeasuredByTheIgbtDriverOnTheRotorSideInverterPhase1,temperatureMeasuredByTheIgbtDriverOnTheRotorSideInverterPhase2,temperatureMeasuredByTheIgbtDriverOnTheRotorSideInverterPhase3,temperatureInHvTransformerPhaseL1,temperatureInHvTransformerPhaseL2,temperatureInHvTransformerPhaseL3,temperatureOilInHydraulicGroup,nacelleDirection,nacelleTemperature,activePowerGeneratorDisconnected,activePowerGeneratorConnectedInDelta,activePowerGeneratorConnectedInStar,reactivePowerGeneratorDisconnected,reactivePowerGeneratorConnectedInDelta,reactivePowerGeneratorConnectedInStar,totalActivePower,totalReactivePower,rotorRpm,temperatureInTheNoseCone
0,0,normal,unspecified,2022-01-01 00:00:00,A,71,0,18.0,178.7,-18.6,4.1,4.4,-0.3,28.0,38.0,91.0,39.0,38.0,49.0,45.0,39.0,41.0,62.0,62.0,61.0,1254.9,26.0,36.0,39.0,0.8,107.8,134.9,112.1,50.0,0.261707,-0.261707,0.055268,0.054976,-0.034976,402.2,399.9,398.7,39.0,39.0,41.0,71.0,75.0,75.0,32.0,197.3,27.0,0.0,18831.0,0.0,0.0,-11991.0,0.0,18831.0,-11991.0,11.1,20.0
1,0,normal,unspecified,2022-01-01 00:10:00,A,71,0,18.0,191.8,-12.2,4.1,4.3,-0.3,28.0,38.0,92.0,39.0,39.0,49.0,45.0,39.0,42.0,62.0,62.0,62.0,1251.9,26.0,36.0,39.0,0.7,101.1,130.0,102.4,50.0,0.229073,-0.229073,0.046732,0.046585,-0.042195,401.9,399.8,398.4,39.0,39.0,41.0,71.0,75.0,75.0,32.0,203.9,27.0,0.0,15865.0,0.0,0.0,-14360.0,0.0,15865.0,-14360.0,11.1,20.0
2,0,normal,unspecified,2022-01-01 00:20:00,A,71,0,18.0,213.8,16.8,4.1,4.4,-0.3,27.0,38.0,92.0,39.0,39.0,49.0,45.0,39.0,42.0,62.0,62.0,62.0,1251.1,26.0,36.0,39.0,0.7,109.8,139.8,111.7,50.0,0.245610,-0.245610,0.050390,0.050537,-0.045415,401.0,398.9,397.5,39.0,39.0,41.0,71.0,75.0,75.0,32.0,197.0,27.0,0.0,17244.0,0.0,0.0,-15524.0,0.0,17244.0,-15524.0,11.1,20.0
3,0,normal,unspecified,2022-01-01 00:30:00,A,71,0,18.0,199.3,-4.6,4.4,4.6,-0.5,27.0,38.0,92.0,39.0,38.0,49.0,45.0,39.0,42.0,63.0,62.0,62.0,1259.9,26.0,36.0,39.0,0.7,132.7,163.4,136.3,50.0,0.288878,-0.288878,0.065707,0.065512,-0.048146,400.6,398.5,397.2,39.0,39.0,41.0,71.0,75.0,75.0,32.0,204.0,27.0,0.0,22284.0,0.0,0.0,-16444.0,0.0,22284.0,-16444.0,11.2,20.0
4,0,normal,unspecified,2022-01-01 00:40:00,A,71,0,18.0,199.9,-4.0,5.5,5.7,-1.5,27.0,38.0,93.0,39.0,38.0,50.0,46.0,40.0,42.0,63.0,63.0,63.0,1288.2,27.0,36.0,39.0,0.9,242.7,278.8,262.4,50.0,0.486390,-0.486390,0.145268,0.145122,-0.047854,400.2,398.2,396.9,39.0,39.0,41.0,71.0,75.0,75.0,32.0,203.9,27.0,0.0,49587.0,0.0,0.0,-16353.0,0.0,49587.0,-16353.0,11.4,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425200,21,normal,unspecified,2023-10-21 08:00:00,A,72,0,19.0,352.3,5.0,2.0,2.0,24.0,28.0,33.0,30.0,33.0,31.0,37.0,38.0,31.0,29.0,33.0,32.0,33.0,151.5,24.0,28.0,32.0,0.7,3.6,6.7,6.5,50.0,0.000000,0.000000,0.000000,-0.002195,-0.002244,400.0,398.8,396.9,32.0,30.0,30.0,42.0,46.0,42.0,35.0,347.2,27.0,-750.0,0.0,0.0,-757.0,0.0,0.0,-750.0,-757.0,0.4,21.0
425201,21,normal,unspecified,2023-10-21 08:10:00,A,72,0,20.0,339.7,-7.6,1.6,1.6,24.0,28.0,33.0,30.0,33.0,31.0,37.0,38.0,31.0,29.0,33.0,32.0,33.0,126.2,23.0,28.0,32.0,0.7,3.4,6.5,6.0,50.0,0.000000,0.000000,0.000000,-0.002098,-0.002049,399.2,397.9,395.9,32.0,30.0,30.0,42.0,46.0,42.0,35.0,347.2,27.0,-715.0,0.0,0.0,-694.0,0.0,0.0,-715.0,-694.0,0.0,21.0
425202,21,normal,unspecified,2023-10-21 08:20:00,A,72,0,20.0,348.8,1.5,1.8,1.8,24.0,28.0,33.0,29.0,33.0,31.0,37.0,38.0,30.0,28.0,32.0,32.0,33.0,140.6,24.0,28.0,32.0,0.7,3.7,6.8,6.7,50.0,0.000000,0.000000,0.000000,-0.002293,-0.002244,399.1,398.0,395.9,32.0,30.0,30.0,42.0,46.0,42.0,35.0,347.2,27.0,-767.0,0.0,0.0,-765.0,0.0,0.0,-767.0,-765.0,0.0,21.0
425203,21,normal,unspecified,2023-10-21 08:30:00,A,72,0,20.0,356.8,9.5,1.4,1.4,24.0,27.0,33.0,29.0,33.0,31.0,37.0,38.0,30.0,28.0,32.0,32.0,33.0,99.2,23.0,28.0,32.0,0.7,3.3,6.5,5.6,50.0,0.000000,0.000000,0.000000,-0.002000,-0.001902,398.8,397.8,395.8,32.0,30.0,30.0,42.0,46.0,42.0,35.0,347.2,27.0,-693.0,0.0,0.0,-647.0,0.0,0.0,-693.0,-647.0,0.0,21.0


In [68]:
import numpy as np

target = ['event_label']
id_columns = 'asset_id time_stamp event_description farm fileNumber status_type_id'.split()
featureColumns = A.columns.drop(id_columns + target).tolist()

corr = A[featureColumns].corr()

zeroVarianceCols = corr[corr.isna().all(axis = 1)].index
corr.drop(index = zeroVarianceCols,inplace=True)
corr.drop(columns = zeroVarianceCols,inplace=True)

upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype('bool')).abs()
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
to_drop = to_drop + zeroVarianceCols.tolist()

del corr, upper

In [69]:
trainingData = A[target + featureColumns].copy()
trainingData.drop(columns = to_drop,inplace=True)

(
    trainingData.drop(columns = 'event_label')
    .corr().style.background_gradient(cmap='coolwarm')
)

Unnamed: 0,ambientTemperature,windAbsoluteDirection,windRelativeDirection,windspeed,pitchAngle,actualPhaseDisplacement,gridFrequency,averagedVoltageInPhase1,temperatureInHvTransformerPhaseL1,temperatureOilInHydraulicGroup
ambientTemperature,1.0,-0.133496,0.005713,0.179277,-0.033624,0.042747,-0.000277,-6.2e-05,0.296008,0.502496
windAbsoluteDirection,-0.133496,1.0,-0.008681,-0.320275,0.019505,-0.213103,0.00081,-0.062251,-0.228037,-0.155717
windRelativeDirection,0.005713,-0.008681,1.0,-0.011243,0.006549,-0.004586,-0.001295,0.003543,0.000864,0.007065
windspeed,0.179277,-0.320275,-0.011243,1.0,-0.240017,0.574772,-0.003987,0.124655,0.609259,0.361168
pitchAngle,-0.033624,0.019505,0.006549,-0.240017,1.0,-0.286069,-0.007707,0.155993,-0.160502,-0.031195
actualPhaseDisplacement,0.042747,-0.213103,-0.004586,0.574772,-0.286069,1.0,-0.000896,-0.001508,0.331765,-0.013471
gridFrequency,-0.000277,0.00081,-0.001295,-0.003987,-0.007707,-0.000896,1.0,0.333084,0.000485,-0.000879
averagedVoltageInPhase1,-6.2e-05,-0.062251,0.003543,0.124655,0.155993,-0.001508,0.333084,1.0,0.258892,0.192634
temperatureInHvTransformerPhaseL1,0.296008,-0.228037,0.000864,0.609259,-0.160502,0.331765,0.000485,0.258892,1.0,0.546383
temperatureOilInHydraulicGroup,0.502496,-0.155717,0.007065,0.361168,-0.031195,-0.013471,-0.000879,0.192634,0.546383,1.0


In [70]:
trainingData['asset_id'] = A['asset_id']
trainingData['time_stamp'] = A['time_stamp']
trainingData

Unnamed: 0,event_label,ambientTemperature,windAbsoluteDirection,windRelativeDirection,windspeed,pitchAngle,actualPhaseDisplacement,gridFrequency,averagedVoltageInPhase1,temperatureInHvTransformerPhaseL1,temperatureOilInHydraulicGroup,asset_id,time_stamp
0,normal,18.0,178.7,-18.6,4.1,-0.3,0.8,50.0,402.2,71.0,32.0,0,2022-01-01 00:00:00
1,normal,18.0,191.8,-12.2,4.1,-0.3,0.7,50.0,401.9,71.0,32.0,0,2022-01-01 00:10:00
2,normal,18.0,213.8,16.8,4.1,-0.3,0.7,50.0,401.0,71.0,32.0,0,2022-01-01 00:20:00
3,normal,18.0,199.3,-4.6,4.4,-0.5,0.7,50.0,400.6,71.0,32.0,0,2022-01-01 00:30:00
4,normal,18.0,199.9,-4.0,5.5,-1.5,0.9,50.0,400.2,71.0,32.0,0,2022-01-01 00:40:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
425200,normal,19.0,352.3,5.0,2.0,24.0,0.7,50.0,400.0,42.0,35.0,21,2023-10-21 08:00:00
425201,normal,20.0,339.7,-7.6,1.6,24.0,0.7,50.0,399.2,42.0,35.0,21,2023-10-21 08:10:00
425202,normal,20.0,348.8,1.5,1.8,24.0,0.7,50.0,399.1,42.0,35.0,21,2023-10-21 08:20:00
425203,normal,20.0,356.8,9.5,1.4,24.0,0.7,50.0,398.8,42.0,35.0,21,2023-10-21 08:30:00


In [71]:
trainingData.event_label = trainingData.event_label.apply(lambda x: 1 if x=='normal' else 0)

In [72]:
#need to filter the data somehow becasue it's too large

trainingData = (
trainingData.groupby('asset_id event_label'.split()).agg(
    time_stamp_max = ('time_stamp',lambda s: s.max() + pd.Timedelta('3days')),
    time_stamp_min = ('time_stamp',lambda s: s.min() - pd.Timedelta('3days'))
    )
    .reset_index()
    .query('event_label==0')
    .drop(columns = 'event_label')
    .merge(trainingData,on='asset_id'.split(),how='left')
    .query('time_stamp>=time_stamp_min')
    .query('time_stamp<=time_stamp_max')
    .drop(columns = 'time_stamp_max time_stamp_min'.split())
    # .plot(x = 'time_stamp',y = 'event_label',facet_row = 'asset_id',kind = 'scatter')
    # .event_label.value_counts(normalize=True)
)
# trainingData
trainingData = trainingData.drop(columns = trainingData.columns[trainingData.isna().any()])

In [73]:
trainingData.to_csv('trainingData.csv',index=False)

In [23]:
import torch
import torch.nn as nn

class CNN_LSTM(nn.Module):
    def __init__(self, num_layers, hidden_size, num_classes):
        super(CNN_LSTM, self).__init__()
        
        # # 1D Convolutional layer for feature extraction
        # self.conv1 = nn.Conv1d(in_channels=num_features, out_channels=64, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        # self.pool = nn.MaxPool1d(kernel_size=2)
        
        # LSTM for sequence modeling
        self.lstm = nn.LSTM(input_size=9, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x = x.permute(0, 2, 1)  # Change shape to (batch, features, time)
        # x = self.pool(self.relu(self.conv1(x)))  
        # x = x.permute(0, 2, 1)  # Back to (batch, time, features) for LSTM
        x, _ = self.lstm(x)  
        x = self.fc(x[:, -1, :])  # Take the last time step's output
        return self.relu(x)


In [24]:
import pandas as pd
trainingData = pd.read_csv('trainingData.csv')

In [25]:
from torch.utils.data import Dataset, DataLoader
import numpy as np

def create_sequences(df, turbine_col, target_col, seq_length=5):
    sequences, labels = [], []
    for _, group in df.groupby(turbine_col):
        group = group.sort_values("time_stamp",ignore_index=True)
        values = group.drop(columns=[turbine_col, target_col,'time_stamp']).values
        targets = group[target_col].values
        for i in range(len(values) - seq_length):
            sequences.append(values[i : i + seq_length])
            labels.append(targets[i + seq_length])
    return np.array(sequences), np.array(labels)


# Create sequences
X, y = create_sequences(trainingData, "asset_id", "event_label")

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_tensor = scaler.fit_transform(X_tensor.reshape(-1, X_tensor.shape[-1])).reshape(X_tensor.shape)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)

# Dataloader
dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=10_000, shuffle=True)

# Model, loss, and optimizer
model = CNN_LSTM(num_layers=X.shape[2], hidden_size=128, num_classes=1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train model
for epoch in range(10):
    for batch_X, batch_y in dataloader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X).reshape(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 0.8213
Epoch 2, Loss: 0.8166
Epoch 3, Loss: 0.8246
Epoch 4, Loss: 0.8260


KeyboardInterrupt: 