In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.subplots as sp

In [None]:
# Load Data
data = pd.read_csv('DataImpurityGAN.csv')
# Drop First Column
data = data.drop(columns='Primary ID')

In [None]:
# Drop NAN
data = data.dropna()

In [None]:
df = data

In [None]:
# Layout settings shared by every interactive per-column scatter plot
layout_options = dict(
    xaxis_title="Time",
    yaxis_title="Value",
    autosize=False,
    width=1000,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    paper_bgcolor="LightSteelBlue",
)

# One figure per column: values plotted against the DataFrame index
for column in df:
    scatter = go.Scatter(x=df.index, y=df[column], mode='markers')
    fig = go.Figure(data=scatter)
    fig.update_layout(title=column, **layout_options)
    fig.show()

In [None]:
# Static version of the per-column scatter plots (values against the index)
for column in df:
    fig, ax = plt.subplots(figsize=(10, 6))  # larger than the default canvas
    ax.scatter(df.index, df[column], s=10)  # s=10 keeps the markers small
    ax.set(title=column, xlabel="Time", ylabel="Value")
    plt.show()

# Here we split the two classes into different dataframes

In [None]:
# Partition the cleaned data by the value found in the "Class" column
grouped_data = data.groupby('Class')

# Map every class label to an independent copy of its rows
dfs = {class_: group.copy() for class_, group in grouped_data}

# Pull out the two classes that the rest of the notebook works with
df_class_1, df_class_2 = dfs[1], dfs[2]


In [None]:
# Now that we have two different dataframes we can drop class column
df_class_1 = df_class_1.drop(columns= 'Class')
df_class_2 = df_class_2.drop(columns= 'Class')

In [None]:
df_class_1.shape, df_class_2.shape

In [None]:
# Column Names saved for later when we remake DataFrames
column_names = df_class_2.columns
# column_names

In [None]:
# Here we Scale the data
# Create an instance of StandardScaler
scaler1 = StandardScaler()
scaler2 = StandardScaler()

# Fit the scaler to data
scaler1.fit(df_class_2)
scaler2.fit(df_class_2)

# Transform the data to Standard scale
target_1 = scaler1.transform(df_class_1)
target_2 = scaler2.transform(df_class_2)
target_2.shape, target_1.shape

In [None]:
# Specify length of training data first we train the class 2 since it has more data
num_data = 5000
target_1 = pd.DataFrame(target_2, columns = column_names).iloc[:(num_data-3000)]
target_2 = pd.DataFrame(target_2, columns = column_names).iloc[:num_data]
target_2.shape, target_1.shape

# Parameters

In [None]:
# Feature dimension: the network reads and emits one value per data column
input_size = output_size = df_class_2.shape[1]
# Width of the embedding output and of every transformer layer
# (must be an even number and divisible by num_heads)
hidden_size = 128
# Number of past time steps in each input window
seq_length = 10
# Number of future time steps in each target window
output_steps = 10
# How many transformer layers are stacked
num_layers = 5
# Attention heads used by the multi-head attention of each layer
num_heads = 8
# Dropout rate (regularization)
dropout = 0.1
# Number of windows per training batch
batch_size = 32
# Learning rate
lr = 1e-4

In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window view over a table of time-ordered rows.

    Item ``i`` is a pair ``(history, future)`` where ``history`` holds rows
    ``i .. i+seq_length-1`` and ``future`` holds the ``output_steps`` rows
    that immediately follow.

    Args:
        data: object exposing ``.values`` (e.g. a pandas DataFrame); its
            contents are copied into a float32 tensor.
        seq_length: number of past rows in each input window.
        output_steps: number of future rows in each target window.
    """

    def __init__(self, data, seq_length, output_steps):
        # Transform data to tensors
        self.data = torch.tensor(data.values, dtype=torch.float32)
        # Past time steps
        self.seq_length = seq_length
        # Future time steps
        self.output_steps = output_steps

    def __len__(self):
        # Clamp at zero: a negative length makes len() raise ValueError when
        # the data is shorter than one full (history, future) pair.
        return max(0, len(self.data) - self.seq_length - self.output_steps + 1)

    def __getitem__(self, index):
        """Return the ``(history, future)`` tensors for window ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
                Without this check slicing would silently return truncated
                windows and plain iteration over the dataset would never end.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError("window index out of range")

        split = index + self.seq_length
        return (self.data[index:split],
                self.data[split:split + self.output_steps])

In [None]:
# Wrap the class 2 training frame in sliding windows and batch them in order
dataset = TimeSeriesDataset(data=target_2, seq_length=seq_length, output_steps=output_steps)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

In [None]:
class Transformer(nn.Module):
    """Encoder-only transformer that maps a window of feature vectors to a
    window of the same length.

    Tensors are batch-first: ``forward`` takes ``(batch, seq_len, input_size)``
    and returns ``(batch, seq_len, output_size)``.

    Args:
        input_size: number of features per input time step.
        output_size: number of features per output time step.
        num_layers: number of stacked encoder layers.
        hidden_size: embedding width; also used as the feed-forward width of
            each encoder layer. Must be divisible by ``num_heads``.
        num_heads: attention heads per encoder layer.
        dropout: dropout probability for positional encoding and encoder.
        seq_length: stored on the instance; not used by ``forward``.
        n_output_steps: stored on the instance; not used by ``forward``.
    """

    def __init__(self, input_size, output_size, num_layers, hidden_size, num_heads, dropout, seq_length, n_output_steps):
        super(Transformer, self).__init__()

        self.seq_length = seq_length
        self.n_output_steps = n_output_steps

        self.embedding = nn.Linear(input_size, hidden_size)
        self.pos_encoder = PositionalEncoding(hidden_size, dropout)
        # batch_first=True: the DataLoader yields (batch, seq, feature). With
        # the default (False) attention would run across the samples of a
        # batch instead of across the time steps of each window.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

        self.init_weights()

    def init_weights(self):
        """Uniformly initialize embedding/decoder weights in [-0.1, 0.1] and zero the decoder bias."""
        init_range = 0.1
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-init_range, init_range)

    def forward(self, input):
        """Run embedding -> positional encoding -> encoder -> linear head.

        Args:
            input: float tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, output_size)``.
        """
        hidden = self.embedding(input)  # Embedding
        hidden = self.pos_encoder(hidden)  # Positional encoding
        hidden = self.encoder(hidden)  # Encoding
        return self.decoder(hidden)  # Linear layer


# Define the positional encoding module. Allows the transformer to know the unique position of the data
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position codes to a batch-first sequence.

    Args:
        d_model: size of the last dimension of the input.
        dropout: dropout probability applied after adding the codes.
        max_len: longest sequence length supported.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Table with one row per time position and one column per model dimension
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Shape (1, max_len, d_model) so it broadcasts over the batch dimension
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the position codes to ``x`` of shape ``(batch, seq_len, d_model)``."""
        # Index by sequence length (dim 1), not by batch size (dim 0)
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
# Build the class 2 forecaster from the hyperparameters defined above
model = Transformer(
    input_size=input_size,
    output_size=output_size,
    num_layers=num_layers,
    hidden_size=hidden_size,
    num_heads=num_heads,
    dropout=dropout,
    seq_length=seq_length,
    n_output_steps=output_steps,
)

# Mean-squared-error objective, minimized with Adam
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# Training loop
num_epochs = 200  # Change this if necessary
model.train()  # Set the model to training mode

# One entry per epoch: the mean of that epoch's per-batch losses
losses = []

for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0

    for inputs, targets in dataloader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Fail with a clear message instead of a NameError on `loss`
    if num_batches == 0:
        raise ValueError(
            "dataloader yielded no batches; the dataset needs at least "
            "seq_length + output_steps rows"
        )

    # Record the epoch average rather than only the final batch, so the loss
    # curve reflects the whole pass over the data
    epoch_loss = running_loss / num_batches
    losses.append(epoch_loss)

    if epoch % 10 == 0:  # Report every 10th epoch
        print (f'Epoch [{epoch}/{num_epochs}],  Loss: {epoch_loss:.4f}')


In [None]:
def plot_losses(losses):
    plt.figure(figsize=(10,5))
    plt.plot(losses)
    plt.title("Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

In [None]:
plot_losses(losses)

In [None]:
def forecast(model, data, seq_length, output_steps):
    """Slide a window over ``data`` and collect the model output for each window.

    Args:
        model: callable module taking a ``(1, seq_length, features)`` tensor
            and returning ``(1, steps, features)`` with ``steps >= output_steps``.
        data: ``(time, features)`` tensor or array-like; non-tensors are
            converted to a float32 tensor.
        seq_length: number of rows in each input window.
        output_steps: number of trailing output steps kept per window.

    Returns:
        Tensor of shape ``(max(len(data) - seq_length, 0), output_steps, features)``
        whose entry ``i`` holds the last ``output_steps`` model outputs for
        the window starting at row ``i``.

    Raises:
        ValueError: if the model emits fewer than ``output_steps`` steps.
    """
    # Ensure the model is in evaluation mode (disables dropout)
    model.eval()

    # Make sure data is a torch Tensor
    if not isinstance(data, torch.Tensor):
        data = torch.as_tensor(data, dtype=torch.float32)

    # Number of forecasts; clamped so too-short input yields an empty result
    # instead of an error from a negative tensor size
    num_forecasts = max(len(data) - seq_length, 0)

    # Container for predictions
    predictions = torch.zeros(num_forecasts, output_steps, data.shape[-1])

    # Gradients are never needed here, so disable them once for the whole loop
    with torch.no_grad():
        for i in range(num_forecasts):
            # Current window with an extra leading batch dimension
            window = data[i:i + seq_length].unsqueeze(0)
            out = model(window)

            steps = out.shape[1]
            if steps < output_steps:
                raise ValueError(
                    f"model produced {steps} steps but output_steps={output_steps}"
                )

            # Keep the trailing output_steps steps; when the model emits
            # exactly output_steps steps this is the full output
            predictions[i] = out[0, steps - output_steps:]

    return predictions


In [None]:
# This is data used for predicting the next sequence
pred_data = pd.DataFrame(target_2, columns = column_names).iloc[3000:num_data]
pred_data.shape

In [None]:
input_data = torch.tensor(pred_data.values, dtype=torch.float32)

In [None]:
predictions = forecast(model, input_data, seq_length, output_steps)
predictions.shape

In [None]:
predictions

In [None]:
# Inverse scale the prediciton
predictions = scaler2.inverse_transform(predictions[:, -1, :])

In [None]:
test_data = pd.DataFrame(df_class_2, columns = column_names).iloc[num_data:]
test_data.shape

In [None]:
test_array_np = test_data.values

In [None]:
predictions.shape[0]

In [None]:
# One figure per feature, predictions overlaid on the actual test values.
# Iterating over column_names (instead of a hard-coded count of 44) keeps the
# loop correct for any number of features.
for i, column in enumerate(column_names):
    plt.figure(figsize=(12, 6))

    # Plot the predicted data
    plt.plot(predictions[:, i], label='Predictions')

    # Plot the actual test data
    plt.plot(test_array_np[:, i], label='Actual')

    plt.title(f'{column}')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.legend()
    plt.show()


In [None]:
# NumPy copy of the held-out class 2 rows
test_array_np = test_data.to_numpy()

# Fit a 2-component PCA on the test rows, then project the predictions with it
pca = PCA(n_components=2)
test_pca = pca.fit_transform(test_array_np)
predictions_pca = pca.transform(predictions)

# Overlay both projections on the same axes (test data first)
for points, label in ((test_pca, 'Test data'), (predictions_pca, 'Predictions')):
    plt.scatter(points[:, 0], points[:, 1], label=label)
plt.title('Class 2 PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


In [None]:
# Convert pandas DataFrame to numpy array
test_array_np = test_data.values

# t-SNE has no transform(): two separate fit_transform calls produce
# embeddings in unrelated coordinate systems that cannot be compared on one
# plot. Embed test data and predictions together, then split the result.
tsne = TSNE(n_components=2, random_state=0)
combined_tsne = tsne.fit_transform(np.vstack([test_array_np, predictions]))
n_test = len(test_array_np)
test_tsne = combined_tsne[:n_test]
predictions_tsne = combined_tsne[n_test:]

# Plot test data
plt.scatter(test_tsne[:, 0], test_tsne[:, 1], label='Test data')

# Plot prediction data
plt.scatter(predictions_tsne[:, 0], predictions_tsne[:, 1], label='Predictions')
plt.title('Class 2 tSNE')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.show()


# Class 1

In [None]:
# instantiate the dataset and dataloader
dataset = TimeSeriesDataset(target_1, seq_length, output_steps)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Instantiate the model
model = Transformer(input_size, output_size, num_layers, hidden_size, num_heads, dropout, seq_length, output_steps)

# Loss and optimizer
criterion = nn.MSELoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=lr) 

In [None]:
# Training loop
num_epochs = 200  # Change this if necessary
model.train()  # Set the model to training mode

# Initialize a list to hold the losses
losses = []

for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(dataloader):
        inputs = inputs
        targets = targets

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Add the loss to the list
    losses.append(loss.item())

    if (epoch) % 10 == 0:  
        print (f'Epoch [{epoch}/{num_epochs}],  Loss: {loss.item():.4f}')

In [None]:
plot_losses(losses)

In [None]:
# This is data used for predicting the next sequence
pred_data_1 = pd.DataFrame(target_1, columns = column_names).iloc[1300:2000]
pred_data_1.shape

In [None]:
test_data_1 = pd.DataFrame(df_class_1, columns = column_names).iloc[2000::]
test_data_1.shape

In [None]:
input_data_1 = torch.tensor(pred_data_1.values, dtype=torch.float32)

In [None]:
predictions_1 = forecast(model, input_data_1, seq_length, output_steps)
predictions_1.shape

In [None]:
predictions_1 = scaler1.inverse_transform(predictions_1[:, -1, :])

In [None]:
test_array_np_1 = test_data_1.values

In [None]:
# One figure per feature, predictions overlaid on the actual test values.
# Iterating over column_names (instead of a hard-coded count of 44) keeps the
# loop correct for any number of features.
for i, column in enumerate(column_names):
    plt.figure(figsize=(12, 6))

    # Plot the predicted data
    plt.plot(predictions_1[:, i], label='Predictions')

    # Plot the actual test data
    plt.plot(test_array_np_1[:, i], label='Actual')

    plt.title(f'{column}')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

In [None]:
# Fit a 2-component PCA on the class 1 test rows and reuse it for the predictions
pca = PCA(n_components=2)
test_pca = pca.fit_transform(test_array_np_1)
predictions_pca = pca.transform(predictions_1)

# Test data first, predictions on top
test_x, test_y = test_pca[:, 0], test_pca[:, 1]
pred_x, pred_y = predictions_pca[:, 0], predictions_pca[:, 1]
plt.scatter(test_x, test_y, label='Test data')
plt.scatter(pred_x, pred_y, label='Predictions')
plt.title('Class 1 PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


In [None]:
# t-SNE has no transform(): two separate fit_transform calls produce
# embeddings in unrelated coordinate systems that cannot be compared on one
# plot. Embed test data and predictions together, then split the result.
tsne = TSNE(n_components=2, random_state=0)
combined_tsne = tsne.fit_transform(np.vstack([test_array_np_1, predictions_1]))
n_test = len(test_array_np_1)
test_tsne = combined_tsne[:n_test]
predictions_tsne = combined_tsne[n_test:]

# Plot test data
plt.scatter(test_tsne[:, 0], test_tsne[:, 1], label='Test data')

# Plot prediction data
plt.scatter(predictions_tsne[:, 0], predictions_tsne[:, 1], label='Predictions')
plt.title('Class 1 tSNE')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.show()


In [None]:
# Concatenate the generated data
generated_predictions = np.concatenate((predictions, predictions_1), axis=0)
generated_predictions.shape

In [None]:
# Concatenate Test Data
test_contcatenate = np.concatenate((test_array_np, test_array_np_1), axis=0)
test_contcatenate.shape

In [None]:
# Fit a 2-component PCA on the combined test rows and reuse it for all predictions
pca = PCA(n_components=2)
test_pca = pca.fit_transform(test_contcatenate)
predictions_pca = pca.transform(generated_predictions)

# Test data first, predictions on top
plt.scatter(*test_pca[:, :2].T, label='Test data')
plt.scatter(*predictions_pca[:, :2].T, label='Predictions')
plt.title('Overall PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

In [None]:

# t-SNE has no transform(): two separate fit_transform calls produce
# embeddings in unrelated coordinate systems that cannot be compared on one
# plot. Embed test data and predictions together, then split the result.
tsne = TSNE(n_components=2, random_state=0)
combined_tsne = tsne.fit_transform(np.vstack([test_contcatenate, generated_predictions]))
n_test = len(test_contcatenate)
test_tsne = combined_tsne[:n_test]
predictions_tsne = combined_tsne[n_test:]

# Plot test data
plt.scatter(test_tsne[:, 0], test_tsne[:, 1], label='Test data')

# Plot prediction data
plt.scatter(predictions_tsne[:, 0], predictions_tsne[:, 1], label='Predictions')
plt.title('Overall tSNE')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.show()