<a href="https://colab.research.google.com/github/AnanyaTayalSC/Drafts/blob/main/3layersdraft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.cluster import KMeans

# =====================================
# 1. Load CSV and build tensor
# =====================================
from google.colab import drive
drive.mount('/content/drive')

csv_file_path = '/content/drive/MyDrive/Colab Notebooks/Thesis/Imp.csv'
df = pd.read_csv(csv_file_path)  # columns: sample_id, time, var1, var2, var3

# Ensure sorting (important!)
df = df.sort_values(by=["sample_id", "time"])

# Pivot to wide format: each (sample_id, time) → row
#X = df.pivot(index=["sample_id", "time"], columns=None, values=["var1","var2","var3"]).reset_index()

# Group by sample_id and collect sequences
X_list = []
for sid, group in df.groupby("sample_id"):
    values = group[["var1","var2","var3"]].values  # shape (T, 3)
    X_list.append(values)

X = np.stack(X_list)  # final shape (N, T, 3)
print("Data shape:", X.shape)  # e.g. (1394, 100, 3)

# Convert to PyTorch tensor
X_tensor = torch.tensor(X, dtype=torch.float32)


# =====================================
# 2. Define LSTM Autoencoder
# =====================================
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim=3, hidden_dim=128, latent_dim=64, num_layers=1):
        super(LSTMAutoencoder, self).__init__()

        # Encoder
        self.encoder_lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, latent_dim)

        # Decoder
        self.decoder_lstm = nn.LSTM(latent_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        # Encoder
        _, (h, _) = self.encoder_lstm(x)  # h: (num_layers, batch, hidden_dim)
        h = h[-1]                         # (batch, hidden_dim)
        z = self.fc(h)                    # (batch, latent_dim)

        # Decoder (repeat latent vector across all timesteps)
        z_repeated = z.unsqueeze(1).repeat(1, x.size(1), 1)  # (batch, T, latent_dim)
        decoded, _ = self.decoder_lstm(z_repeated)
        out = self.output_layer(decoded)   # (batch, T, 3)

        return out, z


# =====================================
# 3. Train Autoencoder
# =====================================
model = LSTMAutoencoder(input_dim=3, hidden_dim=128, latent_dim=64)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Put into batches (for large datasets)
batch_size = 32
num_epochs = 50

dataset = torch.utils.data.TensorDataset(X_tensor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        x_batch = batch[0]
        optimizer.zero_grad()
        reconstructed, _ = model(x_batch)
        loss = criterion(reconstructed, x_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")


# =====================================
# 4. Extract embeddings
# =====================================
with torch.no_grad():
    _, embeddings = model(X_tensor)  # (N, latent_dim)
    embeddings = embeddings.numpy()

print("Embeddings shape:", embeddings.shape)  # (1394, latent_dim)


# =====================================
# 5. Clustering
# =====================================
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(embeddings)

print("Cluster assignments:", labels[:20])  # first 20 samples


Mounted at /content/drive
Data shape: (1395, 501, 3)
Epoch 1/50, Loss: 174.1758
Epoch 2/50, Loss: 172.2973
Epoch 3/50, Loss: 172.9182
Epoch 4/50, Loss: 173.4011
Epoch 5/50, Loss: 170.6591
Epoch 6/50, Loss: 167.8648
Epoch 7/50, Loss: 167.1109
Epoch 8/50, Loss: 167.0874


KeyboardInterrupt: 

In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras import layers, models

# =====================================
# 1. Load CSV and build tensor
# =====================================
csv_file_path = '/content/drive/MyDrive/Colab Notebooks/Thesis/Imp.csv'
df = pd.read_csv(csv_file_path)  # columns: sample_id, time, var1, var2, var3

# Ensure sorting (important!)
df = df.sort_values(by=["sample_id", "time"])

# Group by sample_id and collect sequences
X_list = []
for sid, group in df.groupby("sample_id"):
    values = group[["var1","var2","var3"]].values  # shape (T, 3)
    X_list.append(values)

X = np.stack(X_list)  # shape (N, T, 3)
print("Data shape:", X.shape)  # e.g. (1394, 100, 3)

timesteps = X.shape[1]
n_features = X.shape[2]
latent_dim = 64

# =====================================
# 2. Define LSTM Autoencoder (Keras)
# =====================================
inputs = layers.Input(shape=(timesteps, n_features))

# Encoder
encoded = layers.LSTM(128, return_sequences=True)(inputs)
encoded = layers.LSTM(latent_dim)(encoded)

# Decoder
decoded = layers.RepeatVector(timesteps)(encoded)
decoded = layers.LSTM(128, return_sequences=True)(decoded)
outputs = layers.TimeDistributed(layers.Dense(n_features))(decoded)

# Build Model
autoencoder = models.Model(inputs, outputs)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

# =====================================
# 3. Train Autoencoder
# =====================================
history = autoencoder.fit(
    X, X,
    epochs=20,
    batch_size=32,
    shuffle=True,
    validation_split=0.1
)

# =====================================
# 4. Extract embeddings
# =====================================
# Define encoder model separately
encoder = models.Model(inputs, encoded)

embeddings = encoder.predict(X)  # shape (N, latent_dim)
print("Embeddings shape:", embeddings.shape)

# =====================================
# 5. Clustering


Data shape: (1395, 501, 3)


Epoch 1/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 4s/step - loss: 191.6054 - val_loss: 108.7447
Epoch 2/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 4s/step - loss: 177.3209 - val_loss: 108.0229
Epoch 3/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 4s/step - loss: 182.5917 - val_loss: 107.9078
Epoch 4/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 4s/step - loss: 169.0421 - val_loss: 107.6330
Epoch 5/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 4s/step - loss: 178.8479 - val_loss: 107.2402
Epoch 6/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 4s/step - loss: 180.5170 - val_loss: 104.5921
Epoch 7/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 4s/step - loss: 173.8890 - val_loss: 104.8372
Epoch 8/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 4s/step - loss: 172.0069 - val_loss: 104.4592
Epoch 9/20
[1m4

In [None]:
np.set_printoptions(threshold=np.inf)  # disable truncation
print(labels)
print("Embeddings shape:", embeddings.shape)   # should be (1329, latent_dim)
print("First few embeddings:\n", embeddings[:5])
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Reduce embeddings (high-dim) → 2D with t-SNE
tsne = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=1000)
embeddings_2d = tsne.fit_transform(embeddings)

# Plot
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=labels,
    cmap="tab10",
    alpha=0.7
)

plt.title("t-SNE Visualization of Clusters")
plt.xlabel("t-SNE Dim 1")
plt.ylabel("t-SNE Dim 2")
plt.colorbar(scatter, label="Cluster ID")
plt.show()