In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from autoencoder import Autoencoder, VariationalAutoencoder
from tqdm import tqdm
import psutil
import os

In [None]:
TEST_SIZE = 10_000
CHUNK_SIZE = 1_000  # when reading from disk

# Number of CPU cores; os.cpu_count() may return None, in which case use 4.
_detected_cpus = os.cpu_count()
NUM_WORKERS = 4 if _detected_cpus is None else _detected_cpus

# dataset class
class EmbeddingDataset(Dataset):
    """Map-style dataset that serves the rows of an in-memory tensor."""

    def __init__(self, data):
        # Only a reference is stored; the tensor itself is not copied here.
        self.data = data

    def __len__(self):
        """Return the number of samples (size of the first dimension)."""
        n_samples = self.data.size(0)
        return n_samples

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.data[idx]
        return sample

In [None]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in GiB."""
    bytes_per_gib = 1024 ** 3
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_gib

def get_data(path):
    """Read the numeric columns of a CSV file into one float32 array.

    The columns 'publish_date' and 'headline_text' are skipped; every other
    column named in the header is parsed as float32. The file is read in
    chunks of CHUNK_SIZE rows straight into a buffer that is allocated up
    front, so the whole table is never held as a single DataFrame.

    Args:
        path: Path of the CSV file (must have a header row).

    Returns:
        numpy.ndarray of dtype float32 with one row per parsed CSV record and
        one column per kept CSV column. If fewer records were parsed than
        lines were counted, the result is a view of the leading rows of the
        buffer.
    """
    # --- Read Metadata ---

    drop_cols = ['publish_date', 'headline_text']
    # nrows=0 parses the header only, which is enough to learn the column names.
    all_cols = pd.read_csv(path, nrows=0).columns.tolist()

    use_cols = [col for col in all_cols if col not in drop_cols]
    dtype_dict = {col: 'float32' for col in use_cols}

    print(f'Reading: {path}')
    print('Estimating # of rows...')
    with open(path, 'r', encoding='utf-8') as f:
        # Physical line count: an upper bound on the number of CSV records
        # (quoted multi-line fields and trailing blank lines inflate it).
        total_rows = sum(1 for _ in f) - 1  # Subtract 1 for header
    print(f'Total rows: {total_rows}')

    # --- Read Data ---

    # np.empty leaves the buffer uninitialised, so every returned row must be
    # written by the loop below.
    data_np = np.empty((total_rows, len(use_cols)), dtype='float32')
    mem_before = get_memory_usage()

    print(f'Memory Before Read: {mem_before:.2f} GB')
    print('Reading and processing csv...')

    rows_read = 0
    n_chunks = -(-total_rows // CHUNK_SIZE)  # ceil division, no extra chunk on exact multiples
    with pd.read_csv(path, chunksize=CHUNK_SIZE, usecols=use_cols, dtype=dtype_dict) as reader:
        for chunk in tqdm(reader, total=n_chunks, desc='processing chunks'):
            end_idx = rows_read + len(chunk)
            data_np[rows_read:end_idx] = chunk.values
            rows_read = end_idx
            del chunk

    if rows_read < total_rows:
        # Drop the never-written tail instead of returning uninitialised memory.
        print(f'Parsed {rows_read} rows (line count was {total_rows}); trimming unused buffer rows')
        data_np = data_np[:rows_read]

    mem_after = get_memory_usage()
    print(f'Memory After Read: {mem_after:.2f} GB')

    return data_np

In [None]:
input_path = 'abcnews-date-text-embed-4096.csv'
SPLIT_SEED = 42  # fixed seed so the train/test split is the same on every run

data_np = get_data(input_path)
print(f'Data shape: {data_np.shape}')

# Guard the split: a test set as large as the data would leave nothing to train on.
assert 0 <= TEST_SIZE < len(data_np), (
    f'TEST_SIZE={TEST_SIZE} must be smaller than the number of rows ({len(data_np)})'
)

# Shuffle the rows in place with a dedicated, seeded generator.
np.random.default_rng(SPLIT_SEED).shuffle(data_np)
print('Data shuffled')

# Use an explicit split index: negative slicing breaks for TEST_SIZE == 0
# ([:-0] is empty and [-0:] is everything).
split_idx = len(data_np) - TEST_SIZE
train_data_np = data_np[:split_idx]
test_data_np = data_np[split_idx:]
# The two slices are views of the same buffer, so this only drops the name.
del data_np

print(f'Training data shape: {train_data_np.shape}')
print(f'Testing data shape: {test_data_np.shape}')

# torch.from_numpy shares memory with the arrays; no copy is made.
train_tensor = torch.from_numpy(train_data_np)
test_tensor = torch.from_numpy(test_data_np)
del train_data_np
del test_data_np  # drop the now-redundant NumPy names

train_dataset = EmbeddingDataset(train_tensor)
test_dataset = EmbeddingDataset(test_tensor)


In [None]:
# Peek at the first rows, then make sure neither split contains a NaN.
print(train_tensor[:10])
for split_tensor in (train_tensor, test_tensor):
    assert not torch.isnan(split_tensor).any().item()

Create the dataloaders. (Change the batch size here, in this cell, so that the data does not have to be read from disk again.)

In [None]:
BATCH_SIZE = 1024  # when training

# Settings shared by both loaders; only the shuffling differs.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Training batches are reshuffled; test batches keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)


In [None]:
# hyperparam
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
input_dim = 4096
hidden_dim = [2048, 1024, 256, 128]
learning_rate = 0.001
encoding_dim = 10
dropout = 0.2
epochs = 100

# paths
hidden_dim_str = '_'.join(str(width) for width in hidden_dim)
extra_note = '_linear_final'
# extra_note = ''
# One tag encodes the configuration; all three output files share it.
_run_tag = f'{input_dim}_{hidden_dim_str}_{encoding_dim}_d{int(dropout * 10.0)}{extra_note}'
model_path = f'./models/autoencoder_{_run_tag}.pth'
stats_path = f'./stats/autoencoder_{_run_tag}.csv'
output_path = f'./latent/latent_{_run_tag}.csv'

def train(model, train_dataloader, test_dataloader, crt, opt):
    """Train ``model`` to reconstruct its input and record per-epoch losses.

    Runs for ``epochs`` epochs on ``device`` (both are module-level settings).
    After every epoch the model is evaluated on ``test_dataloader`` with
    gradients disabled.

    Args:
        model: Module called as ``model(batch)``; its output is compared with
            the batch itself.
        train_dataloader: Loader yielding training batches.
        test_dataloader: Loader yielding evaluation batches.
        crt: Loss callable invoked as ``crt(output, target)``.
        opt: Optimizer that updates ``model``'s parameters.

    Returns:
        Tuple ``(tr_loss_lst, te_loss_lst)`` of Python floats, one entry per
        epoch: the sample-weighted average train and test loss.
    """
    tr_loss_lst = []
    te_loss_lst = []

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_loss = 0.0
        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch}/{epochs}', leave=False)
        for train_batch in progress_bar:
            train_batch = train_batch.to(device, non_blocking=True)

            # Forward
            train_outputs = model(train_batch)
            loss = crt(train_outputs, train_batch)

            # Backward
            # Use the optimizer that was passed in, not a global of similar name.
            opt.zero_grad()
            loss.backward()
            opt.step()

            # Weight by batch size so a smaller last batch does not skew the
            # epoch average (assumes crt returns a per-batch mean — TODO confirm).
            batch_loss = loss.item()
            epoch_loss += batch_loss * train_batch.size(0)
            progress_bar.set_postfix({'loss': batch_loss})

        model.eval()
        # Accumulate as a Python float so this also works when the loader
        # yields no batches and no tensors are kept alive between batches.
        test_loss = 0.0
        with torch.no_grad():
            for test_batch in test_dataloader:
                test_batch = test_batch.to(device, non_blocking=True)
                test_outputs = model(test_batch)
                test_loss += crt(test_outputs, test_batch).item() * test_batch.size(0)

        test_loss /= len(test_dataloader.dataset)
        te_loss_lst.append(test_loss)

        train_loss = epoch_loss / len(train_dataloader.dataset)
        tr_loss_lst.append(train_loss)
        print(f'Epoch [{epoch}/{epochs}], Train Loss: {train_loss:.6f}, Test Loss: {test_loss:.6f}')

    return tr_loss_lst, te_loss_lst


In [None]:

print(f'Using device: {device}')

# Build the model and move it to the selected device.
ae = Autoencoder(input_dim, hidden_dim, encoding_dim, dropout=dropout)
ae = ae.to(device)
# ae = VariationalAutoencoder(input_dim, hidden_dim, encoding_dim).to(device)

# Reconstruction loss and optimizer.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=ae.parameters(), lr=learning_rate)

train_loss_lst, test_loss_lst = train(
    model=ae,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    crt=criterion,
    opt=optimizer,
)



In [None]:
# Number of recorded epochs: train history first, then test history.
for loss_history in (train_loss_lst, test_loss_lst):
    print(len(loss_history))


In [None]:
# Create the output directories first, so a missing folder cannot make a
# finished training run fail at save time.
for _out_path in (stats_path, model_path, output_path):
    _out_dir = os.path.dirname(_out_path)
    if _out_dir:
        os.makedirs(_out_dir, exist_ok=True)

# save stats
stats_df = pd.DataFrame({'train_loss': train_loss_lst, 'test_loss': test_loss_lst})
stats_df.to_csv(stats_path, index=False)
print(f'Stats saved to: {stats_path}')

# save model
torch.save(ae.state_dict(), model_path)
print(f'Model saved to: {model_path}')

# save latent
# Columns copied through unchanged; every other column is fed to the encoder.
metadata_columns = ['publish_date', 'headline_text']

print(f"Estimating total number of rows in {input_path}...")
with open(input_path, 'r', encoding='utf-8') as f:
    total_rows = sum(1 for _ in f) - 1  # Subtract 1 for header
print(f"Total rows to process: {total_rows}")

ae.eval()
write_mode = 'w'  # first chunk creates the file with a header, later chunks append
n_chunks = -(-total_rows // CHUNK_SIZE)  # ceil division, used only for the progress bar

# The context manager closes the underlying file handle even if an assert fires.
with pd.read_csv(input_path, chunksize=CHUNK_SIZE) as reader:
    for chunk in tqdm(reader, total=n_chunks, desc='Processing Chunks'):
        metadata = chunk[metadata_columns].reset_index(drop=True)
        embeddings = chunk.drop(columns=metadata_columns)

        # Validate on the host before paying for the device transfer.
        assert not embeddings.isnull().values.any()

        embeddings_np = embeddings.values.astype('float32')
        embeddings_tensor = torch.from_numpy(embeddings_np).to(device)

        with torch.no_grad():
            latent_tensor: torch.Tensor = ae.encode(embeddings_tensor)

        assert not torch.isnan(latent_tensor).any().item()

        # assert not all zero
        assert not torch.all(latent_tensor == 0).item()

        assert latent_tensor.shape[1] == encoding_dim

        # back to cpu and to numpy
        latent_np = latent_tensor.cpu().numpy()

        assert not np.isnan(latent_np).any()

        # Latent columns are named '1' .. str(encoding_dim).
        latent_df = pd.DataFrame(
            latent_np,
            columns=[str(i) for i in range(1, encoding_dim + 1)]
        )

        assert not pd.isna(latent_df).to_numpy().any()

        # Both frames carry a fresh 0..n-1 index, so rows line up one to one.
        output_chunk = pd.concat([metadata, latent_df], axis=1)

        assert not pd.isna(output_chunk).to_numpy().any()

        output_chunk.to_csv(
            output_path,
            index=False,
            mode=write_mode,
            header=(write_mode == 'w')
        )
        del output_chunk

        write_mode = 'a'

print(f"Latent representations saved to {output_path}")
    


In [None]:
# Dump every parameter of `ae`: its name and shape, then its values.
for name, param in ae.named_parameters():
    print(f'{name} {param.size()}')
    # Values followed by the same blank-line spacer as a separate print('\n\n').
    print(param, '\n\n', sep='\n')
    