# Autoencoder
- Auto = self
- encode = convert into a different form
- Autoencoder = a system that teaches itself how to encode information
- The number of outputs equals the number of inputs to the model

# Structure of the layers
- Encode: input > layers > bottleneck or latent code (central node)
- Decode: bottleneck or latent code > layers > output (reconstruction of the input)

#  Goal of autoencoder:
- Get the output to match the input as closely as possible
- data compression or dimensionality reduction
- data cleaning (denoising, despeckling, occlusion)  
- feature extraction
- anomaly / fraud detection
- pretraining deep or complex models   


In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [2]:
from sklearn.preprocessing import LabelEncoder

# Label encoder instance, used further down to integer-encode raw ID values
# from the ratings table.
le = LabelEncoder()


In [3]:
# Load the comma-separated ratings table into a DataFrame.
ratings_csv_path = '../dataset/ml-latest-small/ratings.csv'
ratings = pd.read_csv(ratings_csv_path, sep=',')

In [4]:
# Encode each ID column with its own encoder. Calling fit_transform twice on
# one shared LabelEncoder refits it, which throws away the classes learned for
# the first column, so user codes could no longer be mapped back to raw IDs.
# `le` still ends up fitted on movieId, exactly as before; `user_le` keeps the
# userId mapping available for inverse_transform.
user_le = LabelEncoder()
ratings['userId'] = user_le.fit_transform(ratings['userId'])
ratings['movieId'] = le.fit_transform(ratings['movieId'])

In [5]:
# Distinct user / item IDs, in order of first appearance in the table.
user_ids = ratings['userId'].unique()
item_ids = ratings['movieId'].unique()
# Map every distinct ID to its position in that order.
user_id_map = dict(zip(user_ids, range(len(user_ids))))
item_id_map = dict(zip(item_ids, range(len(item_ids))))

In [6]:
# Rewrite both ID columns in place using their lookup tables.
for column, mapping in (('userId', user_id_map), ('movieId', item_id_map)):
    ratings[column] = ratings[column].map(mapping)

In [7]:
# Define PyTorch Dataset
class MovieLensDataset(Dataset):
    """Map-style dataset exposing each row of a ratings table as a float32 vector."""

    def __init__(self, ratings):
        # Convert the table to one dense float32 matrix up front so that
        # item access is a plain row lookup.
        matrix = ratings.values
        self.ratings = matrix.astype(np.float32)

    def __len__(self):
        """Return the number of rows (samples)."""
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` of the float32 matrix."""
        row = self.ratings[idx]
        return row

# Wrap the ratings table in a dataset and serve it in shuffled mini-batches.
dataset = MovieLensDataset(ratings)
batch_size = 64
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)

In [8]:
# Define autoencoder architecture
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder.

    The encoder projects ``input_dim`` features down to ``encoding_dim``
    with a linear layer followed by ReLU. The decoder maps the code back to
    ``input_dim`` features with a linear layer followed by a sigmoid, so
    every reconstructed value lies in [0, 1].
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        # The encoder is built before the decoder so parameters are created
        # and registered in the same order as before.
        compress = nn.Linear(input_dim, encoding_dim)
        self.encoder = nn.Sequential(compress, nn.ReLU())
        expand = nn.Linear(encoding_dim, input_dim)
        self.decoder = nn.Sequential(expand, nn.Sigmoid())

    def forward(self, x):
        """Return the reconstruction of ``x`` after it passes through the bottleneck."""
        return self.decoder(self.encoder(x))

# Hyperparameters
learning_rate = 0.001
num_epochs = 10
encoding_dim = 64  # Width of the bottleneck (latent code)
# One input feature per user plus one per item
input_dim = sum(len(id_map) for id_map in (user_id_map, item_id_map))

# Build the network, then the reconstruction loss (mean squared error) and
# the Adam optimizer that updates the network's weights.
model = Autoencoder(input_dim, encoding_dim)
loss_fun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
# Training loop
#
# The dataloader yields batches of raw rating rows (one row of the ratings
# table per sample), while the model expects `input_dim` = n_users + n_items
# features per sample. Reshaping a batch with `view(-1, input_dim)` therefore
# fails with a RuntimeError, because a batch holds far fewer elements than
# `input_dim`. Each row is instead expanded into a concatenated one-hot
# vector: one active position for the user and one, offset by the number of
# users, for the item.
# NOTE(review): the one-hot (user, item) representation is inferred from how
# `input_dim` is computed - confirm it is the intended model input.
# NOTE(review): assumes the first two columns of each row are the remapped
# userId and movieId (the column order of ratings.csv) - confirm.
num_users = len(user_id_map)

for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in dataloader:
        n_rows = batch.size(0)
        rows = torch.arange(n_rows)
        # The dataset stores indices as float32; cast back to integers
        # before using them as positions.
        user_idx = batch[:, 0].long()
        item_idx = batch[:, 1].long() + num_users

        # Build the (n_rows, input_dim) one-hot input matrix
        inputs = torch.zeros(n_rows, input_dim)
        inputs[rows, user_idx] = 1.0
        inputs[rows, item_idx] = 1.0

        # Forward pass
        outputs = model(inputs)

        # Compute reconstruction loss against the input itself
        loss = loss_fun(outputs, inputs)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the mean batch loss for this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}')

# Save the trained model weights
torch.save(model.state_dict(), 'autoencoder_model.pth')