In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Create a DataFrame
df = pd.read_csv('dataset/links.csv')
# Encode categorical data
label_encoder_imdb = LabelEncoder()
label_encoder_tmdb = LabelEncoder()

df['imdbId'] = label_encoder_imdb.fit_transform(df['imdbId'])
df['tmdbId'] = label_encoder_tmdb.fit_transform(df['tmdbId'])

# Prepare features and labels
X = df[['movieId', 'imdbId', 'tmdbId']].values
num_classes = max(X[:, 1]) + 1  # for imdbId
num_classes_tmdb = max(X[:, 2]) + 1  # for tmdbId

# Split the dataset
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Define the generator model
def build_generator(latent_dim, num_classes):
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, activation='relu', input_dim=latent_dim))
    model.add(layers.Dense(num_classes, activation='softmax'))  # For imdbId
    model.add(layers.Dense(num_classes_tmdb, activation='softmax'))  # For tmdbId
    return model

# Define the discriminator model
def build_discriminator(num_classes):
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(num_classes + num_classes_tmdb,)))
    model.add(layers.Dense(1, activation='sigmoid'))  # Binary output
    return model

# Set parameters
latent_dim = 10
epochs = 10000
batch_size = 8

# Create models
generator = build_generator(latent_dim, num_classes)
discriminator = build_discriminator(num_classes + num_classes_tmdb)

# Compile the discriminator
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
discriminator.trainable = False  # Freeze the discriminator when training the generator

# Connect the generator and discriminator
gan_input = layers.Input(shape=(latent_dim,))
generated_data = generator(gan_input)

# Flatten generated data to feed into the discriminator
flattened_data = layers.Flatten()(generated_data)
gan_output = discriminator(flattened_data)

gan = tf.keras.Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Training function
def train_gan(gan, generator, discriminator, X_train, epochs, batch_size):
    for epoch in range(epochs):
        # Generate random noise
        noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
        
        # Generate new data
        generated_data = generator.predict(noise)

        # Get a random set of real data
        real_data = X_train[np.random.randint(0, X_train.shape[0], size=batch_size)]
        
        # Combine real and fake data
        combined_data = np.concatenate([real_data, generated_data])
        
        # Create labels for real (1) and fake (0) data
        labels = np.array([1] * batch_size + [0] * batch_size)

        # Train the discriminator
        d_loss = discriminator.train_on_batch(combined_data, labels)

        # Train the generator
        noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
        valid_labels = np.array([1] * batch_size)  # Labels for the generator to "fool" the discriminator
        
        g_loss = gan.train_on_batch(noise, valid_labels)

        # Print the progress
        if epoch % 1000 == 0:
            print(f'Epoch: {epoch}, D Loss: {d_loss[0]}, G Loss: {g_loss}')

# Train the GAN
train_gan(gan, generator, discriminator, X_train, epochs, batch_size)

# Generate new samples
num_samples = 5  # Number of samples to generate
noise = np.random.normal(0, 1, size=(num_samples, latent_dim))
generated_samples = generator.predict(noise)

# Convert generated samples back to original format
imdb_ids_generated = np.argmax(generated_samples[:, :num_classes], axis=1)
tmdb_ids_generated = np.argmax(generated_samples[:, num_classes:], axis=1)

# Map back to original ids
imdb_ids_original = label_encoder_imdb.inverse_transform(imdb_ids_generated)
tmdb_ids_original = label_encoder_tmdb.inverse_transform(tmdb_ids_generated)

# Display generated samples
generated_df = pd.DataFrame({
    'movieId': range(len(imdb_ids_original)),
    'imdbId': imdb_ids_original,
    'tmdbId': tmdb_ids_original
})

print("Generated Samples:")
print(generated_df)


ResourceExhaustedError: {{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[45843,45595] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomUniformV2]