In [4]:
from google.colab import drive
drive.mount('/content/drive')

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/poems-100.csv')  # Update with your file path
text = " ".join(df['text'])  # Assuming 'poems' column contains text

# Tokenization
words = text.split()
vocab = sorted(set(words))
vocab_size = len(vocab)
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

# Convert words to One-Hot Encoding
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(np.array(vocab).reshape(-1, 1))

def one_hot_encode(word):
    return ohe.transform(np.array(word).reshape(-1, 1))

def one_hot_decode(vec):
    return ohe.inverse_transform(vec.reshape(1, -1))[0][0]

# Convert text into sequences
sequence_length = 5  # Context size
X, Y = [], []
for i in range(len(words) - sequence_length):
    X.append(words[i:i+sequence_length])
    Y.append(words[i+sequence_length])

# Convert to tensor
X_ohe = torch.tensor([one_hot_encode(seq).flatten() for seq in X], dtype=torch.float32)
Y_ohe = torch.tensor([word_to_idx[word] for word in Y], dtype=torch.long)

# Define RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x.view(x.size(0), 1, -1))
        out = self.fc(out[:, -1, :])
        return out

# Train One-Hot Model
hidden_size = 128
model_ohe = RNNModel(vocab_size, hidden_size, vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_ohe.parameters(), lr=0.01)

for epoch in range(100):
    optimizer.zero_grad()
    output = model_ohe(X_ohe)
    loss = criterion(output, Y_ohe)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Convert text into Embedding format
embedding_dim = 50
X_embed = torch.tensor([[word_to_idx[word] for word in seq] for seq in X], dtype=torch.long)
Y_embed = torch.tensor([word_to_idx[word] for word in Y], dtype=torch.long)

# Define LSTM Model with Trainable Embeddings
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

# Train Embedding Model
model_embed = LSTMModel(vocab_size, embedding_dim, hidden_size, vocab_size)
optimizer = optim.Adam(model_embed.parameters(), lr=0.01)

for epoch in range(100):
    optimizer.zero_grad()
    output = model_embed(X_embed)
    loss = criterion(output, Y_embed)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Generate Text from Model
def generate_text(model, start_text, length=20):
    model.eval()
    words = start_text.split()
    for _ in range(length):
        seq = words[-sequence_length:]
        seq_encoded = torch.tensor([word_to_idx[word] for word in seq], dtype=torch.long).unsqueeze(0)
        with torch.no_grad():
            output = model(seq_encoded)
            next_word_idx = torch.argmax(output, dim=1).item()
            words.append(idx_to_word[next_word_idx])
    return " ".join(words)

# Example usage
start_text = "Once upon a time"
print("Generated Text (OHE Model):", generate_text(model_ohe, start_text))
print("Generated Text (Embedding Model):", generate_text(model_embed, start_text))

print("Training completed! Now comparing results...")


Mounted at /content/drive
Using device: cuda


  X_ohe = torch.tensor([one_hot_encode(seq).flatten() for seq in X], dtype=torch.float32)


RuntimeError: input.size(-1) must be equal to input_size. Expected 7459, got 37295