# In [1]:
# First, let's import all necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import docx

# Function to load answers from Word documents
def load_answers_from_docs(doc_files):
    """Collect answer texts and rubric scores from Word documents.

    Every paragraph of every document is inspected in order:

    * A paragraph starting with "A" is an answer. Its first two characters
      (the "A:" marker) are dropped and the remainder is stripped.
    * A paragraph starting with "Score" is a rubric line. The text between
      the first and second ":" is split on "," and each piece is converted
      to ``float``.
    * Any other paragraph (including question lines starting with "Q") is
      ignored.

    Args:
        doc_files: Iterable of document locations, each passed unchanged to
            ``docx.Document``.

    Returns:
        A ``(answers, scores)`` tuple: ``answers`` is a list of strings and
        ``scores`` is a list of lists of floats, both in document order.
        The two lists are filled independently, so they only line up when
        every answer paragraph has exactly one score paragraph.

    Raises:
        IndexError: If a "Score" paragraph contains no ":".
        ValueError: If a score field cannot be parsed as a float.
    """
    answers = []
    scores = []

    for doc_file in doc_files:
        for paragraph in docx.Document(doc_file).paragraphs:
            # Read the paragraph text once instead of once per test.
            text = paragraph.text
            if text.startswith("A"):  # Assume answer text starts with "A"
                answers.append(text[2:].strip())  # Skip "A:"
            elif text.startswith("Score"):  # Assume score starts with "Score:"
                fields = text.split(":")[1].strip().split(",")
                scores.append([float(field) for field in fields])
            # Question paragraphs ("Q...") are not part of the result, so
            # they are skipped like any other unrecognised paragraph.

    return answers, scores

# Read every answer and its rubric scores from the listed documents.
# NOTE(review): python-docx is documented for .docx files; confirm that these
# ".doc" names are placeholders to be replaced with real paths.
doc_files = ["answerset1.doc", "answerset2.doc", "answerset3.doc"]  # Add actual file paths
answers, scores = load_answers_from_docs(doc_files)

# Build the vocabulary: each distinct lower-cased word gets the next free
# integer id, starting at 1 (0 is left for padding / unknown words).
word_to_index = {}
index = 1
for answer in answers:
    for token in map(str.lower, answer.split()):
        if token in word_to_index:
            continue
        word_to_index[token] = index
        index += 1
# Convert answers into integer sequences
def text_to_sequence(text):
    """Encode whitespace-separated words of ``text`` as vocabulary ids.

    Each word is lower-cased and looked up in the module-level
    ``word_to_index`` mapping; words missing from the mapping become 0.

    Args:
        text: The string to encode.

    Returns:
        A list of integer ids, one per word, in order of appearance.
    """
    sequence = []
    for token in text.split():
        sequence.append(word_to_index.get(token.lower(), 0))
    return sequence

# Encode every answer as a list of vocabulary ids
answers_seq = list(map(text_to_sequence, answers))

# Right-pad each sequence with 0 so all of them match the longest one
max_len = max(map(len, answers_seq))
answers_pad = []
for seq in answers_seq:
    answers_pad.append(seq + [0] * (max_len - len(seq)))

# Scores become a numpy array so they can be split and turned into tensors
scores = np.array(scores)

# Hold out 20% of the samples for testing (fixed seed for repeatability)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    answers_pad,
    scores,
    test_size=0.2,
    random_state=42,
)

# Inputs are integer token ids (long); targets are float32 rubric scores
X_train, X_test = (
    torch.tensor(part, dtype=torch.long) for part in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(part, dtype=torch.float32) for part in (y_train, y_test)
)

# Define a custom Dataset for PyTorch DataLoader
class EssayDataset(Dataset):
    """Index-aligned pairing of encoded answers with their scores.

    Args:
        answers: Indexable container of encoded answers.
        scores: Indexable container of scores, aligned with ``answers``.
    """

    def __init__(self, answers, scores):
        self.answers, self.scores = answers, scores

    def __len__(self):
        # The number of samples is taken from the answers container.
        return len(self.answers)

    def __getitem__(self, idx):
        # Return the answer and the score stored at the same position.
        answer = self.answers[idx]
        score = self.scores[idx]
        return answer, score

# Wrap the train/test tensors in datasets
train_dataset = EssayDataset(answers=X_train, scores=y_train)
test_dataset = EssayDataset(answers=X_test, scores=y_test)

# Batches of 32; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

# Define the model
class EssayGraderModel(nn.Module):
    """Bidirectional-LSTM regressor that maps token-id sequences to scores.

    Pipeline: embedding -> single-layer bidirectional LSTM -> concatenated
    final hidden states of both directions -> Linear + ReLU -> Linear.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id
            plus one).
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM *per direction*.
        output_dim: Number of values predicted per sequence (one per rubric
            category).
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=64, output_dim=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_dim * 2, 64)  # Bidirectional LSTM, so multiply by 2
        self.fc2 = nn.Linear(64, output_dim)  # 3 output categories for grading rubric

    def forward(self, x):
        """Predict scores for a batch of token-id sequences.

        Args:
            x: Long tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.embedding(x)
        _, (ht, _) = self.lstm(embedded)
        # For this single-layer bidirectional LSTM, ht stacks the final
        # hidden state of the forward direction (ht[-2]) and of the backward
        # direction (ht[-1]), each (batch, hidden_dim). fc1 expects both,
        # i.e. hidden_dim * 2 features; using ht[-1] alone gives only half
        # of them and fails with a shape mismatch.
        final_state = torch.cat((ht[-2], ht[-1]), dim=1)
        hidden = torch.relu(self.fc1(final_state))
        return self.fc2(hidden)

# Model, loss and optimizer. Token ids start at 1 and 0 is used for padding,
# so the embedding table needs one row more than the vocabulary has words.
model = EssayGraderModel(vocab_size=1 + len(word_to_index))
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training: one full pass over train_loader per epoch
epochs = 10
for epoch in range(epochs):
    model.train()
    batch_losses = []
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch
    avg_loss = sum(batch_losses, 0.0) / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Evaluation: collect predictions for the whole test set without gradients
model.eval()
y_pred = []
with torch.no_grad():
    for inputs, _ in test_loader:
        y_pred.append(model(inputs))

# Stack the per-batch outputs into a single numpy array
y_pred = torch.cat(y_pred, dim=0).numpy()

# Here, you can compare the model's predictions (y_pred) with the true labels (y_test)


# Output: ModuleNotFoundError: No module named 'torch'