In [5]:
print("hello World")

hello World


In [17]:
import os
import docx
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
import numpy as np

Size = 10
# Function to load answers from Word documents
def load_answers_from_docs(doc_files):
    questions = []
    answers = []
    scores = []
    
    for doc_file in doc_files:
        doc = docx.Document(doc_file)
        for paragraph in doc.paragraphs:
            if paragraph.text.startswith("Q"):  # Assume question text starts with "Q"
                questions.append(paragraph.text)
            elif paragraph.text.startswith("A"):  # Assume answer text starts with "A"
                answers.append(paragraph.text[2:].strip())  # Skip "A:"
            elif paragraph.text.startswith("Score"):  # Assume score starts with "Score:"
                scores.append([float(x) for x in paragraph.text.split(":")[1].strip().split(",")])
    
    return questions, answers, scores

# Load dataset
doc_files = ["Paper1.docx", "Paper2.docx", "Paper3.docx", "Paper4.docx", "Paper5.docx", "Paper6.docx", "Paper7.docx", "Paper8.docx", "Paper9.docx", "Paper10.docx"]
questions, answers, scores = load_answers_from_docs(doc_files)

# Preprocess text: Tokenization and padding
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(answers)
answers_seq = tokenizer.texts_to_sequences(answers)
answers_pad = tf.keras.preprocessing.sequence.pad_sequences(answers_seq, maxlen=500)

# Convert scores to NumPy array for model compatibility
max_categories = 4
padded_scores = []

for score in scores:
    # Pad with zeros or NaNs if the score list is shorter than max_categories
    if len(score) < max_categories:
        score += [0] * (max_categories - len(score))  # Padding with zeros
    padded_scores.append(score)

# Convert to numpy array
scores = np.array(padded_scores)
print("Scores shape:", scores.shape)
print("Scores shape:", scores.shape)
print("Answers shape:", answers_pad.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(answers_pad, scores, test_size=0.2, random_state=42)

# Define the model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=500),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # Output matches the rubric categories
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

# Save the model
model.save("essay_grading_model.h5")

print("Model trained and saved successfully!")


Scores shape: (200, 4)
Scores shape: (200, 4)
Answers shape: (200, 500)
Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 449ms/step - loss: 7.4372 - mae: 2.4714 - val_loss: 6.7600 - val_mae: 2.3253
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 301ms/step - loss: 6.1578 - mae: 2.2006 - val_loss: 3.8461 - val_mae: 1.5997
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 307ms/step - loss: 2.8101 - mae: 1.3626 - val_loss: 1.6010 - val_mae: 1.0395
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 297ms/step - loss: 1.5662 - mae: 1.0294 - val_loss: 1.5399 - val_mae: 1.0296
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 303ms/step - loss: 1.3640 - mae: 0.9999 - val_loss: 1.4286 - val_mae: 1.0447
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 299ms/step - loss: 1.4156 - mae: 1.0264 - val_loss: 1.3936 - val_mae: 1.0423
Epoch 7/10
[1m5/5[0m [32m



Model trained and saved successfully!
