In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader, Batch
from torch_geometric.nn import GCNConv

# Make sure the NLTK tokenizer data used by word_tokenize is available.
nltk.download('punkt')

# Location of the question/answer spreadsheet.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the workbook before running the notebook elsewhere.
DATA_PATH = r'C:\Users\ALVIN\Downloads\Data Chatbot STKI.xlsx'

# Load data from Excel file
try:
    data = pd.read_excel(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends the interpreter session,
    # whereas an exception stops only this cell and keeps the traceback.
    raise FileNotFoundError(
        f"File not found! Expected the dataset at: {DATA_PATH}"
    ) from err

# Split data into questions and answers.
# Rows with a missing question or answer are skipped and every cell is coerced
# to str, so empty or numeric spreadsheet cells cannot break `.lower()` below.
qa_rows = data.dropna(subset=['QUESTION', 'ANSWER'])
questions = qa_rows['QUESTION'].astype(str).tolist()
answers = qa_rows['ANSWER'].astype(str).tolist()

# Tokenize questions and answers (lower-cased)
tokens_questions = [word_tokenize(text.lower()) for text in questions]
tokens_answers = [word_tokenize(text.lower()) for text in answers]

# Create a vocabulary from the dataset.
# The words are sorted before numbering so that the word -> index mapping is
# identical on every run; iterating a set of strings directly gives an order
# that can change between interpreter sessions.
vocab_words = set()
for tokens in tokens_questions + tokens_answers:
    vocab_words.update(tokens)
vocab = {word: idx for idx, word in enumerate(sorted(vocab_words))}

# Create a map from tokenized answers (tokens joined by single spaces) to the
# original answers, reusing the token lists computed above.
tokenized_to_original_answer = {
    " ".join(tokens): original
    for tokens, original in zip(tokens_answers, answers)
}

# Define a function to create graph data for a single question-answer pair
def create_graph_data(question, answer, vocab):
    """Build a graph for one tokenized question/answer pair.

    Every in-vocabulary token becomes a node with a one-hot feature row.
    Consecutive question tokens are linked in both directions, consecutive
    answer tokens likewise, and the last question node is linked to the first
    answer node.

    Args:
        question: list of question tokens.
        answer: list of answer tokens.
        vocab: dict mapping token -> column index of its one-hot feature.

    Returns:
        A ``Data`` object with
        ``x``          float tensor of shape (num_nodes, len(vocab)),
        ``edge_index`` long tensor of shape (2, num_edges),
        ``y``          long tensor of shape (1,) holding the position of the
                       answer in the module-level ``answers`` list, or -1 when
                       no answer could be resolved.

    Notes:
        * Reads the module-level names ``tokenized_to_original_answer`` and
          ``answers``.
        * Tokens missing from ``vocab`` are dropped *before* nodes are
          numbered, so ``edge_index`` can never reference a node that has no
          feature row.
        * NOTE(review): the answer tokens are part of the node features here,
          while ``preprocess_question`` builds inference graphs from the
          question alone - confirm this train/inference difference is
          intended.
    """
    vocab_size = len(vocab)
    question_ids = [vocab[w] for w in question if w in vocab]
    answer_ids = [vocab[w] for w in answer if w in vocab]
    token_ids = question_ids + answer_ids
    num_nodes = len(token_ids)

    # One-hot features built in a single allocation (instead of creating a
    # full identity matrix for every token).
    x = torch.zeros((num_nodes, vocab_size))
    if token_ids:
        x[torch.arange(num_nodes), torch.tensor(token_ids, dtype=torch.long)] = 1.0

    question_nodes = list(range(len(question_ids)))
    answer_nodes = list(range(len(question_ids), num_nodes))

    # Bidirectional chain inside the question, then inside the answer.
    edge_pairs = []
    for chain in (question_nodes, answer_nodes):
        for src, dst in zip(chain, chain[1:]):
            edge_pairs.append([src, dst])
            edge_pairs.append([dst, src])

    # Connect the last question node to the first answer node.
    if question_nodes and answer_nodes:
        edge_pairs.append([question_nodes[-1], answer_nodes[0]])
        edge_pairs.append([answer_nodes[0], question_nodes[-1]])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # Keep the (2, num_edges) layout even when there are no edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Use the index of the answer as the label; -1 marks "no label".
    label = -1
    if answer:
        original_answer = tokenized_to_original_answer.get(" ".join(answer), "")
        if original_answer:
            label = answers.index(original_answer)
    y = torch.tensor([label])

    return Data(x=x, edge_index=edge_index, y=y)

# Turn every tokenized (question, answer) pair into one graph sample.
graph_data_list = [
    create_graph_data(question_tokens, answer_tokens, vocab)
    for question_tokens, answer_tokens in zip(tokens_questions, tokens_answers)
]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ALVIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import DataLoader

# Define a basic GNN model
class BasicGNN(nn.Module):
    """Per-node two-layer MLP followed by mean pooling over all nodes.

    Args:
        input_dim: size of each node feature vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of scores produced for the graph.

    ``forward`` expects an object exposing ``x`` (node features) and
    ``edge_index``; it returns a 1-D tensor of ``output_dim`` scores.
    The edges are read but take no part in the computation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        node_features = data.x
        _unused_edges = data.edge_index  # accessed only; not used below

        # Linear -> ReLU -> Linear applied to every node independently.
        node_scores = self.fc2(self.relu(self.fc1(node_features)))

        # Average the node scores to get one vector for the whole graph.
        return node_scores.mean(dim=0)

# Parameters
SEED = 42  # fixes weight initialisation and shuffling order for reproducible runs
input_dim = len(vocab)
hidden_dim = 64
output_dim = len(answers)

torch.manual_seed(SEED)

# Create the model, loss function, and optimizer
model = BasicGNN(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Create a DataLoader (one graph per step; the model pools over all nodes it
# receives, so each step must contain exactly one graph)
loader = DataLoader(graph_data_list, batch_size=1, shuffle=True)

# Training loop
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    trained_steps = 0
    # The loop variable is deliberately not called `data`, so the DataFrame
    # loaded in the first cell is not overwritten.
    for graph_batch in loader:
        # Skip samples that cannot be trained on: a negative label is outside
        # the class range accepted by CrossEntropyLoss, and a graph without
        # nodes would make the mean-pooled output NaN.
        if graph_batch.x.size(0) == 0 or bool((graph_batch.y < 0).any()):
            continue

        optimizer.zero_grad()
        output = model(graph_batch)
        # The model returns a 1-D score vector; add the batch dimension the
        # loss function expects.
        loss = criterion(output.unsqueeze(0), graph_batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        trained_steps += 1

    # Average over the steps actually taken (guards against an empty loader).
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / max(trained_steps, 1)}')

print("Training complete.")




Epoch 1/10, Loss: 4.067059984573951
Epoch 2/10, Loss: 3.983965658224546
Epoch 3/10, Loss: 3.9730625381836524
Epoch 4/10, Loss: 3.918081132265238
Epoch 5/10, Loss: 3.799857932787675
Epoch 6/10, Loss: 3.4244254048054037
Epoch 7/10, Loss: 2.9509711746986094
Epoch 8/10, Loss: 2.292486616052114
Epoch 9/10, Loss: 1.6367898617799466
Epoch 10/10, Loss: 1.0331694674319947
Training complete.


In [3]:
import torch
from nltk.tokenize import word_tokenize
from torch_geometric.data import Data

# Define a function to preprocess and create graph data for a new question
def preprocess_question(question, vocab):
    """Convert a raw question string into a graph for inference.

    The question is lower-cased and tokenized; every token found in ``vocab``
    becomes a node with a one-hot feature row, and consecutive nodes are
    linked in both directions.

    Args:
        question: the question text.
        vocab: dict mapping token -> column index of its one-hot feature.

    Returns:
        A ``Data`` object with ``x`` of shape (num_nodes, len(vocab)) and
        ``edge_index`` of shape (2, num_edges). When no token is in ``vocab``
        the graph has zero nodes and zero edges.

    Note:
        Unknown tokens are removed *before* nodes are numbered, so the edges
        always refer to rows that exist in ``x``.
    """
    tokens_question = word_tokenize(question.lower())
    token_ids = [vocab[w] for w in tokens_question if w in vocab]
    num_nodes = len(token_ids)

    # One-hot features built in a single allocation (instead of creating a
    # full identity matrix for every token).
    x = torch.zeros((num_nodes, len(vocab)))
    if token_ids:
        x[torch.arange(num_nodes), torch.tensor(token_ids, dtype=torch.long)] = 1.0

    # Bidirectional chain over the kept tokens.
    edge_pairs = []
    for node in range(num_nodes - 1):
        edge_pairs.append([node, node + 1])
        edge_pairs.append([node + 1, node])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # Keep the (2, num_edges) layout even when there are no edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=x, edge_index=edge_index)

# Function to predict the answer for a given question
def predict_answer(question, model, vocab, answers,
                   fallback="Sorry, I could not find an answer to that question."):
    """Return the answer the model ranks highest for ``question``.

    Args:
        question: the question text.
        model: trained model; called with the graph built from the question
            and expected to return one score per entry of ``answers``.
        vocab: dict mapping token -> feature index, as used during training.
        answers: list of candidate answers, indexed by the model's output.
        fallback: text returned when none of the question's tokens are in
            ``vocab``. In that case the graph has no nodes, the mean-pooled
            scores are undefined, and any arg-max over them would be
            meaningless.

    Returns:
        The selected entry of ``answers``, or ``fallback``.

    Side effects:
        Switches ``model`` to evaluation mode.
    """
    model.eval()
    with torch.no_grad():
        graph_data = preprocess_question(question, vocab)
        if graph_data.x.size(0) == 0:
            return fallback
        output = model(graph_data)
        predicted_index = torch.argmax(output).item()
    return answers[predicted_index]

# Demo: ask the trained model a single question and show what it answers.
new_question = "maryamah"
predicted_answer = predict_answer(
    question=new_question,
    model=model,
    vocab=vocab,
    answers=answers,
)
print(
    f"Question: {new_question}",
    f"Predicted Answer: {predicted_answer}",
    sep="\n",
)

Question: maryamah
Predicted Answer: Maryamah is a lecturer and researcher at Data Science Technology, Department of Engineering, Faculty of Advanced Technology and Multidiscipline, Universitas Airlangga.\nShe received a Bachelor degree from Informatics Engineering, Brawijaya University. Then, Master degree from Institut Teknologi Sepuluh Nopember (Surabaya, Indonesia), majors in Informatics Engineering in 2018. Futhermore, she received a Doctoral degree from Institut Teknologi Sepuluh Nopember with major in Computer Science degree. Her research interests are in Natural Language Processing, Information Retrieval, Big Data, Data Science, Artificial Intelligence, and Optimization. Her current projects related to Natural Language Processing and Information Retrieval.\nNIK : 199507012022103201\nNama : Dr. Maryamah, S.Kom.\n\nPendidikan : S3 Ilmu Komputer, Institut Teknologi Sepuluh Nopember\nResearch Interest : Natural Language Processing, Information Retrieval, Big Data, Data Science, Art