In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Load Data
def load_data(file_path):
    data = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            json_line = json.loads(line)
            sent_text = json_line['sentText']
            for relation in json_line['relationMentions']:
                label = relation['label']
                data.append(sent_text)
                labels.append(label)
    return data, labels

# Load datasets
train_texts, train_labels = load_data(r'C:\Users\aryan\Downloads\dataset\train.json')
valid_texts, valid_labels = load_data(r'C:\Users\aryan\Downloads\dataset\valid.json')
test_texts, test_labels = load_data(r'C:\Users\aryan\Downloads\dataset\test.json')

In [3]:
# Convert labels to numerical format
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
valid_labels = label_encoder.transform(valid_labels)
test_labels = label_encoder.transform(test_labels)

num_classes = len(label_encoder.classes_)

In [4]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words
X_train = vectorizer.fit_transform(train_texts).toarray()
X_valid = vectorizer.transform(valid_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()

In [5]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

y_train_tensor = torch.tensor(train_labels, dtype=torch.long).to(device)
y_valid_tensor = torch.tensor(valid_labels, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(test_labels, dtype=torch.long).to(device)

In [6]:
# Create PyTorch Dataset
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [7]:
# Create DataLoaders
batch_size = 32
train_loader = DataLoader(TextDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(TextDataset(X_valid_tensor, y_valid_tensor), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TextDataset(X_test_tensor, y_test_tensor), batch_size=batch_size, shuffle=False)

In [8]:
# Define the SVM Model
class SVM(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SVM, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)  # Linear classifier

    def forward(self, x):
        return self.fc(x)  # No activation (SVM uses raw scores)

In [9]:
# Initialize model
model = SVM(X_train.shape[1], num_classes).to(device)

In [10]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # Using cross-entropy loss (softmax approx. for hinge loss)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

In [11]:
# Training Function
def train_model(model, train_loader, valid_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation accuracy
        val_accuracy = evaluate(model, valid_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


In [12]:
# Evaluation Function
def evaluate(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    return accuracy_score(all_labels, all_preds)

In [13]:
# Train the model
train_model(model, train_loader, valid_loader, epochs=10)

Epoch 1/10, Loss: 6538.7847, Validation Accuracy: 0.4750
Epoch 2/10, Loss: 5784.6954, Validation Accuracy: 0.4750
Epoch 3/10, Loss: 5735.5888, Validation Accuracy: 0.4750
Epoch 4/10, Loss: 5716.9957, Validation Accuracy: 0.4750
Epoch 5/10, Loss: 5709.7772, Validation Accuracy: 0.4750
Epoch 6/10, Loss: 5708.3104, Validation Accuracy: 0.4750
Epoch 7/10, Loss: 5707.1401, Validation Accuracy: 0.4750
Epoch 8/10, Loss: 5706.3828, Validation Accuracy: 0.4750
Epoch 9/10, Loss: 5706.4182, Validation Accuracy: 0.4750
Epoch 10/10, Loss: 5706.3988, Validation Accuracy: 0.4750


In [14]:
# Test the model
test_accuracy = evaluate(model, test_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.4711


In [15]:
# Save the model
torch.save(model.state_dict(), "svm_text_classifier.pth")

In [16]:

# Function to predict new text
def predict(text):
    model.eval()
    with torch.no_grad():
        encoded_text = vectorizer.transform([text]).toarray()
        tensor_text = torch.tensor(encoded_text, dtype=torch.float32).to(device)
        output = model(tensor_text)
        _, pred = torch.max(output, dim=1)
        return label_encoder.inverse_transform([pred.cpu().item()])[0]

# Example prediction
print(f"Predicted Relationship: {predict('Barack Obama was the president of the USA.')}")

Predicted Relationship: /location/location/contains


In [17]:
import pickle

# Save the vectorizer
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# Save the label encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)