In [1]:
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load Data from JSON Files
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            json_line = json.loads(line)
            sent_text = json_line['sentText']
            for relation in json_line['relationMentions']:
                label = relation['label']
                data.append((sent_text, label))
    return pd.DataFrame(data, columns=['sentence', 'label'])

train_df = load_data(r'C:\Users\aryan\Downloads\dataset\train.json')
valid_df = load_data(r'C:\Users\aryan\Downloads\dataset\valid.json')
test_df = load_data(r'C:\Users\aryan\Downloads\dataset\test.json')
train_df

Unnamed: 0,sentence,label
0,Massachusetts ASTON MAGNA Great Barrington ; a...,/location/location/contains
1,North Carolina EASTERN MUSIC FESTIVAL Greensbo...,/location/location/contains
2,It will be the final movie credited to Debra H...,/people/person/place_of_birth
3,In a 3-0 victory over the Crew on Saturday in ...,/location/location/contains
4,"The United States ambassador to Mexico , Tony ...",/location/location/contains
...,...,...
94217,Here we have a 172-acre island with four-star ...,/location/location/contains
94218,One was for St. Francis de Sales Roman Catholi...,/location/neighborhood/neighborhood_of
94219,One was for St. Francis de Sales Roman Catholi...,/location/location/contains
94220,One was for St. Francis de Sales Roman Catholi...,/location/neighborhood/neighborhood_of


In [4]:
train_df.to_csv('train.csv', index= False)

In [3]:
train_df['label'].value_counts()

label
/location/location/contains                           45608
/people/person/nationality                             7198
/location/country/capital                              6718
/people/person/place_lived                             6362
/location/country/administrative_divisions             5658
/location/administrative_division/country              5658
/business/person/company                               4950
/location/neighborhood/neighborhood_of                 4941
/people/person/place_of_birth                          2813
/people/deceased_person/place_of_death                 1752
/business/company/founders                              712
/people/person/children                                 441
/business/company/place_founded                         360
/business/company_shareholder/major_shareholder_of      249
/business/company/major_shareholders                    249
/sports/sports_team/location                            186
/sports/sports_team_location/teams

In [4]:
train_df[train_df['label'].str.contains("business")]['label'].value_counts()

label
/business/person/company                              4950
/business/company/founders                             712
/business/company/place_founded                        360
/business/company_shareholder/major_shareholder_of     249
/business/company/major_shareholders                   249
/business/company/advisors                              39
/business/company/industry                               1
Name: count, dtype: int64

In [5]:
#Change the first letter to an empty string in the label column
train_df['label'] = train_df['label'].str[1:]
test_df['label'] = test_df['label'].str[1:]
valid_df['label'] = valid_df['label'].str[1:]
train_df

Unnamed: 0,sentence,label
0,Massachusetts ASTON MAGNA Great Barrington ; a...,location/location/contains
1,North Carolina EASTERN MUSIC FESTIVAL Greensbo...,location/location/contains
2,It will be the final movie credited to Debra H...,people/person/place_of_birth
3,In a 3-0 victory over the Crew on Saturday in ...,location/location/contains
4,"The United States ambassador to Mexico , Tony ...",location/location/contains
...,...,...
94217,Here we have a 172-acre island with four-star ...,location/location/contains
94218,One was for St. Francis de Sales Roman Catholi...,location/neighborhood/neighborhood_of
94219,One was for St. Francis de Sales Roman Catholi...,location/location/contains
94220,One was for St. Francis de Sales Roman Catholi...,location/neighborhood/neighborhood_of


In [6]:
# Preprocessing
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
label_encoder = {label: i for i, label in enumerate(train_df['label'].unique())}
num_labels = len(label_encoder)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [7]:
import numpy as np

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(list(label_encoder.values())),  # Convert list to NumPy array
    y=train_df['label'].map(label_encoder).to_numpy()  # Convert labels to NumPy array
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [8]:
# Data Processing Function
def preprocess_data(row):
    encoded_sentence = tokenizer.encode_plus(
        row['sentence'],
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    ).to(device)

    return {
        'input_ids': encoded_sentence['input_ids'].squeeze(),
        'attention_mask': encoded_sentence['attention_mask'].squeeze(),
        'label': torch.tensor(label_encoder[row['label']], dtype=torch.long)
    }

In [9]:
# Convert DataFrames to Processed Data
train_data = train_df.apply(preprocess_data, axis=1).tolist()
val_data = valid_df.apply(preprocess_data, axis=1).tolist()
test_data = test_df.apply(preprocess_data, axis=1).tolist()

In [10]:
# Create Custom Dataset
class RelationshipDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_dataset = RelationshipDataset(train_data)
val_dataset = RelationshipDataset(val_data)
test_dataset = RelationshipDataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [11]:
# Model Definition
class RelationshipExtractionModel(torch.nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits

In [12]:
# Initialize Model, Optimizer, and Scheduler
model = RelationshipExtractionModel(num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_training_steps = len(train_dataloader) * 3  # Set to 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training Function
def train_model(model, train_dataloader, val_dataloader, epochs=3):
    model.train()
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            lr_scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': total_loss / (progress_bar.n + 1)})

        val_accuracy = evaluate(model, val_dataloader)
        print(f"Epoch {epoch+1}/{epochs} completed. Validation Accuracy: {val_accuracy:.4f}")


In [14]:
# Evaluation Function
def evaluate(model, dataloader):
    model.eval()
    total_preds, total_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)

            total_preds.extend(preds.cpu().tolist())
            total_labels.extend(labels.cpu().tolist())

    return accuracy_score(total_labels, total_preds)

In [15]:
# Train the Model
train_model(model, train_dataloader, val_dataloader,epochs=3)

Epoch 1/3: 100%|██████████| 2945/2945 [3:38:14<00:00,  4.45s/it, loss=1.35]  


Epoch 1/3 completed. Validation Accuracy: 0.5852


Epoch 2/3: 100%|██████████| 2945/2945 [4:08:34<00:00,  5.06s/it, loss=0.839]  


Epoch 2/3 completed. Validation Accuracy: 0.6239


Epoch 3/3: 100%|██████████| 2945/2945 [3:58:28<00:00,  4.86s/it, loss=0.683]  


Epoch 3/3 completed. Validation Accuracy: 0.6373


In [16]:
# Test the Model
test_accuracy = evaluate(model, test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.6221


In [17]:
# Prediction Function (Now Takes Only Sentence)
def predict_relationship(sentence):
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    ).to(device)

    input_ids = encoded_sentence['input_ids']
    attention_mask = encoded_sentence['attention_mask']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)

    reverse_label_encoder = {v: k for k, v in label_encoder.items()}
    return reverse_label_encoder[preds.item()]

In [19]:
# Example Predictions
print(f"Predicted Relationship: {predict_relationship('Paris is the capital of France.')}")
print(f"Predicted Relationship: {predict_relationship('Bobby Fischer played chess in Iceland.')}")

Predicted Relationship: location/administrative_division/country
Predicted Relationship: people/person/nationality


In [21]:
# Save the trained model
torch.save(model.state_dict(), "relationship_extraction_model.pth")
print("Model saved as 'relationship_extraction_model.pth'")

Model saved as 'relationship_extraction_model.pth'
