In [None]:
import pandas as pd
import torch

# Path to the training data, relative to the notebook's working directory
file_path = './dataset/train.json'

# Read the JSON file into a DataFrame
df = pd.read_json(file_path)

# Preview the first few rows of the DataFrame.
# A bare `df.transpose()` call used to precede this line; it was removed because
# its result was discarded (it did not modify `df`, and only the last expression
# of a cell is displayed).
# NOTE(review): if the JSON is keyed by record id and the columns come out as
# records, the intent was probably `df = df.transpose()` - confirm against the data.
df.head()

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

def preprocess_dataset_for_bert(df):
    """Clean the text columns of ``df`` and add a combined ``Context`` column.

    Steps applied:

    * ``Context`` is built as ``Title + ' ' + Body``.
    * English stopwords are removed from ``Comment`` only (``Context`` keeps them).
    * Both columns are tokenized with ``word_tokenize``, lemmatized with
      ``WordNetLemmatizer``, spell-corrected with ``SpellChecker`` and joined
      back into space-separated strings.

    Missing values in ``Title``, ``Body`` or ``Comment`` are treated as empty
    strings instead of crashing the tokenizer.

    Args:
        df: DataFrame with ``Title``, ``Body`` and ``Comment`` columns.

    Returns:
        The same DataFrame object. It is modified in place: ``Context`` is
        added and ``Comment`` is overwritten with its cleaned version, so
        calling this twice on the same frame cleans already-cleaned text.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    spell = SpellChecker()

    # Spell correction is by far the slowest step and the same words repeat
    # constantly, so remember the answer for every distinct token.
    corrections = {}

    def _correct(word):
        """Return the spell-corrected form of ``word`` (or ``word`` itself)."""
        if word not in corrections:
            fixed = spell.correction(word)
            # correction() may return None when it has no suggestion; keep the
            # original token so the final ' '.join() never sees a non-string.
            corrections[word] = word if fixed is None else fixed
        return corrections[word]

    def _clean(text, drop_stopwords=False):
        """Tokenize, optionally drop stopwords, lemmatize and spell-correct."""
        tokens = word_tokenize(text)
        if drop_stopwords:
            kept = [tok for tok in tokens if tok.lower() not in stop_words]
            # Re-tokenize the re-joined text, exactly as the pipeline has
            # always done, so the resulting tokens stay unchanged.
            tokens = word_tokenize(' '.join(kept))
        return ' '.join(_correct(lemmatizer.lemmatize(tok)) for tok in tokens)

    # Concatenate 'Title' and 'Body' columns to add context
    title = df['Title'].fillna('').astype(str)
    body = df['Body'].fillna('').astype(str)
    comment = df['Comment'].fillna('').astype(str)

    df['Context'] = (title + ' ' + body).apply(_clean)
    df['Comment'] = comment.apply(lambda text: _clean(text, drop_stopwords=True))

    return df


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def train_bert_model(df, epochs=3, batch_size=16, max_length=512):
    """Fine-tune ``bert-base-uncased`` on (Context, Comment) pairs.

    The target is the ``'L1: Type'`` column. An 80/20 train/test split
    (``random_state=42``) is used; the model is trained with AdamW
    (``lr=2e-5``) in mini-batches and evaluated on the held-out part.

    Args:
        df: DataFrame with ``Context``, ``Comment`` and ``'L1: Type'`` columns.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for training and evaluation.
        max_length: Pairs longer than this many tokens are truncated.

    Returns:
        ``(model, report)`` where ``report`` is the text produced by
        ``sklearn.metrics.classification_report`` on the test split.

    Raises:
        ValueError: If ``'L1: Type'`` contains missing values.
    """
    # Fall back to CPU instead of crashing on machines without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Map the labels (possibly strings) to contiguous integer ids 0..k-1.
    label_ids, label_names = pd.factorize(df['L1: Type'], sort=True)
    if (label_ids < 0).any():
        raise ValueError("Column 'L1: Type' contains missing labels")
    class_ids = list(range(len(label_names)))

    # Load pre-trained BERT tokenizer and model.
    # num_labels == 1 would switch the head to regression, hence the floor of 2.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=max(len(label_names), 2))

    # Encode each (Context, Comment) pair. Padding/truncation are required to
    # get rectangular tensors out of sequences of different lengths.
    encoded_data = tokenizer(df['Context'].astype(str).tolist(),
                             df['Comment'].astype(str).tolist(),
                             add_special_tokens=True,
                             padding=True,
                             truncation=True,
                             max_length=max_length,
                             return_attention_mask=True,
                             return_token_type_ids=True,
                             return_tensors='pt')
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    token_type_ids = encoded_data['token_type_ids']
    labels = torch.tensor(label_ids, dtype=torch.long)

    # Split row indices once so inputs, masks, segments and labels can never
    # get out of step with each other.
    train_idx, test_idx = train_test_split(list(range(len(labels))),
                                           test_size=0.2,
                                           random_state=42)
    train_idx = torch.tensor(train_idx, dtype=torch.long)
    test_idx = torch.tensor(test_idx, dtype=torch.long)

    train_data = torch.utils.data.TensorDataset(input_ids[train_idx],
                                                attention_masks[train_idx],
                                                token_type_ids[train_idx],
                                                labels[train_idx])
    test_data = torch.utils.data.TensorDataset(input_ids[test_idx],
                                               attention_masks[test_idx],
                                               token_type_ids[test_idx],
                                               labels[test_idx])
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, shuffle=True,
        generator=torch.Generator().manual_seed(42))
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    # Fine-tune BERT model on training data, one mini-batch at a time
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            ids, mask, segments, targets = (t.to(device) for t in batch)
            optimizer.zero_grad()
            outputs = model(ids,
                            token_type_ids=segments,
                            attention_mask=mask,
                            labels=targets)
            outputs.loss.backward()
            optimizer.step()

    # Evaluate BERT model on test data; no_grad avoids building autograd graphs
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for batch in test_loader:
            ids, mask, segments, _ = (t.to(device) for t in batch)
            logits = model(ids,
                           token_type_ids=segments,
                           attention_mask=mask).logits
            batch_preds.append(logits.argmax(dim=1).cpu())
    preds = torch.cat(batch_preds).numpy()
    test_labels = labels[test_idx].numpy()
    report = classification_report(test_labels, preds,
                                   labels=class_ids,
                                   target_names=[str(name) for name in label_names],
                                   zero_division=0)

    return model, report

In [None]:
# Build the 'Context' column and clean the text if that has not happened yet.
# The guard keeps the cell re-runnable: preprocessing overwrites 'Comment'
# in place, so it must not be applied a second time.
if 'Context' not in df.columns:
    df = preprocess_dataset_for_bert(df)

# Call the train_bert_model() function.
# The report is stored under its own name so that it does not shadow
# sklearn's classification_report(), which the training functions call.
trained_model, bert_report = train_bert_model(df)

# Print the classification report
print("Classification Report:\n", bert_report)


In [None]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def train_albert_model(df, epochs=3, batch_size=16, max_length=512):
    """Fine-tune ``albert-base-v2`` on (Context, Comment) pairs.

    The target is the ``'L1: Type'`` column. An 80/20 train/test split
    (``random_state=42``) is used; the model is trained with AdamW
    (``lr=2e-5``) in mini-batches and evaluated on the held-out part.

    Args:
        df: DataFrame with ``Context``, ``Comment`` and ``'L1: Type'`` columns.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for training and evaluation.
        max_length: Pairs longer than this many tokens are truncated.

    Returns:
        ``(model, report)`` where ``report`` is the text produced by
        ``sklearn.metrics.classification_report`` on the test split.

    Raises:
        ValueError: If ``'L1: Type'`` contains missing values.
    """
    # Fall back to CPU instead of crashing on machines without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Map the labels (possibly strings) to contiguous integer ids 0..k-1.
    label_ids, label_names = pd.factorize(df['L1: Type'], sort=True)
    if (label_ids < 0).any():
        raise ValueError("Column 'L1: Type' contains missing labels")
    class_ids = list(range(len(label_names)))

    # Load pre-trained ALBERT tokenizer and model.
    # num_labels == 1 would switch the head to regression, hence the floor of 2.
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    model = AlbertForSequenceClassification.from_pretrained(
        'albert-base-v2', num_labels=max(len(label_names), 2))

    # Encode each (Context, Comment) pair. Padding/truncation are required to
    # get rectangular tensors out of sequences of different lengths.
    encoded_data = tokenizer(df['Context'].astype(str).tolist(),
                             df['Comment'].astype(str).tolist(),
                             add_special_tokens=True,
                             padding=True,
                             truncation=True,
                             max_length=max_length,
                             return_attention_mask=True,
                             return_token_type_ids=True,
                             return_tensors='pt')
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    token_type_ids = encoded_data['token_type_ids']
    labels = torch.tensor(label_ids, dtype=torch.long)

    # Split row indices once so inputs, masks, segments and labels can never
    # get out of step with each other.
    train_idx, test_idx = train_test_split(list(range(len(labels))),
                                           test_size=0.2,
                                           random_state=42)
    train_idx = torch.tensor(train_idx, dtype=torch.long)
    test_idx = torch.tensor(test_idx, dtype=torch.long)

    train_data = torch.utils.data.TensorDataset(input_ids[train_idx],
                                                attention_masks[train_idx],
                                                token_type_ids[train_idx],
                                                labels[train_idx])
    test_data = torch.utils.data.TensorDataset(input_ids[test_idx],
                                               attention_masks[test_idx],
                                               token_type_ids[test_idx],
                                               labels[test_idx])
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, shuffle=True,
        generator=torch.Generator().manual_seed(42))
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    # Fine-tune ALBERT model on training data, one mini-batch at a time
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            ids, mask, segments, targets = (t.to(device) for t in batch)
            optimizer.zero_grad()
            outputs = model(ids,
                            token_type_ids=segments,
                            attention_mask=mask,
                            labels=targets)
            outputs.loss.backward()
            optimizer.step()

    # Evaluate ALBERT model on test data; no_grad avoids building autograd graphs
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for batch in test_loader:
            ids, mask, segments, _ = (t.to(device) for t in batch)
            logits = model(ids,
                           token_type_ids=segments,
                           attention_mask=mask).logits
            batch_preds.append(logits.argmax(dim=1).cpu())
    preds = torch.cat(batch_preds).numpy()
    test_labels = labels[test_idx].numpy()
    report = classification_report(test_labels, preds,
                                   labels=class_ids,
                                   target_names=[str(name) for name in label_names],
                                   zero_division=0)

    return model, report


In [None]:
# Build the 'Context' column and clean the text if that has not happened yet.
# The guard keeps the cell re-runnable: preprocessing overwrites 'Comment'
# in place, so it must not be applied a second time.
if 'Context' not in df.columns:
    df = preprocess_dataset_for_bert(df)

# Call the train_albert_model() function.
# The report is stored under its own name so that it does not shadow
# sklearn's classification_report(), which the training functions call.
trained_model, albert_report = train_albert_model(df)

# Print the classification report
print("Classification Report:\n", albert_report)

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def train_roberta_model(df, epochs=3, batch_size=16, max_length=512):
    """Fine-tune ``roberta-base`` on (Context, Comment) pairs.

    The target is the ``'L1: Type'`` column. An 80/20 train/test split
    (``random_state=42``) is used; the model is trained with AdamW
    (``lr=2e-5``) in mini-batches and evaluated on the held-out part.

    Args:
        df: DataFrame with ``Context``, ``Comment`` and ``'L1: Type'`` columns.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for training and evaluation.
        max_length: Pairs longer than this many tokens are truncated.

    Returns:
        ``(model, report)`` where ``report`` is the text produced by
        ``sklearn.metrics.classification_report`` on the test split.

    Raises:
        ValueError: If ``'L1: Type'`` contains missing values.
    """
    # Fall back to CPU instead of crashing on machines without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Map the labels (possibly strings) to contiguous integer ids 0..k-1.
    label_ids, label_names = pd.factorize(df['L1: Type'], sort=True)
    if (label_ids < 0).any():
        raise ValueError("Column 'L1: Type' contains missing labels")
    class_ids = list(range(len(label_names)))

    # Load pre-trained RoBERTa tokenizer and model.
    # num_labels == 1 would switch the head to regression, hence the floor of 2.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', num_labels=max(len(label_names), 2))

    # Encode each (Context, Comment) pair. Padding/truncation are required to
    # get rectangular tensors out of sequences of different lengths.
    encoded_data = tokenizer(df['Context'].astype(str).tolist(),
                             df['Comment'].astype(str).tolist(),
                             add_special_tokens=True,
                             padding=True,
                             truncation=True,
                             max_length=max_length,
                             return_attention_mask=True,
                             return_tensors='pt')
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(label_ids, dtype=torch.long)

    # Split row indices once so inputs, masks and labels can never get out of
    # step with each other.
    train_idx, test_idx = train_test_split(list(range(len(labels))),
                                           test_size=0.2,
                                           random_state=42)
    train_idx = torch.tensor(train_idx, dtype=torch.long)
    test_idx = torch.tensor(test_idx, dtype=torch.long)

    train_data = torch.utils.data.TensorDataset(input_ids[train_idx],
                                                attention_masks[train_idx],
                                                labels[train_idx])
    test_data = torch.utils.data.TensorDataset(input_ids[test_idx],
                                               attention_masks[test_idx],
                                               labels[test_idx])
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, shuffle=True,
        generator=torch.Generator().manual_seed(42))
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    # Fine-tune RoBERTa model on training data, one mini-batch at a time
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            ids, mask, targets = (t.to(device) for t in batch)
            optimizer.zero_grad()
            outputs = model(ids,
                            token_type_ids=None,
                            attention_mask=mask,
                            labels=targets)
            outputs.loss.backward()
            optimizer.step()

    # Evaluate RoBERTa model on test data; no_grad avoids building autograd graphs
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for batch in test_loader:
            ids, mask, _ = (t.to(device) for t in batch)
            logits = model(ids,
                           token_type_ids=None,
                           attention_mask=mask).logits
            batch_preds.append(logits.argmax(dim=1).cpu())
    preds = torch.cat(batch_preds).numpy()
    test_labels = labels[test_idx].numpy()
    report = classification_report(test_labels, preds,
                                   labels=class_ids,
                                   target_names=[str(name) for name in label_names],
                                   zero_division=0)

    return model, report


In [None]:
# Build the 'Context' column and clean the text if that has not happened yet.
# The guard keeps the cell re-runnable: preprocessing overwrites 'Comment'
# in place, so it must not be applied a second time.
if 'Context' not in df.columns:
    df = preprocess_dataset_for_bert(df)

# Call the train_roberta_model() function.
# The report is stored under its own name so that it does not shadow
# sklearn's classification_report(), which the training functions call.
trained_model, roberta_report = train_roberta_model(df)

# Print the classification report
print("Classification Report:\n", roberta_report)