In [3]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, Dataset
import requests
from bs4 import BeautifulSoup as bs

In [5]:
def parse_url_MICROSOFT(url):
    response = requests.get(url) 
    soup = bs(response.content, "lxml") 
    article = soup.find(name="article", class_ = "ocpArticleContent")

    for script in article(["script", "style"]):
            script.extract()


        # Get textual content
    text = article.get_text(separator=' ')
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    if not text:
        return f" <h3> Article not found with the url:{url} <h3>"
    return text.prettify() 

# Step 1: Extract Text from HTML Pages
def extract_text_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = bs(response.text, 'html.parser')

        # Remove script and style content
        for script in soup(["script", "style"]):
            script.extract()

        # Get textual content
        text = soup.get_text(separator=' ')
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text
    return None



# Step 2: Dataset Preparation
class QADataset(Dataset):
    def __init__(self, questions, urls, tokenizer):
        self.questions = questions
        self.urls = urls
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        html_text = parse_url_MICROSOFT(self.urls[idx])
        # Tokenize question and HTML content
        inputs = self.tokenizer(question, html_text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
        
        # In a real scenario, you would preprocess to get answer start/end positions
        start_positions = torch.tensor(0)  # Placeholder
        end_positions = torch.tensor(10)   # Placeholder
        
        return inputs, start_positions, end_positions

# Step 3: Fine-tune the Model
def train(model, dataloader, optimizer):
    model.train()
    for batch in dataloader:
        inputs, start_positions, end_positions = batch
        
        # Move tensors to the correct device
        input_ids = inputs['input_ids'].squeeze().to(device)
        attention_mask = inputs['attention_mask'].squeeze().to(device)
        token_type_ids = inputs['token_type_ids'].squeeze().to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Loss: {loss.item()}")

# Step 4: Set up the Data and Train
def main():
    # Sample questions and URLs (you can expand this)
    questions = ["Cannot connect to Wi-Fi", "Err Network Access Denied Error"]
    urls = ["https://support.microsoft.com/en-us/help/10741/windows-fix-network-connection-issues", "https://support.microsoft.com/en-us/topic/-access-denied-or-other-errors-when-you-access-or-work-with-files-and-folders-in-windows-219af563-1953-ab4a-f17e-b0182755214e#:~:text=Cause%20An%20%22Access%20Denied%22%20error "]

    # Load pre-trained tokenizer and model (BERT or HTML-aware model)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

    # Prepare dataset and dataloader
    dataset = QADataset(questions, urls, tokenizer)
    dataloader = DataLoader(dataset, batch_size=1)

    # Move model to device (GPU/CPU)
    global device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Set up optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Train the model
    train(model, dataloader, optimizer)

if __name__ == "__main__":
    main()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'str' object has no attribute 'prettify'

cuda
