# **Import the dependencies and setup API keys**

In [None]:

!pip install python-dotenv
!pip install numpy
!pip install pandas
!pip install firecrawl-py
!pip install jsonlines

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting firecrawl-py
  Downloading firecrawl_py-1.10.2-py3-none-any.whl.metadata (10 kB)
Downloading firecrawl_py-1.10.2-py3-none-any.whl (18 kB)
Installing collected packages: firecrawl-py
Successfully installed firecrawl-py-1.10.2
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
import json
import firecrawl
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import Any, Optional, List


In [None]:
from google.colab import userdata
api_key=userdata.get('FIRECRAWL_API_KEY')

# **Selecting some high quality data source**


1.  https://www.geeksforgeeks.org/best-html-coding-practices-you-must-know/
2. https://medium.com/@wewillcode/top-12-common-errors-that-you-encounter-while-using-html-f2909e3b1d48




# **Scraping and formatting the data from sites to json**

In [None]:
# Install with pip install firecrawl-py
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import Any, Optional, List

app = FirecrawlApp(api_key=api_key)

class NestedModel1(BaseModel):
    sub_heading: str
    content: str

class ExtractSchema(BaseModel):
    table: list[NestedModel1]

data = app.extract([
  "https://geeksforgeeks.org/best-html-coding-practices-you-must-know",
  # "https://medium.com/@wewillcode/top-12-common-errors-that-you-encounter-while-using-html-f2909e3b1d48"
], {
    'prompt': 'Extract sub headings and their corresponding content. Present the data in a tabular format below the "12 Best HTML Coding Practices You Must Know" section upto "Conclusion"',
    'schema': ExtractSchema.model_json_schema(),
})

# **Data Exploration**

In [None]:
df=pd.read_csv("/content/gfg_data.csv")
df.head()

Unnamed: 0,content,example,practice
0,HTML has a nature that will still render your ...,HTML\n<!DOCTYPE html>\n<html>\n<head>\n <ti...,Use Proper Document Structure With Doctype
1,To avoid validation and compatibility issues d...,HTML\n<div>\n <div>\n <div>\n ...,Close the Tags
2,Make a habit of using lowercase for all the ta...,HTML\n<!-- Wrong practice-->\n<SECTION>\n<p>Th...,Write Tags in Lowercase
3,When you add an image to your HTML code don’t ...,"HTML\n<!-- Wrong Practice-->\n<img src=""html5....",Add Image Attributes
4,A lot of newbies make the mistake they adding ...,"HTML\n<!-- Wrong Practice -->\n<p style=""color...",Avoid Using Inline Styles


In [None]:
df['practice'][0]

'Use Proper Document Structure With Doctype'

In [None]:
df['example'][0]

'HTML\n<!DOCTYPE html>\n<html>\n<head>\n    <title>Hello World</title>\n</head>\n<body>\n    <h1>Welcome Programmers</h1>\n<p>This website is GeeksforGeeks.</p>\n</body>\n</html>'

In [None]:
df['content'][0]

'HTML has a nature that will still render your markup correctly even if you forget to mention some elements such as <html>, <head>, <body>, and <!DOCTYPE html>. You will see the correct result in your browser as you want but that doesn’t mean you will find the same result in every browser. To avoid this issue it’s a good habit to follow a proper document structure with the correct doctype.'

In [None]:
import jsonlines

with jsonlines.open('/content/gfg_data.jsonl') as reader:
    for obj in reader:

        print(obj['instruction'])

Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:
Detect errors in this HTML and suggest corrections:


In [None]:
!pip install torch transformers datasets peft accelerate bitsandbytes


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from 

In [1]:
import json
import re
import time
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

# Enhanced Training Data
data = [
    # Original examples
    {"bad_code": "<div>\n    <div>\n        <div>\n            <p>Hello Programmers</p>\n            <ul>\n                <li>Array<li>\n                <li>Linked List<li>\n                <1i>Stack<li>\n           </ul>\n       </div>\n    </div>\n</div>",
     "good_code": "<div>\n    <div>\n        <div>\n            <p>Hello Programmers</p>\n            <ul>\n                <li>Array</li>\n                <li>Linked List</li>\n                <li>Stack</li>\n           </ul>\n       </div>\n    </div>\n</div>"},
    {"bad_code": "<SECTION>\n<p>This is a paragraph.</p>\n</SECTION>",
     "good_code": "<section>\n<p>This is a paragraph.</p>\n</section>"},
    {"bad_code": "<img src=\"html5.gif\">",
     "good_code": "<img src=\"html5.gif\" alt=\"HTML5\" style=\"width:100px;height:100px;\">"},
    {"bad_code": "<p style=\"color: #393; font-size: 24px;\">Thank you!</p>",
     "good_code": "<p class=\"alert-success\">Thank you!</p>"},
    {"bad_code": "<span class=\"heading\"><strong>Hello Geeks</strong></span><br><br>",
     "good_code": "<h1>Hello Geeks</h1>"},
    {"bad_code": "<ul><li>Item1</li><li>Item2</li></ul>",
     "good_code": "<ul>\n    <li>Item1</li>\n    <li>Item2</li>\n</ul>"},
    {"bad_code": "<DIV CLASS='container'>Content</DIV>",
     "good_code": "<div class=\"container\">Content</div>"},
    {"bad_code": "<a HREF='link.html'>Click</A>",
     "good_code": "<a href=\"link.html\">Click</a>"}
]

# Configuration
MODEL_NAME = "t5-small"
EPOCHS = 15
BATCH_SIZE = 2
MAX_LENGTH = 512

# Initialize Model and Tokenizer
tokenizer = T5Tokenizer.from_pretrained(
    MODEL_NAME,
    model_max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    extra_ids=0
)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

def preprocess_data(data):
    """Create input-output pairs with error prompts"""
    inputs = []
    outputs = []
    for sample in data:
        inputs.append(f"fix html: {sample['bad_code']}")
        outputs.append(sample['good_code'])
    return inputs, outputs

def validate_html(html):
    """Basic HTML validation using BeautifulSoup"""
    try:
        BeautifulSoup(html, 'html.parser')
        return True
    except:
        return False

class HTMLDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        source = tokenizer(
            self.inputs[idx],
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        target = tokenizer(
            self.outputs[idx],
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze()
        }

def train_model():
    """Enhanced training loop with batching and validation"""
    inputs, outputs = preprocess_data(data)
    dataset = HTMLDataset(inputs, outputs)
    loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    best_loss = float('inf')
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for batch in loader:
            optimizer.zero_grad()

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(loader)
        print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), "best_model.pth")

def post_process(html):
    """Clean up generated HTML"""
    html = re.sub(r'</?\w+', lambda m: m.group(0).lower(), html)
    html = re.sub(r'</?(html|head|body)>', '', html)

    self_closing_tags = ['img', 'br', 'hr', 'meta', 'link']
    for tag in self_closing_tags:
        html = re.sub(fr'<{tag}(.*?)(?<!/)>', fr'<{tag}\1/>', html)

    if validate_html(html):
        return BeautifulSoup(html, 'html.parser').prettify()
    return html

def normalize_html(html):
    """Normalize HTML for comparison"""
    soup = BeautifulSoup(html, 'html.parser')
    normalized = str(soup).lower()
    normalized = re.sub(r'\s+', '', normalized)
    return normalized

def correct_html(html_code):
    """Generate corrected HTML with post-processing"""
    input_text = f"fix html: {html_code}"
    input_ids = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=MAX_LENGTH,
        truncation=True
    ).input_ids

    outputs = model.generate(
        input_ids,
        max_length=MAX_LENGTH,
        num_beams=5,
        early_stopping=True,
        repetition_penalty=2.5,
        length_penalty=1.2
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return post_process(decoded)

if __name__ == "__main__":
    # Split data (for proper evaluation)
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Train the model
    train_model()

    # Load best model
    model.load_state_dict(torch.load("best_model.pth"))

    # Prepare test data
    test_pairs = [(sample['bad_code'], sample['good_code']) for sample in test_data]

    # Evaluation metrics
    total = len(test_pairs)
    correct = 0
    total_time = 0.0

    for bad_html, expected_good in test_pairs:
        start_time = time.time()
        generated = correct_html(bad_html)
        total_time += time.time() - start_time

        norm_gen = normalize_html(generated)
        norm_exp = normalize_html(expected_good)

        if norm_gen == norm_exp:
            correct += 1

        print(f"\nInput HTML: {bad_html}")
        print(f"Generated HTML: {generated}")
        print(f"Expected HTML: {expected_good}")
        print("-" * 60)

    # Calculate and display metrics
    accuracy = (correct / total) * 100
    avg_time = total_time / total

    print(f"\nEvaluation Results:")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Average Inference Time: {avg_time:.4f} seconds")
    print(f"Test Cases Processed: {total}")

    # Example new input prediction
    new_input = "<DIV CLASS='header'><img src='logo.jpg'></DIV>"
    print(f"\nNew Input Prediction for: {new_input}")
    print(correct_html(new_input))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/15 - Loss: 14.3240
Epoch 2/15 - Loss: 13.0172
Epoch 3/15 - Loss: 11.4250
Epoch 4/15 - Loss: 10.8667
Epoch 5/15 - Loss: 10.6112
Epoch 6/15 - Loss: 12.3226
Epoch 7/15 - Loss: 10.1110
Epoch 8/15 - Loss: 8.6645
Epoch 9/15 - Loss: 6.4668
Epoch 10/15 - Loss: 5.7969
Epoch 11/15 - Loss: 5.9201
Epoch 12/15 - Loss: 5.4761
Epoch 13/15 - Loss: 6.0711
Epoch 14/15 - Loss: 5.7109
Epoch 15/15 - Loss: 4.1510


  model.load_state_dict(torch.load("best_model.pth"))



Input HTML: <SECTION>
<p>This is a paragraph.</p>
</SECTION>
Generated HTML: not_duplicate

Expected HTML: <section>
<p>This is a paragraph.</p>
</section>
------------------------------------------------------------


  BeautifulSoup(html, 'html.parser')
  return BeautifulSoup(html, 'html.parser').prettify()
  soup = BeautifulSoup(html, 'html.parser')



Input HTML: <ul><li>Item1</li><li>Item2</li></ul>
Generated HTML: Fal&gt;&gt;li&gt;Item1/li&gt;li&gt;li&gt;/li&gt;/ul&gt; True

Expected HTML: <ul>
    <li>Item1</li>
    <li>Item2</li>
</ul>
------------------------------------------------------------

Evaluation Results:
Accuracy: 0.00%
Average Inference Time: 1.1137 seconds
Test Cases Processed: 2

New Input Prediction for: <DIV CLASS='header'><img src='logo.jpg'></DIV>
html

