In [2]:
# Importing the necessary tools and libraries
import pandas as pd
import re
from bs4 import BeautifulSoup
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

In [3]:
# Load the prepared dataset of paired bad/proper HTML practice examples.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; passing index_col=0 would likely
# drop it — confirm before changing.
data = pd.read_csv(filepath_or_buffer='html.csv')
data

Unnamed: 0,Description,Bad_Practice,Proper_Practice,Bad_html,Proper_html
0,"Elements are improperly nested, causing render...",Improper Nesting of Elements,"Ensure that HTML elements are properly nested,...",<b><i>Text</b></i>,<b><i>Text</i></b>
1,Non-semantic elements are used for critical co...,Not Using Semantic HTML,"Use semantic HTML elements like <header>, <nav...","<div class=""title"">Page Title</div>",<header><h1>Page Title</h1></header>
2,Inline CSS within HTML elements makes code har...,Inline CSS,Place CSS in separate stylesheets and use clas...,"<p style=""color: red;"">Red text</p>","<p class=""red-text"">Red text</p>"
3,"Deprecated elements and attributes (e.g., <fon...",Deprecated Elements and Attributes,Use modern HTML5 elements and attributes and r...,"<font size=""3"">Text</font>","<span style=""font-size: 16px;"">Text</span>"
4,Not providing descriptive alt text for images ...,Lack of Alt Text for Images,Always include meaningful alt text for images ...,"<img src=""image.jpg"">","<img src=""image.jpg"" alt=""Description of the i..."
5,"HTML tags are left unclosed, leading to render...",Unclosed Tags,Ensure all tags are properly opened and closed.,<strong>Important text,<strong>Important text</strong>
6,"JavaScript is embedded directly within HTML, r...",Mixing HTML and JavaScript,Separate JavaScript code from HTML using exter...,"<p onclick=""alert('Clicked!')"">Click me</p>","<p id=""click-me"">Click me</p>"
7,"Tables are used for layout purposes, making co...",Excessive Use of Tables for Layout,Use CSS for layout and reserve tables for tabu...,<table><tr><td>Content</td></tr></table>,A CSS-based layout without tables for non-tabu...
8,Failure to design for mobile devices results i...,Neglecting Mobile Responsiveness,Implement responsive web design using media qu...,A non-responsive website that doesn't adapt to...,A responsive website that adjusts layout for d...
9,"Special characters like ""<"" and ""&"" are not en...",Unencoded Special Characters,Use HTML entities or character encoding to dis...,<p>5 < 3</p>,<p>5 &lt; 3</p>


In [43]:
# Defining a function that preprocesses HTML text
def preprocess_html(html_text):
    """Reduce an HTML snippet to plain alphanumeric text.

    The markup is parsed with BeautifulSoup's ``html.parser`` and only its text
    content is kept; every character that is not an ASCII letter, a digit, or
    whitespace is then removed.

    Note: because all tags are discarded, snippets that differ only in their
    markup (for example, in how tags are nested) produce the same output.

    Args:
        html_text: HTML markup to clean. ``None`` and float NaN (e.g. an empty
            cell in a loaded DataFrame column) are treated as missing.

    Returns:
        The cleaned text, or an empty string when ``html_text`` is missing.
    """
    # Missing values cannot be parsed as markup; return an empty string instead
    # of letting the parser raise when this is applied over a whole column.
    # (NaN is the only float that is not equal to itself.)
    if html_text is None or (isinstance(html_text, float) and html_text != html_text):
        return ''

    # Drop the tags and keep only the text nodes.
    text = BeautifulSoup(html_text, 'html.parser').get_text()
    # Strip punctuation and any other non-alphanumeric, non-whitespace character.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [44]:
# Clean both HTML columns element-wise with preprocess_html, overwriting the
# raw markup in place (the original strings are not kept in `data`).
data['Bad_html'] = data['Bad_html'].map(preprocess_html)
data['Proper_html'] = data['Proper_html'].map(preprocess_html)


In [45]:
# Split the data into train and test sets: the bad HTML is the input, the
# proper HTML is the target; 20% is held out and the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['Bad_html'],
    data['Proper_html'],
    test_size=0.2,
    random_state=42,
)


In [46]:
# Initialize the BERT tokenizer and model from the same pretrained checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# The model is created with a single output label. Its classifier layer is
# newly initialized (see the warning printed below), so it must be trained
# before its predictions are usable.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# Tokenize the training inputs and targets as PyTorch tensors, with padding
# enabled and sequences truncated to at most 256 tokens.
X_train_encodings = tokenizer(
    X_train.tolist(),
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
y_train_encodings = tokenizer(
    y_train.tolist(),
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors='pt',
)


In [48]:
# Defining the function that autocorrects HTML using the trained model
def autocorrect_html(input_text, max_length=256):
    """Run ``input_text`` through the notebook's BERT model and decode the result.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: Text to feed to the model.
        max_length: Maximum number of tokens kept after tokenization; longer
            inputs are truncated. Defaults to 256, the limit used when the
            training data is tokenized in this notebook.

    Returns:
        The string obtained by decoding the arg-max index of the model's logits.

    NOTE(review): ``model`` is a ``BertForSequenceClassification`` created with
    ``num_labels=1``, so its logits score labels, not vocabulary tokens. The
    arg-max taken below is therefore a label index, and decoding it as a token
    id cannot yield corrected HTML. Producing real corrections needs a
    text-generating model; this is flagged here rather than silently replaced.
    """
    # Truncate so an over-long snippet cannot exceed the sequence length the
    # model accepts.
    input_ids = tokenizer.encode(
        input_text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(input_ids)
    corrected_ids = output.logits.squeeze().argmax()
    corrected_text = tokenizer.decode(corrected_ids, skip_special_tokens=True)
    return corrected_text

In [49]:
# Write the BERT model and its tokenizer into the same directory so both can
# be reloaded from one place.
# NOTE(review): no fine-tuning step is visible before this cell — confirm the
# weights being saved are the intended ones.
model.save_pretrained(save_directory="bert_model_directory")
tokenizer.save_pretrained(save_directory="bert_model_directory")

('bert_model_directory\\tokenizer_config.json',
 'bert_model_directory\\special_tokens_map.json',
 'bert_model_directory\\vocab.txt',
 'bert_model_directory\\added_tokens.json')