In [1]:
# Importing the necessary tools and libraries
import pandas as pd
import difflib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the bad-practice / proper-practice HTML examples into a DataFrame
# and leave the frame as the cell's last expression so the notebook renders it.
data = pd.read_csv(filepath_or_buffer='html.csv')
data

Unnamed: 0,Description,Bad_Practice,Proper_Practice,Bad_html,Proper_html
0,"Elements are improperly nested, causing render...",Improper Nesting of Elements,"Ensure that HTML elements are properly nested,...",<b><i>Text</b></i>,<b><i>Text</i></b>
1,Non-semantic elements are used for critical co...,Not Using Semantic HTML,"Use semantic HTML elements like <header>, <nav...","<div class=""title"">Page Title</div>",<header><h1>Page Title</h1></header>
2,Inline CSS within HTML elements makes code har...,Inline CSS,Place CSS in separate stylesheets and use clas...,"<p style=""color: red;"">Red text</p>","<p class=""red-text"">Red text</p>"
3,"Deprecated elements and attributes (e.g., <fon...",Deprecated Elements and Attributes,Use modern HTML5 elements and attributes and r...,"<font size=""3"">Text</font>","<span style=""font-size: 16px;"">Text</span>"
4,Not providing descriptive alt text for images ...,Lack of Alt Text for Images,Always include meaningful alt text for images ...,"<img src=""image.jpg"">","<img src=""image.jpg"" alt=""Description of the i..."
5,"HTML tags are left unclosed, leading to render...",Unclosed Tags,Ensure all tags are properly opened and closed.,<strong>Important text,<strong>Important text</strong>
6,"JavaScript is embedded directly within HTML, r...",Mixing HTML and JavaScript,Separate JavaScript code from HTML using exter...,"<p onclick=""alert('Clicked!')"">Click me</p>","<p id=""click-me"">Click me</p>"
7,"Tables are used for layout purposes, making co...",Excessive Use of Tables for Layout,Use CSS for layout and reserve tables for tabu...,<table><tr><td>Content</td></tr></table>,A CSS-based layout without tables for non-tabu...
8,Failure to design for mobile devices results i...,Neglecting Mobile Responsiveness,Implement responsive web design using media qu...,A non-responsive website that doesn't adapt to...,A responsive website that adjusts layout for d...
9,"Special characters like ""<"" and ""&"" are not en...",Unencoded Special Characters,Use HTML entities or character encoding to dis...,<p>5 < 3</p>,<p>5 &lt; 3</p>


In [58]:
# Notebook-level placeholder list for corrected HTML.
# NOTE(review): no visible cell reads this variable, and correct_html() binds
# its own local of the same name, so this list appears to be unused.
corrected_html = list()

In [59]:
# Function that rebuilds the corrected HTML from the "Bad_html" and "Proper_html" columns of our dataset
def correct_html(bad_html, proper_html):
    """Return the corrected HTML obtained by diffing ``bad_html`` against ``proper_html``.

    The two inputs are compared element by element with ``difflib.Differ``
    (character by character when they are strings). Elements that occur only
    in ``bad_html`` are dropped; elements that are shared, or that occur only
    in ``proper_html``, are kept in order and joined into one string.

    Note that this selection is exactly the second sequence of the diff, so
    the returned text always equals ``proper_html`` joined together: the
    function needs the reference answer and does not infer a fix on its own.

    Parameters
    ----------
    bad_html : str
        The faulty HTML snippet.
    proper_html : str
        The reference (correct) HTML snippet.

    Returns
    -------
    str
        The corrected HTML.

    Raises
    ------
    TypeError
        If either argument is ``None`` or a float (pandas represents an empty
        CSV cell as ``NaN``). The same exception type previously surfaced from
        inside difflib, but without saying which argument was at fault.
    """
    for arg_name, value in (("bad_html", bad_html), ("proper_html", proper_html)):
        if value is None or isinstance(value, float):
            raise TypeError(
                f"{arg_name} must be a string, got {value!r} "
                "(missing value in the dataset?)"
            )

    # Differ yields one entry per element, prefixed with '- ' (only in bad),
    # '+ ' (only in proper), '  ' (common) or '? ' (hint line).
    delta = difflib.Differ().compare(bad_html, proper_html)

    # restore(delta, 2) keeps the '  ' and '+ ' entries and strips the
    # two-character prefix -- the same selection the manual loop performed.
    return ''.join(difflib.restore(delta, 2))


In [60]:
# Build the "Corrected_html" column by running correct_html on every
# (Bad_html, Proper_html) pair. Iterating the two columns directly avoids
# constructing a Series per row (as DataFrame.apply(axis=1) does) and also
# works when the frame has no rows, where apply(axis=1) hands back a
# DataFrame instead of a Series and the column assignment fails.
data['Corrected_html'] = [
    correct_html(bad, proper)
    for bad, proper in zip(data['Bad_html'], data['Proper_html'])
]


In [61]:
# Score the corrected column against the reference column.
# Each distinct HTML string is treated as its own class label, so these are
# exact-string-match scores; precision, recall and F1 are weighted by class support.
# NOTE(review): "Corrected_html" is produced from "Proper_html" itself, so
# perfect scores here do not demonstrate any ability to fix unseen HTML.
y_true, y_pred = data['Proper_html'], data['Corrected_html']
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)


In [62]:
# Write the frame (including the new "Corrected_html" column) to a CSV file
# without the index, then report the evaluation scores.
data.to_csv('corrected_html_data.csv', index=False)
print(
    "HTML correction completed. Corrected data saved to 'corrected_html_data.csv'.",
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)

HTML correction completed. Corrected data saved to 'corrected_html_data.csv'.
Accuracy: 100.00%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


In [63]:
# NOTE(review): this repeats the write already performed in the previous cell;
# `data` is not modified in between in the visible cells, so it only rewrites
# the same file with the same contents.
data.to_csv('corrected_html_data.csv', index=False)