In [60]:
import xml.etree.ElementTree as ET
import re
import pandas as pd
import os

In [61]:
# List to store extracted paragraphs
paragraphs_new = []
data_new = []

In [62]:
def spliter_into_paragraph(root):
    paragraphs = []
    # Loop over all <p> elements in the file
    for p in root.findall(".//p"):
        # Find all <NS> tags within the paragraph
        ns_tags = p.findall(".//NS")

        # Check if at least one NS has both <i> and <c>
        for ns in ns_tags:
            i_tag = ns.find("i")
            c_tag = ns.find("c")
            if i_tag is not None and c_tag is not None:
                # Convert the paragraph (including text and tags) to a string
                paragraph_str = ET.tostring(p, encoding="unicode", method="xml")
                paragraphs.append(paragraph_str)
                break  # No need to check other NS tags for this paragraph
    return paragraphs

In [63]:
def spliting_into_right_wrong(paragraphs_):
    data = []

    # Pattern to split sentences (naive split by ., !, ?)
    sentence_split_pattern = r'(?<=[.!?])\s+'

    for para in paragraphs_:
        # Remove <p> tags
        text = re.sub(r"</?p>", "", para)
        
        # Split paragraph into sentences
        sentences = re.split(sentence_split_pattern, text)
        
        for sentence in sentences:
            # Keep only sentences with at least one NS tag containing both <i> and <c>
            if re.search(r"<NS[^>]*><i>.*?</i><c>.*?</c></NS>", sentence):
                # Extract wrong and right forms
                wrong = re.sub(r"<NS[^>]*><i>(.*?)</i><c>.*?</c></NS>", r"\1", sentence)
                right = re.sub(r"<NS[^>]*><i>.*?</i><c>(.*?)</c></NS>", r"\1", sentence)
                
                # Remove any remaining tags
                wrong = re.sub(r"<[^>]+>", "", wrong).strip()
                right = re.sub(r"<[^>]+>", "", right).strip()
                
                # Keep only if wrong != right
                if wrong != right:
                    data.append({"Wrong": wrong, "Right": right})
    return data

In [64]:
# Folder containing XML files
dataset_path = r"C:\Users\HP\Downloads\fce-released-dataset (1)\fce-released-dataset\dataset"


# Loop through each subfolder inside 'dataset'
for subfolder in os.listdir(dataset_path):
    subfolder_path = os.path.join(dataset_path, subfolder)

    # Check that itâ€™s actually a folder
    if os.path.isdir(subfolder_path):
        print(f"Processing folder: {subfolder}")

        # Loop through XML files inside the subfolder
        for filename in os.listdir(subfolder_path):
            if filename.endswith(".xml"):
                print(filename)
                file_path = os.path.join(subfolder_path, filename)

                try:
                    # Parse XML
                    tree = ET.parse(file_path)
                    root = tree.getroot()

                    # Extract paragraphs and split into right/wrong
                    paragraphs_new.extend(spliter_into_paragraph(root))

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

paragraphs_flat = [p for sublist in paragraphs_new for p in (sublist if isinstance(sublist, list) else [sublist])]

data_new.extend(spliting_into_right_wrong(paragraphs_flat))

Processing folder: 0100_2000_12
doc1000.xml
doc1002.xml
doc1018.xml
doc102.xml
doc1029.xml
doc1033.xml
doc1035.xml
doc1037.xml
doc111.xml
doc1225.xml
doc1234.xml
doc1259.xml
doc126.xml
doc1260.xml
doc1263.xml
doc1284.xml
doc1293.xml
doc1295.xml
doc1306.xml
doc1322.xml
doc1357.xml
doc1358.xml
doc1376.xml
doc1415.xml
doc1424.xml
doc1517.xml
doc1530.xml
doc1552.xml
doc1559.xml
doc1581.xml
doc1586.xml
doc1595.xml
doc1611.xml
doc1621.xml
doc1624.xml
doc1727.xml
doc178.xml
doc1811.xml
doc1823.xml
doc1840.xml
doc1850.xml
doc190.xml
doc1939.xml
doc1959.xml
doc198.xml
doc2064.xml
doc2077.xml
doc209.xml
doc2102.xml
doc2122.xml
doc2130.xml
doc2140.xml
doc2141.xml
doc216.xml
doc217.xml
doc221.xml
doc2232.xml
doc224.xml
doc2263.xml
doc2274.xml
doc2280.xml
doc2284.xml
doc2301.xml
doc2302.xml
doc2305.xml
doc236.xml
doc239.xml
doc2394.xml
doc241.xml
doc2508.xml
doc2509.xml
doc254.xml
doc2541.xml
doc2545.xml
doc2546.xml
doc258.xml
doc2599.xml
doc2611.xml
doc2612.xml
doc2630.xml
doc2638.xml
doc2644.xml


In [65]:
len(data_new)

18691

In [66]:
len(paragraphs_flat)

10065

In [67]:
paragraphs_flat

['<p>I WANT TO <NS type="S"><i>THAK</i><c>THANK</c></NS> YOU FOR PREPARING SUCH A GOOD PROGRAMME FOR US AND ESPECIALLY FOR TAKING US <NS type="RT"><i>TO</i><c>ON</c></NS> THE RIVER TRIP TO GREENWICH. I WOULD LIKE TO KNOW IF THERE IS ANY CHANCE OF CHANGING THE PROGRAMME BECAUSE WE HAVE FOUND A VERY INTERESTING ACTIVITY TO DO ON TUESDAY 14 MARCH. IT <NS type="RV"><i>CONSISTS <NS type="RT"><i>ON</i><c>IN</c></NS></i><c>INVOLVES</c></NS> VISITING THE LONDON FASHION AND LEISURE SHOW <NS type="RT"><i>IN</i><c>AT</c></NS> THE CENTRAL EXHIBITION HALL. I THINK IT\'S A GREAT OPPORTUNITY TO MAKE GREATER USE OF OUR KNOWLEDGE OF <NS type="MD"><c>THE</c></NS> ENGLISH LANGUAGE. <NS type="ID"><i>ON THE OTHER HAND</i><c>ALSO</c></NS>, WE COULD LEARN THE DIFFERENT WAYS TO GET TO THE CENTRAL EXHIBITION HALL.</p>\n          ',
 '<p>I WILL BE <NS type="RV"><i>WRITING</i><c>WAITING</c></NS> ANXIOUSLY FOR YOUR RESPONSE.</p>\n          ',
 '<p>FAMOUS PEOPLE SUCH AS SINGERS, FILM STARS, ETC<NS type="MP"><c>,</

In [68]:
# Create DataFrame
df = pd.DataFrame(data_new)

df

Unnamed: 0,Wrong,Right
0,I WANT TO THAK YOU FOR PREPARING SUCH A GOOD P...,I WANT TO THANK YOU FOR PREPARING SUCH A GOOD ...
1,IT CONSISTS ONINVOLVES VISITING THE LONDON FAS...,IT ININVOLVES VISITING THE LONDON FASHION AND ...
2,"ON THE OTHER HAND, WE COULD LEARN THE DIFFEREN...","ALSO, WE COULD LEARN THE DIFFERENT WAYS TO GET..."
3,I WILL BE WRITING ANXIOUSLY FOR YOUR RESPONSE.,I WILL BE WAITING ANXIOUSLY FOR YOUR RESPONSE.
4,"AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FA...","AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FA..."
...,...,...
18686,"I'm planning the event on 15 or 22 February, f...","I'm planning the event for 15 or 22 February, ..."
18687,"I had never taken birthdays serious, both my o...","I had never taken birthdays seriously, neither..."
18688,"When I was 23, I joined a NGO in Denmark and s...","When I was 23, I joined an NGO in Denmark and ..."
18689,The party was far beyound my expectation.,The party was far beyond my expectations.


In [71]:
df.loc[4, "Wrong"]

'AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FANS WANT TO KNOW HOW THEY ACT IN THEIR PRIVATE LIVES, WITH THEIR FAMILY OR FRIENDS.'

In [72]:
df.loc[4, "Right"]

'AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FANS WANT TO KNOW HOW THEY BEHAVE IN THEIR PRIVATE LIVES, WITH THEIR FAMILY OR FRIENDS.'

In [73]:
df.to_csv("automatic_correction_dataset.csv", index= False)