## # **Code to clean data for LLM Training**

In [3]:
%cd /content/drive/MyDrive/Research/datasets/crawled_data

/content/drive/MyDrive/Research/datasets/crawled_data


In [7]:
input_file_path = 'merged_crawled_data.csv'
output_file_path = 'processed_crawled_data.csv'

In [17]:
# Remove duplicate entries
# merge-data: list(set(list))Extending pragraphs

from collections import defaultdict
import csv
import json
import sys

# Increase the CSV field size limit
csv.field_size_limit(sys.maxsize) # otherwise it gives "Error: field larger than field limit (131072)"


def merge_lists(list1, list2):
    # Convert list1 to a set to remove duplicates and maintain order
    seen = set()
    merged = []

    for item in list1 + list2:
        if item not in seen:
            seen.add(item)
            merged.append(item)

    return merged

def de_duplicate(input_file_path, output_file_path):
    '''
    # crawled_data-format:
      {
        <url-11>:{'page_title': 'page-1', 'paragraphs': ['devanagari-paragraph-1', 'devanagari-paragraph-2']},
        <url-2>:{'page_title': page-2, 'paragraphs': ['devanagari-paragraph-2', 'devanagari-paragraph-3']}
      }

      * paragraphs are jsonified list i.e. saved after performing `json.dumps(paragraphs)`
    '''

    # Using list for paragraphs because set dont preserve order.
    data = defaultdict(lambda: {'page_title': None, 'paragraphs': []})

    # Read the input CSV file
    with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file:
        reader = csv.DictReader(input_file)

        for row in reader:
            if 'paragraphs' in row.keys():
                # Newer data format
                parent_url = row['parent_url']
                page_title = row['page_title']
                paragraphs = row['paragraphs']

                if parent_url not in data:
                    # set page_title for new data item
                    data[parent_url]['page_title'] = page_title

                if  type(data[parent_url]['paragraphs']) == list:
                    # in case of data[parent_url]['paragraphs'] has already gone througn `json.loads` previously
                    existing_paragraphs = data[parent_url]['paragraphs']
                else:
                    existing_paragraphs = json.loads(data[parent_url]['paragraphs'])

                try:
                  data[parent_url]['paragraphs'] = merge_lists(existing_paragraphs, json.loads(paragraphs))
                except json.JSONDecodeError as Ex:
                  print(Ex)
                  '''
                    * previous data format used to have `paragraph` as non-jsonified string.
                    * while new version of code stores paragraphs as jsonified list
                    * This is invoked means: There is some spider running with previous version of code
                    * `json.loads` for non-jsonified list should be reason behind this Exception
                    * if 'paragraph' in row.keys():
                  '''
                  print('\n------------------------------------------------------------------------------------')
                  print(f'\t row contain <str> paragraph. Newer version of code is supposed to have <list>paragraphs')
                  print('------------------------------------------------------------------------------------')

                  paragraph = row['paragraphs']

                  if paragraph not in data[parent_url]['paragraphs']:
                      # append paragraph
                      # set would give better time complexity but they dont preserve order.
                      data[parent_url]['paragraphs'].append(paragraph)

    # Write the consolidated data to the output CSV file
    with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
        fieldnames = ['parent_url', 'page_title', 'paragraphs']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)

        writer.writeheader()
        for parent_url, content in data.items():
            writer.writerow({
                'parent_url': parent_url,
                'page_title': content['page_title'],
                'paragraphs': json.dumps(content['paragraphs'])  # Convert set to list before dumping to JSON
            })

# Run the consolidation function
de_duplicate(input_file_path, output_file_path)

print(f"Consolidated CSV file has been created at {output_file_path}")

Expecting value: line 1 column 2 (char 1)

------------------------------------------------------------------------------------
	 row contain paragraph. Not supposed to be in newer batch of
------------------------------------------------------------------------------------
Expecting value: line 2 column 21 (char 21)

------------------------------------------------------------------------------------
	 row contain paragraph. Not supposed to be in newer batch of
------------------------------------------------------------------------------------
Expecting value: line 1 column 2 (char 1)

------------------------------------------------------------------------------------
	 row contain paragraph. Not supposed to be in newer batch of
------------------------------------------------------------------------------------
Expecting value: line 1 column 2 (char 1)

------------------------------------------------------------------------------------
	 row contain paragraph. Not supposed to be i

## Examples

In [1]:
# Merging two lists
# Application: while de-duplicating there may be two urls with two <list>paragraphs which needs to be merged
'''
* (preserving order)
* (should not contain duplicate item)
* (all elements of first list come first)
'''
def merge_lists(list1, list2):
    # Convert list1 to a set to remove duplicates and maintain order
    seen = set()
    merged = []

    for item in list1 + list2:
        if item not in seen:
            seen.add(item)
            merged.append(item)

    return merged

# Example lists
list1 = [1, 3, 3, 5, 7]
list2 = [2, 4, 6]

# Merging the lists
merged = merge_lists(list1, list2)

print(merged)  # Output: [1, 3, 5, 7, 2, 4, 6]


[1, 3, 5, 7, 2, 4, 6]


## References:
* [fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)