In [22]:
import pandas as pd
from datasets import load_dataset
import os
import pathlib

In [23]:
sermons = load_dataset("json", data_files="sermons.json", split="train")
sermons

Dataset({
    features: ['minister', 'channel_name', 'title', 'description', 'views', 'length', 'transcript', 'source'],
    num_rows: 3000
})

In [24]:
# os.mkdir("data_files")

dataset = sermons.filter(lambda x: len(x["transcript"]) > 10)
len(dataset)

2998

In [26]:
# Normalizing the texts

import re

filler_words = ["um", "ah", "uh","hm", "amen"]
pattern = r'\b(?:' + '|'.join(filler_words) + r')\b'

def remove_escapes(example):
    cleaned_text = [x.replace("\xa0", " ").replace("\n", " ") for x in example["transcript"]]
    cleaned_text = [re.sub(pattern,'', text).strip() for text in cleaned_text]
    cleaned_text = [re.sub(r"\s+", ' ', x).strip() for x in cleaned_text]
    cleaned_text = [re.sub(r"\[.*?\]", "", text).strip() for text in cleaned_text]
    example["transcript"] = cleaned_text
    return example

remove_escapes = dataset.map(remove_escapes, batched=True, num_proc=3)

Map (num_proc=3): 100%|██████████| 2998/2998 [00:03<00:00, 810.43 examples/s]


In [27]:
for i in range(0, len(remove_escapes), 100):
    end_index = min(i + 100, len(remove_escapes))
    chunk = remove_escapes.select(range(i, end_index))
    chunk.to_json(f"data_files/data_{i}.jsonl", orient="records", lines=True)


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.49ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.34ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 20.86ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 25.11ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 45.19ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 53.42ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 45.09ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 190.02ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 176.54ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 167.71ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 97.03ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 25.02ba/s]
Creating json from Arrow format: 100%|██████████|

In [28]:
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(example):
    words = example["transcript"].split()
    # Remove None valuess
    corrected_words = [
        spell.correction(word) if spell.correction(word) and word not in spell else word 
        for word in words
    ]
    standard_words = " ".join(corrected_words)
    example["transcript"] = standard_words
    return example
# [1,2500,2200, 1300, 900,2600,1100, 300, 1400, 1600, 1800, 600, 100, 1700, 1200]
# os.mkdir("corrected_datafiles")
# data_files = os.listdir("data_files")
# print(data_files)
for i in range(len(data_files)):
    print(data_files[i])
    data = load_dataset("json", data_files=f"data_files/{data_files[i]}", split="train")
    testd = data.map(correct_spelling, batched=False, num_proc=16)
    testd.to_json(f"corrected_datafiles/data_{i}.jsonl", orient="records", lines=True)

['data_1.jsonl', 'data_2500.jsonl', 'data_2200.jsonl', 'data_1300.jsonl', 'data_900.jsonl', 'data_2600.jsonl', 'data_1100.jsonl', 'data_300.jsonl', 'data_1400.jsonl', 'data_1600.jsonl', 'data_1800.jsonl', 'data_600.jsonl', 'data_100.jsonl', 'data_1700.jsonl', 'data_1200.jsonl', 'data_2000.jsonl', 'data_2700.jsonl', 'data_200.jsonl', 'data_2800.jsonl', 'data_700.jsonl', 'data_2900.jsonl', 'data_500.jsonl', 'data_1000.jsonl', 'data_1500.jsonl', 'data_1900.jsonl', 'data_2100.jsonl', 'data_800.jsonl', 'data_2300.jsonl', 'data_2400.jsonl', 'data_0.jsonl', 'data_400.jsonl']
data_1.jsonl


Generating train split: 100 examples [00:00, 3478.82 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [03:09<00:00,  1.90s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 37.97ba/s]


data_2500.jsonl


Generating train split: 100 examples [00:00, 6339.35 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [03:32<00:00,  2.12s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 40.69ba/s]


data_2200.jsonl


Generating train split: 100 examples [00:00, 15692.55 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [02:35<00:00,  1.55s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 67.27ba/s]


data_1300.jsonl


Generating train split: 100 examples [00:00, 8964.87 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [07:43<00:00,  4.63s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 34.29ba/s]


data_900.jsonl


Generating train split: 100 examples [00:00, 17982.78 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [01:30<00:00,  1.11 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 183.25ba/s]


data_2600.jsonl


Generating train split: 100 examples [00:00, 16099.74 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [02:02<00:00,  1.22s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 86.22ba/s]


data_1100.jsonl


Generating train split: 100 examples [00:00, 3341.62 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [10:21<00:00,  6.22s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 22.61ba/s]


data_300.jsonl


Generating train split: 100 examples [00:00, 2740.23 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [12:42<00:00,  7.62s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 12.32ba/s]


data_1400.jsonl


Generating train split: 100 examples [00:00, 15032.81 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [04:07<00:00,  2.48s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 28.86ba/s]


data_1600.jsonl


Generating train split: 100 examples [00:00, 2426.77 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [15:24<00:00,  9.24s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  8.98ba/s]


data_1800.jsonl


Generating train split: 100 examples [00:00, 2378.22 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [15:19<00:00,  9.19s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 18.78ba/s]


data_600.jsonl


Generating train split: 100 examples [00:00, 6381.30 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [06:01<00:00,  3.62s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 31.44ba/s]


data_100.jsonl


Generating train split: 100 examples [00:00, 1289.33 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [16:39<00:00, 10.00s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 11.24ba/s]


data_1700.jsonl


Generating train split: 100 examples [00:00, 2551.08 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [16:41<00:00, 10.01s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.78ba/s]


data_1200.jsonl


Generating train split: 100 examples [00:00, 3136.89 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [11:39<00:00,  7.00s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 11.60ba/s]


data_2000.jsonl


Generating train split: 100 examples [00:00, 2686.30 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [12:09<00:00,  7.30s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 19.62ba/s]


data_2700.jsonl


Generating train split: 100 examples [00:00, 8608.11 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [03:04<00:00,  1.84s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 42.27ba/s]


data_200.jsonl


Generating train split: 100 examples [00:00, 1264.02 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [18:06<00:00, 10.87s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.86ba/s]


data_2800.jsonl


Generating train split: 100 examples [00:00, 10818.43 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [04:06<00:00,  2.46s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 19.37ba/s]


data_700.jsonl


Generating train split: 100 examples [00:00, 36573.98 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [00:19<00:00,  5.02 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 245.51ba/s]


data_2900.jsonl


Generating train split: 98 examples [00:00, 12399.82 examples/s]
Map (num_proc=16): 100%|██████████| 98/98 [08:13<00:00,  5.04s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 63.75ba/s]


data_500.jsonl


Generating train split: 100 examples [00:00, 11434.85 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [03:19<00:00,  1.99s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 59.11ba/s]


data_1000.jsonl


Generating train split: 100 examples [00:00, 16234.34 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [01:36<00:00,  1.04 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 114.50ba/s]


data_1500.jsonl


Generating train split: 100 examples [00:00, 11277.44 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [04:04<00:00,  2.45s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 32.15ba/s]


data_1900.jsonl


Generating train split: 100 examples [00:00, 5175.79 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [05:18<00:00,  3.18s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 33.62ba/s]


data_2100.jsonl


Generating train split: 100 examples [00:00, 5437.97 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [05:30<00:00,  3.30s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 32.56ba/s]


data_800.jsonl


Generating train split: 100 examples [00:00, 12093.95 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [01:23<00:00,  1.20 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 85.76ba/s]


data_2300.jsonl


Generating train split: 100 examples [00:00, 6876.92 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [03:58<00:00,  2.38s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 44.67ba/s]


data_2400.jsonl


Generating train split: 100 examples [00:00, 4211.95 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [05:38<00:00,  3.38s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 18.03ba/s]


data_0.jsonl


Generating train split: 100 examples [00:00, 1541.67 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [15:26<00:00,  9.26s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  8.84ba/s]


data_400.jsonl


Generating train split: 100 examples [00:00, 5287.96 examples/s]
Map (num_proc=16): 100%|██████████| 100/100 [06:02<00:00,  3.63s/ examples]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 34.07ba/s]
