In [68]:
from datasets import load_dataset

In [69]:
data_dataset = load_dataset("json", data_files="transcriptions_cleaned.json", split="train")
data_dataset

Dataset({
    features: ['cleaned_transcripts', 'label', 'transcript'],
    num_rows: 407
})

In [70]:
data_sample = data_dataset.shuffle(seed=124).select(range(50))
data_sample[:3]

{'cleaned_transcripts': ["Right is a checklist. You know that you are successful spiritually if all of these things are working in your life. And I'm asking you now: be brutally honest with yourself. Remember my Preamble before we started: you cannot be transformed, you cannot grow if you are not brutally honest. Can you honestly tell yourself, on a scale of 1 to 10, that you have a hunger level for God Beyond seven, less than seven? You are still playing. There is room to step up in. F fact, you get to a certain realm in the spirit where, if it is not at least 8 or 9, uh, there are certain things God cannot commit to you. Number two: your prayer life. Are we together? Some of us can wait for 7 hours in the office of someone, maybe a politician, and you are not tired. The man will come out in your presence. Ah, you are still here, sorry. Ah, no problem, Chief. How about you again? I'm, I'm okay. You've not eaten from morning till.",
  'Speak over you that before this year Runs Out. In 

In [71]:
data_dataset = data_dataset.filter(lambda x: x["cleaned_transcripts"] is not None)
data_dataset

Dataset({
    features: ['cleaned_transcripts', 'label', 'transcript'],
    num_rows: 407
})

In [72]:
# function to make lower the cleaned transcripts
def cleaned_transcripts_lower(example):
    return {"clean_transcripts": example["cleaned_transcripts"].lower()}

In [73]:
# get the length of transcripts text
def compute_transcripts_length(example):
    return {"transcripts_length": len(example["cleaned_transcripts"].split())}

In [74]:
transcripts_dataset = data_dataset.map(compute_transcripts_length)
transcripts_dataset[0]

{'cleaned_transcripts': "No matter how close you are to Destiny, if you don't finish, you will stay in the same group with those who did not start. You have come too far. Are we together? You've held a  plough ready for Harvest, in Ministry, in business, refuse to be distracted. The world is full of noise makers, naysayers, people full of pain- they may not be Wicked people, just wounded and Confused people hoping they can use your pain to find meaning to their lives.",
 'label': 0,
 'transcript': "no matter how close you are to Destiny if you don't finish you will stay in the same group with those who did not start you have come too far are we together you've held a  plough ready for Harvest in Ministry in business refuse to be distracted the world is full of noise makers naysayers people full of pain they may not be Wicked people just wounded and Confused people hoping they can use your pain to find meaning to their lives",
 'transcripts_length': 83}

In [75]:
transcripts_dataset.sort("transcripts_length")[:3]

{'cleaned_transcripts': ['What makes the doings of God God compliant is that it must be word compliant.',
  'When he killed children, it was not about children. He perceived they were saviors in the children.',
  'The proof of Mastery is that you can teach without ambiguity anything you cannot explain. You have not gained Mastery over.'],
 'label': [239, 377, 25],
 'transcript': ['what makes the doings of God God compliant is that it must be word compliant',
  'when he killed children it was not about children he perceived they were saviors in the children',
  'the proof of Mastery is that you can teach without ambiguity anything you cannot explain you have not gained Mastery over'],
 'transcripts_length': [15, 17, 21]}

In [76]:
transcripts_dataset = transcripts_dataset.filter(lambda x: x["transcripts_length"] > 10)
print(transcripts_dataset.num_rows)

407


In [77]:
import html

transcripts_dataset = transcripts_dataset.map(lambda x: {"cleaned_transcripts" : html.unescape(x["cleaned_transcripts"])})
transcripts_dataset

Dataset({
    features: ['cleaned_transcripts', 'label', 'transcript', 'transcripts_length'],
    num_rows: 407
})

In [78]:
transcripts_dataset = transcripts_dataset.remove_columns(column_names=["transcript"])

In [79]:
transcripts_dataset = transcripts_dataset.rename_column(original_column_name="cleaned_transcripts", new_column_name="transcript")

In [80]:
transcripts_dataset

Dataset({
    features: ['transcript', 'label', 'transcripts_length'],
    num_rows: 407
})

In [81]:
%%time
transcripts_dataset = transcripts_dataset.map(
    lambda x: {"transcript": [html.unescape(o) for o in x["transcript"]]}, batched=True
)

CPU times: user 5.93 ms, sys: 856 μs, total: 6.78 ms
Wall time: 5.24 ms


In [82]:
transcripts_dataset

Dataset({
    features: ['transcript', 'label', 'transcripts_length'],
    num_rows: 407
})

In [83]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(example):
    return tokenizer(example["transcript"], truncation=True)

In [84]:
%%time

tokenized_dataset = transcripts_dataset.map(tokenize_function, batched=True, num_proc=2)

Map (num_proc=2): 100%|██████████| 407/407 [00:00<00:00, 1222.43 examples/s]

CPU times: user 52.1 ms, sys: 59.7 ms, total: 112 ms
Wall time: 419 ms





In [85]:
def tokenize_and_split(example):
    return tokenizer(
        example["transcript"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True
    )

In [86]:
result = tokenize_and_split(transcripts_dataset[2])

[len(inp) for inp in result["input_ids"]]

[128, 54]

In [87]:
transcripts_dataset

Dataset({
    features: ['transcript', 'label', 'transcripts_length'],
    num_rows: 407
})

In [88]:
tokenized_dataset = transcripts_dataset.map(tokenize_and_split, batched=True)

# Error because, the tokenized dataset expects more entries than is in the original transcript dataset, as a result of overflow_tokens after tokenization

Map:   0%|          | 0/407 [00:00<?, ? examples/s]


ArrowInvalid: Column 3 named input_ids expected length 407 but got length 703

In [89]:
tokenized_dataset = transcripts_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=transcripts_dataset.column_names
)

Map: 100%|██████████| 407/407 [00:00<00:00, 1527.92 examples/s]


In [90]:
len(tokenized_dataset), len(transcripts_dataset)

(703, 407)

In [91]:
def tokenize_and_split(example):
    result = tokenizer(
        example["transcript"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True
    )
    
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in example.items():
        print(key, values)
        result[key] = [values[i] for i in sample_map]
    return result

In [92]:
tokenized_dataset = transcripts_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

Map: 100%|██████████| 407/407 [00:00<00:00, 1324.65 examples/s]

label [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220




Dataset({
    features: ['transcript', 'label', 'transcripts_length', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 703
})

In [101]:
transcripts_dataset.set_format(type="pandas")

In [103]:
trans_df = transcripts_dataset

In [104]:
transcripts_dataset.reset_format()

In [109]:
transcripts_dataset = transcripts_dataset.train_test_split(train_size=.8, seed=43)
transcripts_dataset

DatasetDict({
    train: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 325
    })
    test: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 82
    })
})

In [110]:
transcripts_dataset_clean = transcripts_dataset["train"].train_test_split(train_size=.9, seed=43)
transcripts_dataset_clean["validation"] = transcripts_dataset_clean.pop("test")
transcripts_dataset_clean["test"] = transcripts_dataset["test"]

transcripts_dataset_clean



DatasetDict({
    train: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 292
    })
    validation: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 33
    })
    test: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 82
    })
})

In [111]:
transcripts_dataset_clean.save_to_disk("transcripts")

Saving the dataset (1/1 shards): 100%|██████████| 292/292 [00:00<00:00, 19721.69 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33/33 [00:00<00:00, 4673.24 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 82/82 [00:00<00:00, 7324.11 examples/s]


In [112]:
from datasets import load_from_disk

trans_df_reload = load_from_disk("transcripts")
trans_df_reload

DatasetDict({
    train: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 292
    })
    validation: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 33
    })
    test: Dataset({
        features: ['transcript', 'label', 'transcripts_length'],
        num_rows: 82
    })
})

In [114]:
# for split, dataset in trans_df_reload.items():
#     dataset.to_json(f"transcript-{split}.jsonl")