### Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Functions

In [None]:
def get_df(data):
    """Collect the English typo edits of ``data`` into a DataFrame.

    Args:
        data: Object whose ``edits`` attribute iterates over commits; each
            commit is an iterable of edit mappings providing
            ``edit["src"]["lang"]``, ``edit["src"]["text"]``,
            ``edit["tgt"]["text"]`` and ``edit["is_typo"]``.

    Returns:
        A DataFrame with the columns ``text`` (the ``src`` text) and
        ``target`` (the ``tgt`` text), one row per edit whose source
        language is ``"eng"`` and whose ``is_typo`` flag is truthy. Both
        columns are present even when no edit qualifies.
    """
    columns = ["text", "target"]
    rows = []
    for commit in data.edits:
        for edit in commit:
            source = edit["src"]
            # Language is checked first so "is_typo" is only looked up for
            # English edits.
            if source["lang"] != "eng" or not edit["is_typo"]:
                continue
            rows.append({"text": source["text"], "target": edit["tgt"]["text"]})

    # Naming the columns explicitly keeps `df.text` / `df.target` usable when
    # `rows` is empty; a DataFrame built from an empty list has no columns.
    return pd.DataFrame(rows, columns=columns)


def write_column_to_file(series, filename):
    """Write every value of ``series`` to ``filename`` as exactly one line.

    Args:
        series: pandas Series; each value is converted with ``astype(str)``.
        filename: Path of the UTF-8 text file to create or overwrite.

    Each value is stripped of surrounding whitespace and terminated with a
    single newline. Line breaks embedded in a value are replaced by a space,
    so the file always holds one line per value and files written from
    row-aligned series stay aligned line by line.
    """
    with open(filename, 'w', encoding='utf-8', buffering=8192) as f:
        for value in series.astype(str):
            line = value.strip()
            # strip() only removes breaks at the ends; an embedded break would
            # split this value over several lines and shift all later ones.
            line = line.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            f.write(line + '\n')


def count_lines(filename):
    """Return how many lines the UTF-8 text file ``filename`` contains.

    The file is opened with ``newline='\\n'``, so only a line feed ends a
    line. A final line without a trailing line feed is still counted.
    """
    total = 0
    with open(filename, mode='r', encoding='utf-8', newline='\n') as handle:
        # Iterate lazily so the whole file is never held in memory.
        for _ in handle:
            total += 1
    return total

### Data extraction

In [None]:
# Location of the typo corpus dump, relative to the working directory.
path = "typo_corpus/github-typo-corpus.v1.0.0.jsonl"
# lines=True makes pandas parse the file as one JSON object per line.
data = pd.read_json(path_or_buf=path, lines=True)

In [None]:
# Build the (text, target) pairs from the loaded corpus.
df = get_df(data)

# Hold out 20% of the pairs for testing; the fixed seed makes the split
# reproducible across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df.text,
    df.target,
    test_size=0.2,
    random_state=42,
)

print(f"Train dimension, X: {Xtrain.shape}, y: {ytrain.shape}")
print(f"Test dimension, X: {Xtest.shape}, y: {ytest.shape}")

In [None]:
# Dump each split to its own text file: the X series go to the *_corrupt
# files and the y series to the *_clean files.
write_column_to_file(series=Xtrain, filename='train_corrupt.txt')
write_column_to_file(series=ytrain, filename='train_clean.txt')
write_column_to_file(series=Xtest, filename='test_corrupt.txt')
write_column_to_file(series=ytest, filename='test_clean.txt')

In [None]:
# Count the lines of the four written files in one pass over their names;
# map() preserves the order, so each count lands in the matching variable.
(
    lines_train_corrupt,
    lines_train_clean,
    lines_test_corrupt,
    lines_test_clean,
) = map(
    count_lines,
    ('train_corrupt.txt', 'train_clean.txt', 'test_corrupt.txt', 'test_clean.txt'),
)

# The corrupt and clean file of a split should have the same line count.
print(f"Train lines match: {lines_train_corrupt == lines_train_clean}")
print(f"Test lines match: {lines_test_corrupt == lines_test_clean}")
print(f"Train lines: {lines_train_corrupt}")
print(f"Test lines: {lines_test_corrupt}")