### Imports

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

### Functions

In [6]:
def get_df(_data):
    """Collect English typo edits into a two-column DataFrame.

    Parameters
    ----------
    _data : object with an ``edits`` attribute
        Iterating ``_data.edits`` must yield one iterable of edit mappings per
        commit. Each edit is read through ``edit["src"]["lang"]``,
        ``edit["is_typo"]``, ``edit["src"]["text"]`` and ``edit["tgt"]["text"]``.

    Returns
    -------
    pandas.DataFrame
        One row per edit whose source language is ``"eng"`` and whose
        ``is_typo`` value is truthy, with columns ``text`` (source text) and
        ``target`` (target text). Both columns exist even when no edit
        qualifies.
    """
    rows = [
        {"text": edit["src"]["text"], "target": edit["tgt"]["text"]}
        for commit in _data.edits
        for edit in commit
        # `is_typo` is only looked up for English edits (short-circuit `and`).
        if edit["src"]["lang"] == "eng" and edit["is_typo"]
    ]
    # Explicit columns keep the schema stable for an empty result, so callers
    # can still access `.text` / `.target` instead of hitting AttributeError.
    return pd.DataFrame(rows, columns=["text", "target"])


def write_column_to_file(series, filename, separator='\n'):
    """Write every value of ``series`` to ``filename`` as UTF-8 text.

    Each value is converted to ``str``, stripped of leading/trailing
    whitespace, and written followed by ``separator``. The file is
    overwritten if it already exists.

    When ``separator`` contains a line break (the default), the output is a
    one-record-per-line file. In that mode any line break embedded inside a
    value (CRLF, CR or LF) is replaced by a single space; otherwise one record
    would span several lines and parallel files written from aligned series
    would no longer line up. With a separator that has no line break the
    values are written unchanged apart from the stripping.

    Parameters
    ----------
    series : pandas.Series
        Values to write, in iteration order.
    filename : str or path-like
        Destination file.
    separator : str, default newline
        Text appended after every value (including the last one).
    """
    # Embedded line breaks are only ambiguous when records are line-delimited.
    line_delimited = '\n' in separator or '\r' in separator
    with open(filename, 'w', encoding='utf-8', buffering=8192) as f:
        for chunk in series.astype(str):
            record = chunk.strip()
            if line_delimited:
                # Replace CRLF first so it collapses to one space, not two.
                record = (
                    record.replace('\r\n', ' ')
                    .replace('\r', ' ')
                    .replace('\n', ' ')
                )
            f.write(record + separator)


def count_lines(filename):
    """Return how many lines the UTF-8 text file ``filename`` contains.

    The file is opened with the line terminator fixed to a line feed, so a
    bare carriage return inside a line does not start a new line. A final
    line without a trailing line feed still counts as a line.
    """
    total = 0
    with open(filename, 'r', encoding='utf-8', newline='\n') as handle:
        for _line in handle:
            total += 1
    return total

### Data extraction

In [3]:
# Upstream archive:
# https://github-typo-corpus.s3.amazonaws.com/data/github-typo-corpus.v1.0.0.jsonl.gz
# NOTE(review): the URL is a .gz archive while the path below has no .gz suffix —
# presumably the archive must be downloaded and decompressed to this location
# before running this cell; confirm.
path = "typo_corpus/github-typo-corpus.v1.0.0.jsonl"
# lines=True reads the file as JSON Lines: one JSON object per line.
data = pd.read_json(path, lines=True)

In [4]:
df = get_df(data)

# Hold out 20% of the (text, target) pairs; the fixed seed keeps the split
# reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df["text"],
    df["target"],
    test_size=0.2,
    random_state=42,
)

print(f"Train dimension, X: {Xtrain.shape}, y: {ytrain.shape}")
print(f"Test dimension, X: {Xtest.shape}, y: {ytest.shape}")

Train dimension, X: (204044,), y: (204044,)
Test dimension, X: (51012,), y: (51012,)


In [7]:
DATAPATH = "./data/"
# Opening a file for writing does not create missing parent directories, so
# make sure the output directory exists before the first export; otherwise a
# fresh checkout fails here with FileNotFoundError.
os.makedirs(DATAPATH, exist_ok=True)
# Dump the training targets into a single file, values separated by spaces.
write_column_to_file(ytrain, DATAPATH + 'corpus.txt', separator=' ')

In [9]:
DATAPATH = "./data/"

# Export each split as a pair of files written from the matching X / y series:
# "*_corrupt.txt" holds the inputs, "*_clean.txt" the targets.
write_column_to_file(series=Xtrain, filename=f"{DATAPATH}train_corrupt.txt")
write_column_to_file(series=ytrain, filename=f"{DATAPATH}train_clean.txt")
write_column_to_file(series=Xtest, filename=f"{DATAPATH}test_corrupt.txt")
write_column_to_file(series=ytest, filename=f"{DATAPATH}test_clean.txt")

In [10]:
# Sanity check: each corrupt/clean file pair must have the same number of
# lines, and that number must equal the size of the corresponding split.
lines_train_corrupt = count_lines(DATAPATH + 'train_corrupt.txt')
lines_train_clean = count_lines(DATAPATH + 'train_clean.txt')
lines_test_corrupt = count_lines(DATAPATH + 'test_corrupt.txt')
lines_test_clean = count_lines(DATAPATH + 'test_clean.txt')
print(f"Train lines match: {lines_train_corrupt == lines_train_clean}")
print(f"Test lines match: {lines_test_corrupt == lines_test_clean}")
# The reported count is the corrupt file's, so the dimension check must include
# it: compare both files and the split size in one chained comparison rather
# than validating only the clean file.
print(f"Train lines: {lines_train_corrupt}, is same as train dimension: {lines_train_corrupt == lines_train_clean == Xtrain.shape[0]}")
print(f"Test lines: {lines_test_corrupt}, is same as test dimension: {lines_test_corrupt == lines_test_clean == Xtest.shape[0]}")

Train lines match: True
Test lines match: True
Train lines: 204044, is same as train dimension: True
Test lines: 51012, is same as test dimension: True
