In [None]:
import ndjson
import os
import json
import random
import math
import re
from functools import partial
from tqdm import tqdm
import matplotlib.pyplot as plt

# The Stack Code

## Full Dataset Statistics

In [None]:
# Running totals over every entry in the stats file.
cumsize = 0
cumtokens = 0
with open("../meta_json/stats.json") as f:
    stats = json.load(f)

# Report each entry, scaling raw values by 10**9 so they match the
# "B" / "GB" units used in the printed labels.
for key, entry in stats.items():
    print(key.upper())
    tokens = entry["tokens"] / 10**9
    cumtokens += tokens
    print(f"tokens: {tokens:.4f} B")
    size = entry["size"] / 10**9
    cumsize += size
    print(f"size: {size:.4f} GB\n")

# Totals across all entries.
print("CUMULATIVE:")
print(f"tokens: {cumtokens:.4f} B")
print(f"size: {cumsize:.4f} GB\n")

In [None]:
# (label, token count) for every entry in `stats`, largest count first.
# `reverse=True` keeps the sort stable, so ties stay in `stats` order.
pairs = sorted(
    ((key.title(), stats[key]["tokens"]) for key in stats),
    key=lambda pair: pair[1],
    reverse=True,
)

labels = [pair[0] for pair in pairs]
heights = [pair[1] for pair in pairs]

plt.bar(labels, heights)
plt.ylabel('Tokens')
# Optional: switch to a log-scaled y axis with plt.yscale('log')
plt.xticks(rotation=-90)
plt.show()

## Defining generic data analysis utilities

In [None]:
def numerical_density(ex):
    """Return the fraction of non-whitespace characters that are ASCII digits.

    Args:
        ex: Mapping with a "text" key holding the file contents as a string.

    Returns:
        A float in [0, 1]: the count of characters 0-9 divided by the count
        of all non-whitespace characters. Returns 0.0 when the text is empty
        or contains only whitespace, so callers (e.g. sort keys) never hit a
        division by zero.
    """
    # Drop all whitespace so indentation and blank lines do not dilute the ratio.
    txt = ''.join(ex["text"].split())
    if not txt:
        return 0.0
    ndigits = sum(txt.count(c) for c in "0123456789")
    return ndigits / len(txt)

def print_ex(example):
    """Print a short summary of one example, followed by its full text.

    The summary contains the numeric density, the character length, and the
    repository name and path stored under example["meta"]; a line of 40 '#'
    characters separates the summary from the text body.
    """
    body = example["text"]
    density = numerical_density(example)
    print(f"numeric density: {density}")
    print(f"length (characters): {len(body)}")

    meta = example["meta"]
    print(meta["max_stars_repo_name"])
    separator = "#" * 40
    print(meta["max_stars_repo_path"] + "\n" + separator + "\n")

    print(body)
    
class Printer:
    """Step through a sequence of examples from either end, one per call.

    Attributes:
        data: The indexable sequence of examples.
        index: Position the next print_head() call will display.
        rindex: Position the next print_tail() call will display.
    """

    def __init__(self, data):
        self.data = data
        self.index = 0
        self.rindex = len(data) - 1

    def _show(self, position):
        # Print the position, then the example stored there.
        print(f"index: {position}")
        print_ex(self.data[position])

    def print_head(self):
        """Display the example at `index`, then move `index` forward by one."""
        self._show(self.index)
        self.index += 1

    def print_tail(self):
        """Display the example at `rindex`, then move `rindex` back by one."""
        self._show(self.rindex)
        self.rindex -= 1

## Per-language analysis
Workflow for manually inspecting the quality of samples in a particular language. 

The Stack has many data-quality issues, so this manual inspection is important.

In [None]:
# Selects which shard file to inspect: the two values are concatenated into
# the file name "{lang}{shard}.jsonl" when the dataset is loaded.
lang = "r"
shard = "0000"

In [None]:
# Load the selected shard into memory as a list of examples.
# The encoding is pinned to UTF-8 so that non-ASCII characters in source files
# decode the same way on every platform, regardless of the locale default.
with open(f"../data_jsonl/train/{lang}{shard}.jsonl", encoding="utf-8") as f:
    ds = ndjson.load(f)

print("len: ", len(ds))

### Random Sample Analysis

In [None]:
# Shuffle a *copy* of the dataset: `ds` keeps its loaded order, so re-running
# this cell does not change what the later sorting cells see.
# A fixed seed makes the reported "index: N" samples reproducible across
# kernel restarts; change SHUFFLE_SEED to draw a different random sample.
SHUFFLE_SEED = 0
shuffle_ds = list(ds)
random.Random(SHUFFLE_SEED).shuffle(shuffle_ds)
shuffle_printer = Printer(shuffle_ds)

In [None]:
# Show the next example of the shuffled list; each run advances the cursor by one.
shuffle_printer.print_head()

### Length analysis

In [None]:
# Order a copy of the examples by text length in characters, shortest first,
# and wrap it in a Printer for stepping through either end.
length_ds = list(ds)
length_ds.sort(key=lambda example: len(example["text"]))
length_printer = Printer(length_ds)

In [None]:
# Show the next-shortest example; each run advances the head cursor by one.
length_printer.print_head()

In [None]:
# Show the next-longest example; each run moves the tail cursor back by one.
length_printer.print_tail()

### Numerical density analysis

In [None]:
# Order the examples by their numerical density, lowest first, and wrap the
# result in a Printer for stepping through either end.
num_ds = sorted(ds, key=numerical_density)
num_printer = Printer(num_ds)

In [None]:
# Show the next example with the lowest numerical density; each run advances the head cursor.
num_printer.print_head()

In [None]:
# Step the tail cursor back by one so the next print_tail() re-displays the
# example that was shown last. The value is clamped to the last valid index:
# if this cell runs before any print_tail() call (e.g. Restart & Run All),
# the cursor stays in range instead of causing an IndexError in the next cell.
num_printer.rindex = min(num_printer.rindex + 1, len(num_printer.data) - 1)

In [None]:
# Show the next example with the highest numerical density; each run moves the tail cursor back.
num_printer.print_tail()