In [None]:
import ndjson
import os
import json
import random
import math
import re
from functools import partial
from tqdm import tqdm
import matplotlib.pyplot as plt

## Defining generic data analysis utilities

In [None]:
def numerical_density(ex):
    """Return the share of non-whitespace characters in ``ex["text"]`` that are ASCII digits.

    All whitespace is stripped out before counting. The denominator is the
    number of remaining characters (digits included), floored at 1 so that an
    empty or whitespace-only text yields 0.0 instead of dividing by zero.
    """
    digits = "0123456789"
    compact = "".join(ex["text"].split())
    digit_count = sum(1 for ch in compact if ch in digits)
    return digit_count / max(len(compact), 1)

def alphabetic_density(ex):
    """Return the share of non-whitespace characters in ``ex["text"]`` that are alphabetic.

    Whitespace is removed first; a character counts as alphabetic when
    ``str.isalpha`` says so. The denominator is floored at 1, so an empty or
    whitespace-only text yields 0.0.
    """
    compact = "".join(ex["text"].split())
    letter_count = sum(map(str.isalpha, compact))
    return letter_count / max(len(compact), 1)

def print_ex(example):
    """Print a summary header for one example, followed by its full text.

    The header shows, in this order: the ``source`` metadata field (if
    present), the label "Github Diffs" when ``commit_subject`` is present,
    the numeric and alphabetic densities, the UTF-8 byte length of the text,
    and whichever repository-identifying metadata fields exist. A row of
    ``#`` characters separates the header from the text.

    ``example`` must provide the keys ``"text"`` and ``"meta"``.
    """
    body = example["text"]
    meta = example["meta"]

    if "source" in meta:
        print(meta["source"])
    if "commit_subject" in meta:
        print("Github Diffs")

    print(f"numeric density: {numerical_density(example)}")
    print(f"alphabetic density: {alphabetic_density(example)}")
    print(f"length (bytes): {len(body.encode('utf-8'))}")

    # Repository metadata is printed in a fixed order; absent fields are skipped.
    repo_fields = (
        "max_stars_repo_name",
        "max_stars_repo_path",
        "repo_name_with_owner",
        "repo_name",
    )
    for field in repo_fields:
        if field in meta:
            print(meta[field])

    print("#"*40 + "\n")
    print(body)

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

def data_viewer(your_list):
    """Display an interactive widget for paging through a list of examples.

    The UI consists of an index slider, "Next" and "Previous" buttons, and an
    output area that renders the selected example with ``print_ex``.

    Parameters
    ----------
    your_list : sequence
        Examples to browse; each element is passed to ``print_ex``.

    Returns
    -------
    None. If ``your_list`` is empty, a short message is printed and no widget
    is created.
    """
    # With no elements the slider would be built with max=-1 and the initial
    # render would index your_list[0]; bail out with a clear message instead.
    if len(your_list) == 0:
        print("data_viewer: nothing to display (empty list)")
        return

    last_index = len(your_list) - 1

    # Create the slider
    slider = widgets.IntSlider(min=0, max=last_index, step=1, description='Index:')

    # Create buttons
    next_button = widgets.Button(description="Next")
    previous_button = widgets.Button(description="Previous")

    # Button callbacks move the slider by one step, clamped to the valid range.
    def next_element(b):
        slider.value = min(slider.value + 1, last_index)

    def previous_element(b):
        slider.value = max(slider.value - 1, 0)

    next_button.on_click(next_element)
    previous_button.on_click(previous_element)

    # Create the output widget
    output = widgets.Output()

    # Re-render the selected example whenever the slider value changes.
    def update_text(change):
        with output:
            # wait=True defers clearing until new output arrives, avoiding flicker.
            clear_output(wait=True)
            print_ex(your_list[slider.value])

    slider.observe(update_text, 'value')

    # Layout
    ui = widgets.VBox([slider, next_button, previous_button, output])

    # Render the first example before any interaction happens.
    with output:
        print_ex(your_list[slider.value])

    # Display the widgets
    display(ui)

# AMPS

In [None]:
# Load one AMPS training shard (newline-delimited JSON) into a list of examples.
# The encoding is explicit so the read does not depend on the platform default.
with open("AMPS/data_jsonl/train/AMPS_00.jsonl", encoding="utf-8") as f:
    ds = ndjson.load(f)

### Random sample analysis

In [None]:
# Shuffle a copy: `shuffle_ds = ds` would only alias the list, so the in-place
# shuffle would also reorder `ds` and change it on every re-run of this cell.
shuffle_ds = list(ds)
random.shuffle(shuffle_ds)
data_viewer(shuffle_ds)

### Length analysis

In [None]:
# Browse the examples ordered from shortest to longest text (character count).
length_ds = list(ds)
length_ds.sort(key=lambda example: len(example["text"]))
data_viewer(length_ds)

# ArXiv

In [None]:
# Shard of the arXiv training split to inspect.
shard = "00"

# The encoding is explicit so the read does not depend on the platform default.
with open(f"arXiv/data_jsonl/train/arXiv_{shard}.jsonl", encoding="utf-8") as f:
    ds = ndjson.load(f)

### Random sample analysis

In [None]:
# Shuffle a copy: `shuffle_ds = ds` would only alias the list, so the in-place
# shuffle would also reorder `ds` and change it on every re-run of this cell.
shuffle_ds = list(ds)
random.shuffle(shuffle_ds)
data_viewer(shuffle_ds)

### Length analysis

In [None]:
# Browse the examples ordered from shortest to longest text (character count).
length_ds = list(ds)
length_ds.sort(key=lambda example: len(example["text"]))
data_viewer(length_ds)

### Numerical density analysis

In [None]:
# Browse the examples ordered from lowest to highest digit density.
num_ds = list(ds)
num_ds.sort(key=numerical_density)
data_viewer(num_ds)

# The Stack Code

## Full Dataset Statistics

In [None]:
# Print per-entry token counts (billions) and sizes (GB) from stats.json,
# followed by the totals accumulated over all entries.
cumsize = 0
cumtokens = 0
with open("source_code/stack-code/stats.json") as f: 
    stats = json.load(f)

for key, entry in stats.items():
    print(key.upper())

    tokens = entry["tokens"]/10**9
    cumtokens += tokens
    print(f"tokens: {tokens:.4f} B")

    size = entry["size"]/10**9
    cumsize += size
    print(f"size: {size:.4f} GB\n")

print("CUMULATIVE:")
print(f"tokens: {cumtokens:.4f} B")
print(f"size: {cumsize:.4f} GB\n")

In [None]:
# Bar chart of raw token counts per entry in `stats`, largest first.
pairs = sorted(
    ((key.title(), stats[key]["tokens"]) for key in stats),
    key=lambda pair: pair[1],
    reverse=True,  # stable, so ties keep their order from `stats`
)
labels = [label for label, count in pairs]
counts = [count for label, count in pairs]

plt.bar(labels, counts)
plt.ylabel('Tokens')
#plt.yscale('log')
plt.xticks(rotation=-90)
plt.show()

## Per-language analysis
Workflow for manually inspecting the quality of samples in a particular language. 

The Stack has many data-quality issues, so manually inspecting samples is important.

**Use the cell below to select language and shard**

In [None]:
# List the contents of the training directory (IPython shell escape).
!ls source_code/stack-code/train
# Language and shard to inspect; the next cell opens
# source_code/stack-code/train/{lang}{shard}.jsonl.
lang = "agda"
shard = "0000"
print(f"selected {lang}{shard}")

In [None]:
# Load the selected language shard (newline-delimited JSON) into a list.
# The encoding is explicit so the read does not depend on the platform default.
with open(f"source_code/stack-code/train/{lang}{shard}.jsonl", encoding="utf-8") as f:
    ds = ndjson.load(f)

print("len: ", len(ds))

### Random Sample Analysis

In [None]:
# Shuffle a copy: `shuffle_ds = ds` would only alias the list, so the in-place
# shuffle would also reorder `ds` and change it on every re-run of this cell.
shuffle_ds = list(ds)
random.shuffle(shuffle_ds)
data_viewer(shuffle_ds)

### Length analysis

In [None]:
# Browse the examples ordered from shortest to longest text (character count).
length_ds = list(ds)
length_ds.sort(key=lambda example: len(example["text"]))
data_viewer(length_ds)

### Numerical density analysis

In [None]:
# Browse the examples ordered from lowest to highest digit density.
num_ds = list(ds)
num_ds.sort(key=numerical_density)
data_viewer(num_ds)

### Alphabetic density analysis

In [None]:
# Browse the examples ordered from lowest to highest alphabetic density.
alpha_ds = list(ds)
alpha_ds.sort(key=alphabetic_density)
data_viewer(alpha_ds)

# Issues and Diffs

In [None]:
# Load the issues/diffs training file (newline-delimited JSON) into a list.
# The encoding is explicit so the read does not depend on the platform default.
with open("issues_diffs/data_jsonl/train/issues_diffs.jsonl", encoding="utf-8") as f:
    ds = ndjson.load(f)

### Random sample analysis

In [None]:
# Shuffle a copy: `shuffle_ds = ds` would only alias the list, so the in-place
# shuffle would also reorder `ds` and change it on every re-run of this cell.
shuffle_ds = list(ds)
random.shuffle(shuffle_ds)
data_viewer(shuffle_ds)

### Length analysis


In [None]:
# Browse the examples ordered from shortest to longest text (character count).
length_ds = list(ds)
length_ds.sort(key=lambda example: len(example["text"]))
data_viewer(length_ds)

### Numerical density analysis

In [None]:
# Browse the examples ordered from lowest to highest digit density.
num_ds = list(ds)
num_ds.sort(key=numerical_density)
data_viewer(num_ds)

# Stack Exchange

In [None]:
from tqdm import tqdm

# Parse the Stack Exchange file one line at a time. Iterating the file object
# streams it, whereas f.readlines() would hold every raw line in memory
# alongside the parsed records. The encoding is explicit so the read does not
# depend on the platform default.
with open("stack_exchange/data_jsonl/stack_exchange.jsonl", encoding="utf-8") as f:
    ds = [json.loads(line) for line in tqdm(f)]

len(ds)

### Random sample analysis

In [None]:
# Shuffle a copy: `shuffle_ds = ds` would only alias the list, so the in-place
# shuffle would also reorder `ds` and change it on every re-run of this cell.
shuffle_ds = list(ds)
random.shuffle(shuffle_ds)
data_viewer(shuffle_ds)

### Length analysis

In [None]:
# Browse the examples ordered from shortest to longest text (character count).
length_ds = list(ds)
length_ds.sort(key=lambda example: len(example["text"]))
data_viewer(length_ds)