In [4]:
import ndjson
import json
import random
import re
from functools import partial
from tqdm import tqdm
import matplotlib.pyplot as plt

# The Stack Code

Stack stats:

In [2]:
# Load the per-language statistics for the Stack code subset.
with open("stack-code/stats.json") as stats_file:
    stats = json.load(stats_file)

# Raw counts are divided by 10**9 so they print as billions of tokens / GB.
BILLION = 10**9

# Running totals across all languages, in the same scaled units.
cumtokens = 0
cumsize = 0

for key, entry in stats.items():
    print(key.upper())

    tokens = entry["neox_tokens"] / BILLION
    cumtokens += tokens
    print(f"tokens: {tokens:.4f} B")

    size = entry["size"] / BILLION
    cumsize += size
    print(f"size: {size:.4f} GB\n")

print("CUMULATIVE:")
print(f"tokens: {cumtokens:.4f} B")
print(f"size: {cumsize:.4f} GB\n")



MATLAB
tokens: 0.0001 B
size: 0.0003 GB

JULIA
tokens: 0.6699 B
size: 1.7519 GB

R
tokens: 0.1243 B
size: 0.2889 GB

SAGE
tokens: 0.0063 B
size: 0.0148 GB

MATHEMATICA
tokens: 0.8235 B
size: 1.7000 GB

MAPLE
tokens: 0.0085 B
size: 0.0154 GB

GAP
tokens: 0.0053 B
size: 0.0126 GB

LEAN
tokens: 0.0695 B
size: 0.1628 GB

ISABELLE
tokens: 0.0393 B
size: 0.0989 GB

PYTHON
tokens: 6.8227 B
size: 21.0366 GB

C
tokens: 0.0254 B
size: 0.0680 GB

C++
tokens: 1.3958 B
size: 4.2658 GB

TEX
tokens: 0.6872 B
size: 2.1816 GB

CUMULATIVE:
tokens: 10.6778 B
size: 31.5977 GB



In [None]:
# (display name, NeoX token count) per language, largest count first.
pairs = sorted(
    ((lang_key.title(), stats[lang_key]["neox_tokens"]) for lang_key in stats),
    key=lambda pair: pair[1],
    reverse=True,
)

bar_labels = [label for label, _ in pairs]
bar_heights = [count for _, count in pairs]

# Log scale on the y axis so small and large languages are both visible.
plt.bar(bar_labels, bar_heights)
plt.ylabel('Tokens')
plt.yscale('log')
plt.xticks(rotation=-60)
plt.show()

**Problems with the stack**
- Issue: Matlab is wrong. There are only 111 matlab files that match the regex `[a-df-zA-Z]`. Looks like most of the matlab files are just arrays saved as text. Very little of the actual code was captured. 
    - [x] Fix 1: Regex filter to delete arrays
    - [ ] Fix 2: Find rest of matlab files
- Issue: The R data contains MacOS "resource fork" files that aren't related to R at all. 
    - [x] Fix: filter out resource forks
- Issue: .sagews files have a bunch of hashes all over the place.
    - [ ] Fix: figure out how to delete hashes, or render notebooks. 
- Issue: .sage files tend to have a bunch of long strings of hardcoded numbers. Is this ok? e.g. `ClathomasPrime/CompetitiveStableMatching:Plotting/plots.sage`
- Issue: Wolfram Mathematica has three file formats: `.wls`: Wolfram language script, handled ok; `.m`: Wolfram language package, handled ok; `.nb`: notebook, the plaintext has a bunch of noise. Need to export as `.wls`.
    - [ ] Fix: convert notebooks to tex or wls
- Issue: There is one mathematica repo, `dendaxD/QAOA-MaxCut-amplitudes`, that contains about half of all mathematica files in the stack. All these files are extremely similar and should be excluded on data diversity grounds
    - [x] Fix: filter out this repo. 
- Issue: Some maple files are actually xml
    - [x] Fix: filter out xml
- Issue: Lots of auto-generated tex files in directories called `latex`. 
    - [x] Fix: remove these

Languages the stack does ok:
- Lean is fine
- Julia is fine (possibly want to remove files that meet jsonl spec)
- Python is clean (maybe get rid of Chinese characters?)

I'm not sure if my C/C++ filtering is good at all. Am I getting too many `.h` files?

Do we want Chinese in our Python?

Another issue to consider: Non-Latin characters, e.g. Chinese

In [1265]:
# Language whose first shard is inspected in the cells below.
lang = "python"

shard_path = f"stack-code/{lang}/0000000.jsonl"
with open(shard_path) as shard_file:
    ds = ndjson.load(shard_file)

# Number of examples loaded from the shard.
print("len: ", len(ds))

len:  100000


In [1276]:
def print_ex(example):
    """Print one dataset example: repo name, file path, a separator, then the content.

    Args:
        example: mapping with the keys "max_stars_repo_name",
            "max_stars_repo_path" and "content".

    Raises:
        KeyError: if any of those keys is missing from ``example``.
    """
    # Read from the argument, not from a notebook-level global, so the function
    # prints the record it was actually given.
    print(example["max_stars_repo_name"])
    print(example["max_stars_repo_path"] + "\n" + "#"*40 + "\n")
    print(example["content"])

In [1266]:
# Seed the RNG so the shuffled browsing order is the same after a kernel
# restart followed by Run All.
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)
random.shuffle(ds)  # in place: `ds` itself is reordered

# Cursor into the shuffled dataset, used when paging through examples.
i = 0

In [1277]:
# Advance the cursor by one and display the example at the new position.
# Each run of this cell moves one step further into `ds`, so re-running it
# pages through the dataset rather than repeating the same example.
i += 1
text = ds[i]
print(i)  # index of the example shown below
print_ex(text)

10
samesense/chilin
software/mdseqpos/lib/pwmclus_motif_comp.py
########################################

#!/usr/bin/env python
#coding: utf-8

"""Reference: http://nar.oxfordjournals.org/content/early/2013/12/23/nar.gkt1302.full

Created by: Jian Ma 2014-03-15
Modiied by: Jian Ma 2014-04-14
"""
import math
import numpy
import mdseqpos
#import MotifParser as mp

def sum_IC(m1):
    """sum of IC of each column in the matrix
    """
    ic = [IC(t) for t in m1]
    return sum(ic)

def IC(v1):
    """IC of a vector
    """
    total = sum(v1)
    v1 = [t * 1.0 / total for t in v1]
    return 2 + sum([t * math.log(t, 2) for t in v1])

def pcc_vector(v1, v2):
    """Pearson Correlation Coefficient for 2 vectors
    """
    len1 = len(v1)
    len2 = len(v2)
    if len1 != len2:
        return None
    else:
        length = len1
    avg1 = 1.0 * sum(v1) / len(v1)
    avg2 = 1.0 * sum(v2) / len(v2)
    dxy = [(v1[i] - avg1) * (v2[i] - avg2) for i in range(length)]
    dx2 = [(v1[i] - avg1) **

In [1263]:
# Dump the repo name, file path and raw content of one fixed example.
j = 10
record = ds[j]
for field in ("max_stars_repo_name", "max_stars_repo_path", "content"):
    print(record[field])

nicolair/maths-cours
C2195.tex
\input{courspdf.tex}
\debutcours{Approximations des zéros d'une fonction}{alpha}

L'approximation des zéros (ou racines) d'une fonction comporte deux temps : la séparation des racines et l'approximation proprement dite.\newline
La séparation des racines consiste à former des intervalles sur lesquels la restriction de la fonction a de bonnes propriétés et admet une seule racine. Les méthodes proposées ici ne portent que sur les manières de former des valeurs approchées de l'unique zéro dans l'intervalle considéré.\newline
Dans les trois cas, on supposera que la fonction est strictement croissante sur un intervalle $[a,b]$ avec $f(a)<0$ et $f(b)>0$.

\section{Dichotomie}
La méthode de dichotomie repose sur le diagramme suivant et se met en oeuvre très facilement informatiquement. Il est à noter que l'on dispose automatiquement d'une majorations de l'erreur car après $n$ itérations, la racine est entre $a$ et $b$ avec 
\begin{displaymath}
 0<b-a=\frac{b-a}{2

In [1278]:
# Print every example in `ds` whose "ext" field is "ipynb".
for x in ds: 
    if x["ext"]=="ipynb":
        print_ex(x)