In [138]:
import ndjson
import random
import re
from functools import partial

**Problems with the stack**
- Issue: Matlab is wrong. There are only 111 matlab files that match the regex `[a-df-zA-Z]`. Looks like most of the matlab files are just arrays saved as text. Very little of the actual code was captured. 
    - [x] Fix 1: Regex filter to delete arrays
    - [ ] Fix 2: Find rest of matlab files
- Issue: The R data contains MacOS "resource fork" files that aren't related to R at all. 
    - [x] Fix: filter out resource forks
- Issue: Wolfram Mathematica has three file formats: `.wls` (Wolfram Language script), handled ok; `.m` (Wolfram Language package), handled ok; and `.nb` (notebook), whose plaintext has a bunch of noise. Notebooks need to be exported as `.wls`.
    - [ ] Fix: convert notebooks to tex or wls
- Issue: There is one mathematica repo, `dendaxD/QAOA-MaxCut-amplitudes`, that contains about half of all mathematica files in the stack. All these files are extremely similar and should be excluded on data diversity grounds.
    - [ ] Fix: filter out this repo. 

Languages the stack does ok:
- Lean is fine
- Julia is fine (possibly want to remove files that meet jsonl spec)
- Sage is fine (apart from stuff in problems to investigate)

Problems to investigate:
- .sagews files have a bunch of hashes all over the place. How to get rid of this?
- .sage files tend to have a bunch of long strings of hardcoded numbers. Is this ok? e.g. `ClathomasPrime/CompetitiveStableMatching:Plotting/plots.sage`

In [677]:
def matlab_rexp(example, rexp):
    """Report whether ``rexp`` matches anywhere in an example's content.

    Args:
        example: Mapping with a ``"content"`` key holding the file text.
        rexp: Compiled regular expression (any object exposing ``search``).

    Returns:
        bool: True if ``rexp.search`` finds a match in ``example["content"]``,
        False otherwise.
    """
    # Search with the pattern that was passed in. The previous body read the
    # module-level `h` instead, so the `rexp` argument was silently ignored
    # and the function failed whenever `h` had not been defined yet.
    return bool(rexp.search(example["content"]))

# Matches any single ASCII letter except lowercase "e".
# NOTE(review): "e" is presumably left out so that exponents in numeric
# literals (e.g. 1e-5) do not count as code — confirm.
h = re.compile('[a-df-zA-Z]')
# One-argument predicate: matlab_fix(example) calls matlab_rexp(example, rexp=h).
matlab_fix = partial(matlab_rexp, rexp=h)

def r_fix(example):
    """Return True unless the example's content contains a resource-fork marker.

    Args:
        example: Mapping with a ``"content"`` key holding the file text.

    Returns:
        bool: False when the substring "/* Resource fork" occurs in the
        content, True otherwise.
    """
    marker = "/* Resource fork"
    has_marker = marker in example["content"]
    return not has_marker

def mathematica_fix(example):
    """Return True for examples that do not come from the excluded repo.

    Args:
        example: Mapping with a ``"max_stars_repo_name"`` key.

    Returns:
        bool: False when the repo name equals
        "dendaxD/QAOA-MaxCut-amplitudes", True otherwise.
    """
    excluded_repo = "dendaxD/QAOA-MaxCut-amplitudes"
    repo_name = example["max_stars_repo_name"]
    return repo_name != excluded_repo

In [678]:
# Load the first shard for the chosen language, then keep only the
# examples that pass mathematica_fix, reporting the size before and after.
lang = "mathematica"
with open(f"stack-code/{lang}/0000000.jsonl") as f:
    ds = ndjson.load(f)

print("before filter len: ", len(ds))
ds = [example for example in ds if mathematica_fix(example)]
print("after filter len: ", len(ds))

before filter len:  41260
after filter len:  21810


In [581]:
# Reset the browsing index used when stepping through `ds`.
i = 0
# Shuffle in place with a dedicated, fixed-seed generator so the order is the
# same after every kernel restart (a bare random.shuffle gave a different order
# each run, so sampled examples could not be revisited). Using a separate
# Random instance leaves the global `random` state untouched.
random.Random(0).shuffle(ds)

In [676]:
# Advance to the next example in `ds` and print its index, repo name,
# file path, a 40-character "#" rule, and then the file content.
i += 1
text = ds[i]
separator = "#" * 40
print(i, text["max_stars_repo_name"], sep="\n")
print("\n".join([text["max_stars_repo_path"], separator, ""]))
print(text["content"])

95
dendaxD/QAOA-MaxCut-amplitudes
lines/1round/mathematica-files/16v2 1 1 1 3 2 3 2 1.nb
########################################

nqubits = 16;
name = "16v2 1 1 1 3 2 3 2 1";
nstates = 4;

amplitude[x_,y_] := (Exp[-15 I y] (1 (I Sin[x])^6 Cos[x]^10 + 1 (I Sin[x])^10 Cos[x]^6) + Exp[-13 I y] (7 (I Sin[x])^7 Cos[x]^9 + 7 (I Sin[x])^9 Cos[x]^7 + 3 (I Sin[x])^6 Cos[x]^10 + 3 (I Sin[x])^10 Cos[x]^6 + 8 (I Sin[x])^8 Cos[x]^8 + 1 (I Sin[x])^5 Cos[x]^11 + 1 (I Sin[x])^11 Cos[x]^5) + Exp[-11 I y] (11 (I Sin[x])^5 Cos[x]^11 + 11 (I Sin[x])^11 Cos[x]^5 + 45 (I Sin[x])^7 Cos[x]^9 + 45 (I Sin[x])^9 Cos[x]^7 + 2 (I Sin[x])^4 Cos[x]^12 + 2 (I Sin[x])^12 Cos[x]^4 + 21 (I Sin[x])^6 Cos[x]^10 + 21 (I Sin[x])^10 Cos[x]^6 + 52 (I Sin[x])^8 Cos[x]^8) + Exp[-9 I y] (109 (I Sin[x])^6 Cos[x]^10 + 109 (I Sin[x])^10 Cos[x]^6 + 226 (I Sin[x])^8 Cos[x]^8 + 185 (I Sin[x])^7 Cos[x]^9 + 185 (I Sin[x])^9 Cos[x]^7 + 38 (I Sin[x])^5 Cos[x]^11 + 38 (I Sin[x])^11 Cos[x]^5 + 9 (I Sin[x])^4 Cos[x]^12 + 9 (I Sin[x])^12 Cos

In [576]:
# Print the repo name, path and content of one fixed example, one per line.
j = 84
for field in ("max_stars_repo_name", "max_stars_repo_path", "content"):
    print(ds[j][field])

ClathomasPrime/CompetitiveStableMatching
Plotting/plots.sage
# λ > [ (u,v) | (u, s) <- M.toList cov, v <- S.toList s ]
lattice = [(0,1),(0,2),(0,3),(0,4),(1,5),(1,6),(1,7),(2,5),(2,8),(2,9),(3,6),(3,8),(3,10),(4,7),(4,9),(4,10),(5,11),(5,12),(5,13),(5,14),(6,13),(6,15),(7,14),(7,15),(8,13),(8,16),(9,14),(9,16),(10,15),(10,16),(10,17),(10,18),(11,19),(11,20),(11,21),(12,19),(12,22),(12,23),(13,20),(13,22),(13,24),(14,21),(14,23),(14,24),(15,24),(15,25),(15,26),(16,24),(16,27),(16,28),(17,25),(17,27),(17,29),(18,26),(18,28),(18,29),(19,30),(19,31),(19,32),(19,33),(20,32),(20,34),(21,33),(21,34),(22,32),(22,35),(23,33),(23,35),(24,34),(24,35),(24,36),(24,37),(25,36),(25,38),(26,37),(26,38),(27,36),(27,39),(28,37),(28,39),(29,38),(29,39),(29,40),(29,41),(30,42),(30,43),(30,44),(31,43),(31,45),(31,46),(32,44),(32,45),(32,47),(33,42),(33,46),(33,47),(34,47),(34,48),(34,49),(35,47),(35,50),(35,51),(36,48),(36,50),(36,52),(37,49),(37,51),(37,52),(38,52),(38,53),(38,54),(39,52),(39,55),(39,56),

In [101]:
# `matlab_filter` is not defined anywhere in this notebook; the predicate built
# from matlab_rexp is named `matlab_fix`, and it already returns a bool.
matlab_fix(ds[2])

False