# Anonymizing languages' writing systems
For each language's extracted text file (or, more precisely, the portion I have chosen for the training dataset), I compute how common each character is. The space character is always mapped to 0; the most common non-space character is mapped to 1, the second most common to 2, and so on. Newline characters are not counted. Then I replace each character with its corresponding number.

In [136]:
import pandas as pd
import pickle
import sys # to get max int
import os

In [138]:
# Directory that is listed and read from by the processing loop.
inpath = './data_chunked/'
# Directory that the pickled, anonymized chunks are written to.
outpath = './data/chunks-anon/'
# NOTE(review): `files` is not referenced in the visible code -- presumably a
# leftover from single-file testing; confirm before removing.
files = ['en.txt']

In [110]:
def make_transform(chunks):
    """Build a character -> frequency-rank mapping for a list of text chunks.

    Space is always assigned rank 0. Every other character (newlines are
    ignored) is ranked by how often it occurs across all chunks: the most
    common gets 1, the second most common gets 2, and so on. Characters with
    equal counts are ranked in the order they were first encountered, so the
    result is deterministic for a given input.

    Args:
        chunks: iterable of strings.

    Returns:
        A ``(df, trans)`` tuple. ``df`` is a DataFrame indexed by character
        (index name ``'char'``) with a single ``'ind'`` column holding the
        rank; ``trans`` is the same mapping as a plain ``dict``.
    """
    # Local import so this block does not depend on the file's import cell.
    from collections import Counter

    counts = Counter(
        c for chunk in chunks for c in chunk if c != ' ' and c != '\n'
    )

    # most_common() sorts stably by descending count, so ties keep
    # first-seen order. Space is prepended explicitly instead of being
    # forced to the top with a sentinel count.
    ranked = [' '] + [c for c, _ in counts.most_common()]
    trans = {c: rank for rank, c in enumerate(ranked)}

    df = pd.DataFrame(
        {'ind': list(trans.values())},
        index=pd.Index(ranked, name='char'),
    )
    return df, trans

In [132]:
def apply_transform(chunks, trans):
    """Replace every character of every chunk with its number from ``trans``.

    A single trailing newline, if present, is removed from each chunk before
    translation. A chunk without a trailing newline (e.g. the last line of a
    file) is translated in full rather than losing its final character.

    Args:
        chunks: iterable of strings.
        trans: dict mapping each character to its integer code.

    Returns:
        A list with one list of integers per chunk.

    Raises:
        KeyError: if a chunk contains a character that is not in ``trans``.
    """
    chunks_trans = []

    for chunk in chunks:
        # Only drop the last character when it really is the line terminator.
        text = chunk[:-1] if chunk.endswith('\n') else chunk
        chunks_trans.append([trans[c] for c in text])

    return chunks_trans

In [139]:
# For every language file in `inpath`: read its chunks (one per line), build
# the character -> rank transform from those chunks, apply it, and pickle the
# resulting list of integer lists to a file of the same name in `outpath`.
os.makedirs(outpath, exist_ok=True)  # avoid failing on a missing output directory
for f in os.listdir(inpath):
    # Skip the README and hidden files (names starting with '.').
    if f == 'README.md' or f.startswith('.'):
        continue
    print(f)

    # Explicit encoding so non-Latin scripts are decoded the same way on every
    # platform. NOTE(review): assumes the chunk files are UTF-8 -- confirm.
    with open(os.path.join(inpath, f), 'r', encoding='utf-8') as infile:
        chunks = infile.readlines()

    df, trans = make_transform(chunks)
    chunks_trans = apply_transform(chunks, trans)

    # Context managers guarantee the handles are closed even if a step raises.
    with open(os.path.join(outpath, f), 'wb') as outfile:
        pickle.dump(chunks_trans, outfile)

kaa.txt
glk.txt
el.txt
gd.txt
io.txt
myv.txt
sv.txt
sa.txt
sw.txt
pl.txt
mni.txt
fa.txt
bjn.txt
eml.txt
koi.txt
kab.txt
rue.txt
eo.txt
mzn.txt
tcy.txt
su.txt
diq.txt
sc.txt
shn.txt
azb.txt
ang.txt
ja.txt
hi.txt
jv.txt
tyv.txt
en.txt
gor.txt
gan.txt
lez.txt
lij.txt
pag.txt
pap.txt
pms.txt
mwl.txt
gu.txt
ka.txt
kv.txt
awa.txt
sq.txt
ru.txt
kw.txt
ace.txt
nqo.txt
ga.txt
gv.txt
fr.txt
hy.txt
ku.txt
sd.txt
skr.txt
rw.txt
sr.txt
se.txt
hsb.txt
vls.txt
xal.txt
inh.txt
udm.txt
lad.txt
krc.txt
co.txt
ms.txt
zea.txt
tg.txt
jbo.txt
vo.txt
min.txt
ceb.txt
la.txt
lv.txt
om.txt
mr.txt
cy.txt
af.txt
nap.txt
war.txt
wuu.txt
as.txt
bh.txt
lt.txt
mg.txt
sat.txt
tr.txt
te.txt
ts.txt
dty.txt
lb.txt
crh.txt
ar.txt
szl.txt
av.txt
ny.txt
tpi.txt
nn.txt
scn.txt
ary.txt
hyw.txt
ur.txt
vi.txt
ta.txt
sco.txt
stq.txt
mt.txt
no.txt
lg.txt
ban.txt
ab.txt
bn.txt
zh.txt
ug.txt
arz.txt
wo.txt
tt.txt
nl.txt
bo.txt
szy.txt
README.md
bug.txt
an.txt
ay.txt
li.txt
or.txt
nv.txt
uk.txt
tn.txt
yi.txt
cdo.txt
sah.txt
ml.txt
o