# Final data prep

(NEW CONTINUING)

This is the second notebook in the project.

Basically, all the data was gathered already (that process was shown in the previous notebook). So, this code takes care of nearly everything in between the data gathering and the actual machine learning. I create a version of the data with the languages' writing systems anonymized, and export both the anonymized and non-anonymized data to files that can be read by the ML notebook.

## Table of contents
- [Anonymizing languages' writing systems](#Anonymizing-languages'-writing-systems)
- [Prepare data for ML](#Prepare-data-for-ML)

In [1]:
import pandas as pd
import pickle
import sys # to get max int
import os

## Anonymizing languages' writing systems

For each language's extracted text file (or more precisely a portion I have chosen for the train dataset), I compute how common each character is. The space character is always mapped to 0; the most common non-space character is mapped to 1, the second most common to 2, and so on. Only the first 256 codes (0-255) are kept, and any character outside that set is encoded as 255. Then, I replace the characters with their number correspondences, so each chunk becomes a sequence of bytes.

In [2]:
# Directory holding the raw (non-anonymized) text chunks, one file per language.
inpath = './data/chunks-nonanon/'
# Directory that receives the pickled, anonymized chunks (same file names).
outpath = './data/chunks-anon/'
# Sample list for debugging; not referenced by the code visible in this notebook.
files = ['hr.txt']

In [3]:
def make_transform(chunks, max_symbols=256):
    """Build a frequency-rank encoding for the characters found in ``chunks``.

    The space character always receives rank 0. Every other character except
    the newline is ranked by how often it occurs: the most frequent one gets
    1, the next one 2, and so on. Characters with equal counts are ranked in
    the order in which they first appear, so the mapping is reproducible.

    Args:
        chunks: iterable of strings, one text chunk each.
        max_symbols: number of ranks to keep, counting the space. The default
            of 256 keeps every rank representable as a single byte.

    Returns:
        A ``(df, trans)`` tuple. ``df`` is a DataFrame indexed by character
        (index name ``'char'``) with one ``'ind'`` column holding the rank;
        ``trans`` is the same mapping as a plain ``{char: rank}`` dict.
    """
    # Local import so the function does not rely on the notebook's import cell.
    from collections import Counter

    counts = Counter()
    for chunk in chunks:
        counts.update(chunk)  # a str is counted character by character

    # Space has a reserved rank and newlines are never encoded, so neither
    # takes part in the frequency ranking.
    counts.pop(' ', None)
    counts.pop('\n', None)

    # most_common() orders equal counts by first appearance, which makes the
    # ranking deterministic even when several characters tie.
    ranked = [' '] + [char for char, _ in counts.most_common()]
    ranked = ranked[:max_symbols]

    # NOTE: apply_transform encodes characters missing from the mapping as
    # 255. With the default cap, 255 is also the last rank kept here, so that
    # character and all rarer ones are indistinguishable after encoding.
    trans = {char: rank for rank, char in enumerate(ranked)}
    df = pd.DataFrame({'ind': list(trans.values())},
                      index=pd.Index(ranked, name='char'))
    return df, trans

In [4]:
def apply_transform(chunks, trans, unknown=255):
    """Encode every chunk as ``bytes`` using the character mapping ``trans``.

    Args:
        chunks: iterable of strings, typically lines as returned by
            ``readlines()`` (each usually ending in ``'\\n'``).
        trans: dict mapping a character to its integer code (0-255).
        unknown: code used for characters that are not in ``trans``.

    Returns:
        A list with one ``bytes`` object per chunk, in input order.

    Raises:
        ValueError: if a code is outside ``range(256)`` (raised by ``bytes``).
    """
    chunks_trans = []
    for chunk in chunks:
        # Remove the line terminator only when it is really there: the last
        # line of a file may lack it, and blindly dropping the final
        # character would then lose real text.
        text = chunk[:-1] if chunk.endswith('\n') else chunk
        chunks_trans.append(bytes(trans.get(c, unknown) for c in text))
    return chunks_trans

In [5]:
# For every language file: learn its character ranking, encode its chunks,
# and pickle the encoded chunks under the same file name in `outpath`.
for f in os.listdir(inpath):
    if f == 'README.md' or f.startswith('.'):
        continue  # skip the README and hidden files

    # `with` guarantees the handle is closed even if reading fails.
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used - confirm it matches how the chunk files were written.
    with open(os.path.join(inpath, f), 'r') as file:
        chunks = file.readlines()

    df, trans = make_transform(chunks)
    chunks_trans = apply_transform(chunks, trans)

    with open(os.path.join(outpath, f), 'wb') as file:
        pickle.dump(chunks_trans, file)

Example of an anonymized chunk:

In [6]:
# `chunks_trans` is overwritten on every loop iteration above, so this shows
# the first encoded chunk of whichever file was processed last.
chunks_trans[0]

b'\x0c\x06\x05\x00\x02\x03\t\x01\x00\x08\x02\x05\x01\x03\x00\x07\x01\x00\x04\x0e\r\x02\x08\x06\x03\x00\x12\x08\x04\t\x02\x03\x00\x02\x00\x04\t\x01\x07\x04\x01\x05\x00\x04\x01\x00\x10\x01\x05\x04\x01\x00\x0c\x06\x05\t\x08\x06\x07\x01\x00\n\x02\x00\r\x01\x08\t\x02\x03\x00\n\x02\x00\x07\x01\x00\x0c\x06\x03\t\x01\x00\x02\x00\x04\x01\x00\x04\x05\x03\t\x04\t\x0b\x04\x00\x07\x01\x00\x0c\x06\x07\x06\x05\x04\x01\x03\x00\x12\x08\x04\t\x02\x03\x00\x02\x00\x04\t\x01\x07\x04\x01\x05\x00\n\x02\x00\x17\x06\x0e\x01\x07\x04\x07\x01\x05\n\x00\x1f\x02\x00\x17\x06\x0e\x01\x07\x04\x01\x00\x05\x06\x00\x04\x01\x00\x01\x0f\x02\x00\x0b\x05\x00\x10\x06\x0f\x02\x08\x05\x01\x00\x0c\x0b\x01\x07\x00\x0c\x06\x05\t\x08\x06\x07\x01\x00\x07\x01\x00\r\x01\x04\x03\x00\x04\x05\t\x02\x08\x01\x00\x15\x05\x00\x07\x01\x00\x05\x06\x08\n\x02\x00\x07\x01\x00\x08\x02\x13\x04\x06\x05\x00\n\x02\x00\x17\x06\x0e\x01\x07\x04\x07\x01\x05\n\x00\x04\x01\x00\n\x02\x0c\x07\x01\x08\x01\x00\x01\x0b\t\x06\x05\x06\x0e\x04\x01\x00\x02\x05\x00\x

## Prepare data for ML
Each language is represented in its own file. This code reads all the files and puts them into one dataframe. Works for anon or non-anon.

In [7]:
def prepdata(inpath, outfile, anon, chunk_len=500):
    """Collect the per-language chunk files into one DataFrame and pickle it.

    Every file in ``inpath`` (except ``README.md`` and hidden files) is read
    as one language; the language code is the file name up to its first dot
    (the whole name if it contains no dot). Files are processed in sorted
    name order, so the row order is reproducible and identical for the
    anonymized and non-anonymized directories when they hold the same names.

    Args:
        inpath: directory containing one chunk file per language.
        outfile: path of the pickle file to write the combined DataFrame to.
        anon: True if the files are pickled lists of encoded chunks, False if
            they are plain text files with one chunk per line.
        chunk_len: expected chunk length, used only by the printed sanity
            check.

    Returns:
        None. The DataFrame (columns ``'text'`` and ``'lang'``) is written to
        ``outfile`` and a one-line sanity check is printed.
    """
    frames = []
    for f in sorted(os.listdir(inpath)):
        if f == 'README.md' or f.startswith('.'):
            continue  # skip the README and hidden files
        path = os.path.join(inpath, f)
        if anon:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # files produced by this project.
            with open(path, 'rb') as fh:
                chunks = pickle.load(fh)
        else:
            with open(path, 'r') as fh:
                # Strip the newline only when present, so a final line
                # without a terminator does not lose its last character.
                chunks = [line[:-1] if line.endswith('\n') else line
                          for line in fh]
        lang = f.split('.', 1)[0]
        frames.append(pd.DataFrame({'text': chunks,
                                    'lang': [lang] * len(chunks)}))

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; an empty directory yields an
        # empty frame with the expected columns instead.
        df = pd.DataFrame(columns=['text', 'lang'])
    del frames  # release the per-language frames before pickling

    with open(outfile, 'wb') as fh:
        pickle.dump(df, fh)

    n_bad = sum(len(text) != chunk_len for text in df['text'])
    print('anon:', anon, '\tsanity check:', df.shape[0], 'chunks.',
          n_bad, 'not with', chunk_len, 'chars')

In [8]:
# Anonymized data: combine the pickled, encoded chunks of every language
# into one DataFrame and pickle it for the ML notebook.
# NOTE(review): the output name contains "shuf", but prepdata does not
# shuffle rows - confirm whether shuffling is expected to happen elsewhere.
inpath = './data/chunks-anon/'
outfile= './data/chunks_shufanon.pkl'
prepdata(inpath=inpath, outfile=outfile, anon=True)

anon: True 	sanity check: 1914517 chunks. 123 not with 500 chars


In [9]:
# Non-anonymized data: combine the raw text chunks of every language into
# one DataFrame and pickle it for the ML notebook.
inpath = './data/chunks-nonanon/'
outfile= './data/chunks.pkl'
prepdata(inpath, outfile, False)

anon: False 	sanity check: 1914517 chunks. 123 not with 500 chars
