In [2]:
import numpy as np
import pandas as pd

In [45]:
def reformat(infile, outfile, dictfile):
    
    raw = pd.read_csv(infile, delimiter='\t', names=["1gram", "year", "occ"], 
                      converters={"1gram":lambda x: str(x), "year":lambda x: int(x), "occ":lambda x: int(x)},
                      engine='c', quoting=3, encoding='latin-1')
    
    # break up each word into its own sub-array
    break_inds = np.where(raw["1gram"] != np.roll(raw["1gram"], 1))[0][1:]
    
    # only use data from words without "wacky" characters
    usable_inds = []
    for index in break_inds:
        if raw["1gram"][index] == raw["1gram"][index].encode("latin-1").decode("utf-8"):
            usable_inds.append(index)
    
    if outfile:
        # writing the counts for each word
        years = np.array(raw["year"])
        counts = np.array(raw["occ"])

        years_each_word = np.split(years, usable_inds)
        counts_each_word = np.split(counts, usable_inds)
    
        # fill in zeroes on skipped years
        #min_year = np.min(years)
        min_year = 1800
        max_year = np.max(years)

        full_years = np.arange(min_year, max_year+1)

        # axes (word, year)
        # try filling with 1 not 0...
        full_counts_each_word = np.zeros((len(counts_each_word), full_years.size), dtype=np.int32)

        # not ideal, but only have to run this once...
        for i in range(len(years_each_word)):
            # for every word's subarray

            for j in range(len(years_each_word[i])):
                # for every count in that subarray
                if years_each_word[i][j] < min_year:
                    continue
                ind = np.where(years_each_word[i][j] == full_years)[0][0]
                full_counts_each_word[i, ind] = counts_each_word[i][j]

        # write counts
        with open(outfile, 'ba') as f:
            np.savetxt(f, full_counts_each_word, fmt='%u', delimiter=',')
    
    if dictfile:

        unique_words = np.array(raw["1gram"])[usable_inds]
        unique_words = np.hstack((raw["1gram"].head(1), unique_words))
        
        print(unique_words)

        # write words to ordered "dictionary"
        with open(dictfile, 'a+') as f:
            for word in unique_words:
                f.write(word)
                f.write('\n')

In [46]:
outfile = "../ngram_data/full_counts_1800.csv"
dictfile = "../ngram_data/py_dict_1800.txt"
for i in range(10):
    name = f"../ngram_data/googlebooks-eng-1M-1gram-20090715-occonly{i}.csv"
    reformat(name, False, dictfile)
    print(i)

['#' '$0.000' '$0.003' ... 'zzy' 'zzz' 'zzzzzzzz']
0
['$' '$0.001' '$0.011' ... 'zzl' 'zzviii' 'zzzzzzz']
1
['$0.007' '$0.0075' '$0.026' ... 'zz8' 'zzs' 'zzzzz']
2
['$0.00' '$0.006' '$0.01' ... 'zzx' 'zzzii' '{']
3
['$0.002' '$0.004' '$0.008' ... 'zzt' 'zzzi' 'zzziii']
4
['$0.0' '$0.057' '$0.058' ... 'zzo' 'zzxiii' 'zzzzzzzzzz']
5
['$0.0005' '$0.009' '$0.013' ... 'zythos' 'zziz' 'zzzz']
6
['$0' '$0.015' '$0.023' ... 'zzr' 'zzzzzzzzz' '|']
7
['"""ê\x11"' '$0.0025' '$0.005' ... 'zzled' 'zzvi' 'zzxi']
8
['!' '$0.017' '$0.042' ... 'zzzzzz' '}' '~']
9
