In [2]:
import numpy as np
import pandas as pd

In [7]:
def reformat(infile, outfile, dictfile, min_year=1800):
    """Convert one raw 1-gram file into a dense per-year count matrix and a word list.

    The input is read as a header-less, tab-separated, latin-1 decoded table
    with columns ``1gram``, ``year`` and ``occ``. Consecutive rows sharing the
    same ``1gram`` are treated as one word.
    NOTE(review): this assumes all rows of a word are contiguous and sorted by
    ascending year -- confirm against the data files.

    A word is kept only if
      * re-encoding it as latin-1 and decoding as UTF-8 reproduces the same
        string (in practice: it is plain ASCII), and
      * no two consecutive rows of the word are 2 or more years apart.

    Parameters
    ----------
    infile : str
        Path of the raw tab-separated file.
    outfile : str or None
        If truthy, one comma-separated row per kept word is APPENDED to this
        file; column ``k`` holds the count for year ``min_year + k`` and the
        last column is the largest year present anywhere in ``infile``.
        Years with no row are written as 0; rows before ``min_year`` are dropped.
    dictfile : str or None
        If truthy, the kept words are APPENDED to this file, one per line, in
        the same order as the rows written to ``outfile``.
    min_year : int, optional
        First year (column 0) of the count matrix. Defaults to 1800.

    Returns
    -------
    None
    """
    raw = pd.read_csv(infile, delimiter='\t', names=["1gram", "year", "occ"],
                      converters={"1gram": str, "year": int, "occ": int},
                      engine='c', quoting=3, encoding='latin-1')

    words = np.asarray(raw["1gram"], dtype=object)
    years = np.asarray(raw["year"], dtype=np.int64)
    counts = np.asarray(raw["occ"], dtype=np.int64)

    if words.size == 0:
        # nothing to group; leave the output files untouched
        return

    # Row ranges [start, stop) of each word: a new word starts wherever the
    # 1gram differs from the row before it (row 0 always starts a word).
    starts = np.concatenate(([0], np.flatnonzero(words[1:] != words[:-1]) + 1))
    stops = np.append(starts[1:], words.size)

    # only use data from words without "wacky" characters and that have appeared continuously
    usable_inds = []
    for index, (start, stop) in enumerate(zip(starts, stops)):
        word = words[start]
        try:
            is_plain = word == word.encode("latin-1").decode("utf-8")
        except UnicodeDecodeError:
            print(f'WARNING - unicode decode error, ignoring word {index}...')
            continue
        # np.diff is empty for single-row words, which therefore count as continuous
        if is_plain and np.all(np.diff(years[start:stop]) < 2):
            usable_inds.append(index)

    if outfile:
        max_year = int(years.max())
        n_years = max(max_year - min_year + 1, 0)

        # axes (word, year); years without a row stay at zero.
        # int64 so that very large counts cannot wrap around.
        full_counts_each_word = np.zeros((len(usable_inds), n_years), dtype=np.int64)

        for row, index in enumerate(usable_inds):
            rows_of_word = slice(starts[index], stops[index])
            word_years = years[rows_of_word]
            in_range = word_years >= min_year  # never exceeds max_year by construction
            full_counts_each_word[row, word_years[in_range] - min_year] = counts[rows_of_word][in_range]

        # append, so several input files can be accumulated into one output
        with open(outfile, 'ba') as f:
            np.savetxt(f, full_counts_each_word, fmt='%u', delimiter=',')

    if dictfile:
        # write words to ordered "dictionary": exactly one line per row of the
        # count matrix, in the same order
        with open(dictfile, 'a') as f:
            for index in usable_inds:
                f.write(words[starts[index]])
                f.write('\n')

In [6]:
outfile = "../ngram_data/full_counts_1800.csv"
dictfile = "../ngram_data/py_dict_1800.txt"

# reformat() opens both outputs in append mode, so start from empty files;
# otherwise re-running this cell would duplicate every row of the previous run.
for stale_path in (outfile, dictfile):
    with open(stale_path, "w"):
        pass

# process the ten input shards in order, appending each to the shared outputs
for i in range(10):
    name = f"../ngram_data/googlebooks-eng-1M-1gram-20090715-occonly{i}.csv"
    reformat(name, outfile, dictfile)
    print(i)

0
1
2
3
4
5
6
7


8
9
