In [2]:
import numpy as np
import pandas as pd

In [3]:
def reformat(infile, outfile, dictfile, start_year=1800, end_year=2008):
    """Append the yearly counts of the usable words of one 1-gram file.

    ``infile`` is read as a header-less, tab-separated table with the columns
    (1gram, year, occ). Rows are assumed to be grouped by word and, within a
    word, sorted by ascending year -- TODO confirm against the data source.

    A word is kept when all of the following hold:
      * its text is unchanged by a latin-1 -> utf-8 round trip (the file is
        read as latin-1, so in practice this keeps pure-ASCII words only);
      * its years are strictly consecutive (each row is the previous year + 1);
      * its first year is <= ``start_year`` and its last year is ``end_year``.

    Parameters
    ----------
    infile : str
        Path of the tab-separated input file.
    outfile : str or None
        If truthy, one comma-separated row per kept word is *appended* to this
        file, holding the counts for ``start_year`` .. ``end_year``.
    dictfile : str or None
        If truthy, the kept words are *appended* to this file, one per line,
        in the same order as the rows written to ``outfile``.
    start_year, end_year : int, optional
        Inclusive year window written to ``outfile`` (defaults: 1800, 2008).

    Returns
    -------
    None
        Results are written to ``outfile`` / ``dictfile``. The first and last
        word of the input are printed as a progress aid.
    """
    n_years = end_year - start_year + 1

    # quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary text, and
    # the converters keep every token as given (no NaN inference on words).
    raw = pd.read_csv(infile, delimiter='\t', names=["1gram", "year", "occ"],
                      converters={"1gram": str, "year": int, "occ": int},
                      engine='c', quoting=3, encoding='latin-1')
    if raw.empty:
        return

    words = raw["1gram"].to_numpy()
    years = raw["year"].to_numpy()

    # A word starts on row 0 and on every row whose word differs from the row
    # above. Row 0 is forced to be a start so that a file whose first and
    # last word coincide (e.g. a single-word file) is still split correctly.
    is_start = np.concatenate(([True], words[1:] != words[:-1]))
    split_points = np.flatnonzero(is_start)[1:]

    all_words = words[is_start]
    print(all_words[0])
    print(all_words[-1])

    # Keep the per-word pieces in a plain list: they have different lengths,
    # so they cannot be packed into one rectangular array.
    years_each_word = np.split(years, split_points)

    # only use data from words without "wacky" characters and that have
    # appeared continuously over the whole requested window
    usable_inds = []
    for index, word_years in enumerate(years_each_word):
        word = all_words[index]
        try:
            is_clean = word == word.encode("latin-1").decode("utf-8")
        except UnicodeDecodeError:
            print(f'WARNING - unicode decode error, ignoring word {index}...')
            continue
        if not is_clean:
            continue
        if not (np.diff(word_years) == 1).all():
            continue
        if word_years[0] <= start_year and word_years[-1] == end_year:
            usable_inds.append(index)

    if outfile:
        counts_each_word = np.split(raw["occ"].to_numpy(), split_points)

        # axes (word, year). The years of a kept word are consecutive and end
        # at end_year, so its *last* n_years counts are exactly the window
        # start_year..end_year, even when the word first appears earlier.
        rows = [counts_each_word[index][-n_years:] for index in usable_inds]

        # write counts (append, so successive input files accumulate)
        with open(outfile, 'ab') as f:
            if rows:
                np.savetxt(f, np.vstack(rows), fmt='%u', delimiter=',')

    if dictfile:
        # write words to ordered "dictionary" (append, same order as counts)
        with open(dictfile, 'a') as f:
            f.writelines(f"{all_words[index]}\n" for index in usable_inds)

In [4]:
# Combined outputs for all ten input shards.
outfile = "../ngram_data/full_counts_1800_contonly.csv"
dictfile = "../ngram_data/py_dict_1800_contonly.txt"

# reformat() appends to both outputs, so start from empty files; otherwise
# re-running this cell would duplicate every row and every word.
for path in (outfile, dictfile):
    with open(path, 'w'):
        pass

for i in range(10):
    name = f"../ngram_data/googlebooks-eng-1M-1gram-20090715-occonly{i}.csv"
    reformat(name, outfile, dictfile)
    print(i)  # progress: index of the shard just processed

#
ï¬ow
0
$
ï¬ourished
1
$0.007
ï¼
2
$0.00
ï¼
3
$0.002
ï¬y
4
$0.0
ï¬ying
5
$0.0005
ï¬ight
6
$0
ï¬xing
7
"""ê"
ï¬oating
8
!
ï¬uid
9
