# Download and prepare NGram Data

**Note:**<br>To use Google's fast internet connection, upload this notebook to Google Colab and run the cell below to connect Google Drive.

In [1]:
import googleapiclient
import pydrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
from os import chdir

# Directory that becomes the working directory for the rest of the notebook.
# 'PATH' is a placeholder and must be replaced before running
# (presumably a folder below the Drive mount point /content/gdrive -- confirm).
TARGET_DIR = 'PATH'

# Authenticate the Colab user first; the application-default credentials
# fetched below are then handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Drive API client; download_extract uses it to list and delete trashed files.
my_drive = GoogleDrive(gauth)
# Mount Google Drive into the Colab file system.
drive.mount('/content/gdrive')
# All relative paths used later (downloads, checkpoints) resolve against TARGET_DIR.
chdir(TARGET_DIR)

Mounted at /content/gdrive


## Load packages

In [2]:
from tqdm.notebook import tqdm # for ipynb
import urllib.request
import pandas as pd
import gzip
import os
import pickle
import re
from bs4 import BeautifulSoup
from datetime import datetime
import gc
import multiprocessing as mp

## Define helper functions

In [3]:
def save_obj(obj, name):
    """Serialize ``obj`` with pickle and write it to the file path ``name``.

    The file is opened in binary write mode (an existing file is
    overwritten) and the highest available pickle protocol is used.
    """
    with open(name, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
 
 
def load_obj(name):
    """Read the file path ``name`` and return the object pickled inside it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that were produced by a trusted writer (e.g. ``save_obj``).
    """
    with open(name, 'rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored

## Set language and words to extract

### Define Pattern to extract

In [4]:
import re  # NOTE: `re` is already imported in the "Load packages" cell; this repeat is harmless
# Pattern for the ngrams to keep: the literal text "wellbeing_noun=>" followed
# by one or more word characters and "_adj". It is written in lower case
# because download_extract lower-cases every ngram before searching.
# (Presumably this selects adjectives attached to the noun "wellbeing" in the
# dependency ngrams -- confirm against the corpus documentation.)
p_target = re.compile(r'wellbeing_noun=>\w+_adj')

## Get NGram Links

Collect the download links of the n-gram files.<br>
<b>Set the language (subcorpus) in the cell below.</b>

In [8]:
# Set the subcorpus here (single place): 'eng', 'eng-fiction' or 'eng-us'.
SUBCORPUS = 'eng'
NGRAM_BASE_URL = f'http://storage.googleapis.com/books/ngrams/books/20200217/{SUBCORPUS}'

# URL of the HTML page that lists the links to all gzip files
list_url = f'{NGRAM_BASE_URL}/{SUBCORPUS}-0-ngrams_exports.html'
# Pattern for the links to the gzip files to download.
# NOTE: this matches exactly one shard (0-00841-of-00857.gz); widen the pattern
# to fetch more files. The shard count in the name presumably differs between
# subcorpora -- check the listing page when changing SUBCORPUS.
p_gzfile = re.compile(re.escape(f'{NGRAM_BASE_URL}/0-00841-of-00857.gz'))

# Download the listing page; the context manager closes the HTTP response.
with urllib.request.urlopen(list_url) as response:
    soup = BeautifulSoup(response.read(), 'html.parser')

# Keep (link text, url) pairs of the matching gzip files. `href=True` skips
# anchors without an href attribute, which would otherwise raise a KeyError.
links = [(a.text, a['href']) for a in soup.find_all('a', href=True) if p_gzfile.search(a['href'])]

# Show the first few matches and the total number of files to process
print('\n'.join([' - '.join(link) for link in links[:5]]))
print(len(links))

weather_VERB=>gap - http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00841-of-00857.gz
1


# Download and Extraction

## Function

In [9]:
def _extract_matching_rows(archive_path):
    """Return [ngram, year, count] rows for all lines of a gzip file whose ngram matches ``p_target``."""
    rows = []
    with gzip.open(archive_path, mode='rb') as infile:
        # Iterate lazily instead of readlines(): the decompressed file never
        # has to fit into memory as a whole.
        for raw_line in infile:
            fields = raw_line.decode('utf-8').split('\t')
            ngram = fields[0].lower()
            if not p_target.search(ngram):
                continue
            # Every remaining tab-separated field is comma-separated; only its
            # first two values are kept and stored as year and count.
            for year_field in fields[1:]:
                rows.append([ngram, *map(int, year_field.split(',')[:2])])
    return rows


def _purge_drive_trash(process_count, ngramfile):
    """Delete every file the Drive API lists as trashed; retry once per file, then give up silently."""
    viable_errors = (FileNotFoundError, googleapiclient.errors.HttpError,
                     pydrive.files.ApiRequestError, pydrive.settings.InvalidConfigError)
    # Careful: removes ALL files in the trash, not only the one just processed
    # (a narrower query would be f"title = '{ngramfile}' and trashed=true").
    for trashed_file in my_drive.ListFile({'q': "trashed=true"}).GetList():
        for attempt in range(2):
            try:
                trashed_file.Delete()
                break
            except viable_errors:
                # Report the first failure only; a failed retry is ignored on
                # purpose (best effort -- presumably another worker already
                # deleted the same file).
                if attempt == 0:
                    print(f"Error in Removing file {ngramfile} in Process {process_count}")


def download_extract(process_count, links):
    """
    Download the given ngram files and extract the rows matching ``p_target``.

    For each link the gzip file is downloaded into the current working
    directory, scanned, and deleted again. Matching rows are accumulated in a
    DataFrame with the columns ``ngram``, ``year`` and ``count``.

    Progress is checkpointed after every file so an interrupted run can be
    resumed by calling the function again with the same arguments:

    * ``ngram_checkpoint_Process{process_count}.csv`` -- all rows found so far
    * ``completed_files_Process{process_count}.pkl`` -- names of finished files

    Relies on the module-level names ``p_target``, ``my_drive``, ``load_obj``
    and ``save_obj``.

    Args:
        process_count: Identifier of this worker; used in log messages and in
            the checkpoint file names.
        links: Sequence of entries whose element ``[1]`` is the download URL
            (element ``[0]`` is not used).

    Returns:
        None. Results are written to the checkpoint CSV.

    Side effects:
        After every processed file, all files in the Google Drive trash are
        deleted.
    """
    print(f'Process {process_count} started...')

    checkpoint_csv = f'ngram_checkpoint_Process{process_count}.csv'
    completed_pkl = f'completed_files_Process{process_count}.pkl'
    amt_files_to_dl = len(links)

    # Load the csv with ngram counts if it already exists, otherwise start empty
    if os.path.isfile(checkpoint_csv):
        ngram_df = pd.read_csv(checkpoint_csv)
    else:
        ngram_df = pd.DataFrame({
            'ngram': pd.Series([], dtype='str'),
            'year': pd.Series([], dtype='int'),
            'count': pd.Series([], dtype='int')
        })

    # Names of the files that were fully processed by an earlier run
    if os.path.isfile(completed_pkl):
        completed_files = load_obj(completed_pkl)
    else:
        completed_files = []

    for file_no, entry in enumerate(links, start=1):
        link = entry[1]
        ngramfile = link.split('/')[-1]

        if ngramfile in completed_files:
            continue

        linkstart = datetime.now()
        print(f'Process {process_count}: File: "{ngramfile}" (No. {file_no}/{amt_files_to_dl})')

        try:
            urllib.request.urlretrieve(link, ngramfile)
            relevant_entries = _extract_matching_rows(ngramfile)
        finally:
            # Remove the (possibly partial) download even if the download or
            # the parsing failed, so no stale archive is left behind.
            if os.path.isfile(ngramfile):
                os.remove(ngramfile)

        # Only concatenate when something was found: an empty frame built from
        # no records has object-typed columns and would degrade the dtypes.
        if relevant_entries:
            new_ngrams = pd.DataFrame.from_records(relevant_entries, columns=["ngram", "year", "count"])
            ngram_df = pd.concat([ngram_df, new_ngrams])
        ngram_df.to_csv(checkpoint_csv, index=False)
        print(f'Process {process_count}: {ngramfile} finished in {datetime.now() - linkstart}')

        # Mark the file as done only after its rows are safely in the csv
        completed_files.append(ngramfile)
        save_obj(completed_files, completed_pkl)
        gc.collect()

        _purge_drive_trash(process_count, ngramfile)

## Run Download and extraction

In [10]:
# Number of worker processes. The slicing below decides which worker handles
# which file, and download_extract keeps its checkpoints per worker number, so
# keep this value (and the slicing) unchanged when resuming an interrupted run.
NUM_PROCESSES = 6

# Split the links into NUM_PROCESSES contiguous slices; the last slice also
# takes the remainder of the integer division.
share, rest = divmod(len(links), NUM_PROCESSES)
ngram_shares = []
for i in range(0, NUM_PROCESSES-1):
    ngram_shares.append(links[share*i:share*(i+1)])
ngram_shares.append(links[-(share+rest):])

# One process per slice; worker numbers start at 1
processes = []
for i, share in enumerate(ngram_shares):
    processes.append(mp.Process(target=download_extract, args=(i+1, share, )))

for process in processes:
    process.start()

for process in processes:
    process.join()

# After join() the exit code is set; anything other than 0 means the worker
# ended with an uncaught exception or was killed, so its slice is incomplete.
failed_workers = [number for number, process in enumerate(processes, start=1) if process.exitcode != 0]
if failed_workers:
    print(f'WARNING: worker process(es) {failed_workers} exited abnormally; '
          're-run this cell to resume from their checkpoints')

print('_'*30 + '\n\nDONE')

Process 1 started...
Process 2 started...
Process 3 started...
Process 4 started...
Process 5 started...
Process 6 started...
Process 6: File: "0-00841-of-00857.gz" (No. 1/1)
Process 6: 0-00841-of-00857.gz finished in 0:00:41.397126
______________________________

DONE
