# Download and prepare NGram Data

**Note:**<br>To use Google's fast internet connection, upload the notebook to Google Colab and run the cell below to connect Google Drive.

In [1]:
import googleapiclient
import pydrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
from os import chdir

# Working directory for every relative path used in this notebook.
# 'PATH' is a placeholder and has to be replaced before running the cell
# (presumably with a folder below the /content/gdrive mount point — adjust to your setup).
TARGET_DIR = 'PATH'

# Authenticate the Colab user and hand the default application credentials to PyDrive
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# PyDrive client; download_extract() uses it to list and delete the files in the Drive trash
my_drive = GoogleDrive(gauth)
# Make Google Drive available in the file system at /content/gdrive
drive.mount('/content/gdrive')
# From here on all relative paths are resolved against TARGET_DIR
chdir(TARGET_DIR)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load packages

In [2]:
import gzip
import os
import pickle
import re
import gc
import urllib.request
import numpy as np
import pandas as pd
import multiprocessing as mp
from tqdm.notebook import tqdm # for ipynb
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Any

## Define helper functions

In [3]:
def save_obj(obj: Any, name: str) -> None:
    """
    Serialize ``obj`` with pickle and write it to the file ``name``.

    The highest available pickle protocol is used; an existing file is overwritten.
    """
    with open(name, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name: str) -> Any:
    """
    Read the file ``name`` and return the object that was pickled into it.

    Unpickling can execute arbitrary code, so only load files you created yourself.
    """
    with open(name, mode='rb') as pickle_file:
        restored: Any = pickle.load(pickle_file)
    return restored

## Set language and words to extract

### Load words to extract

In [4]:
# Yearly counts of the noun/adjective dependency ngrams; the adjectives in this
# table are the words that get extracted in the rest of the notebook
target_with_count: pd.DataFrame = pd.read_csv(
    './data/ngram_dependencies_wellbeing_separated.csv'
)
target_with_count

Unnamed: 0,ngram,noun,adjective,year,count
0,wellbeing_noun=>gross_adj,wellbeing,gross,1916,3
1,wellbeing_noun=>gross_adj,wellbeing,gross,1971,4
2,wellbeing_noun=>gross_adj,wellbeing,gross,1972,3
3,wellbeing_noun=>gross_adj,wellbeing,gross,1973,4
4,wellbeing_noun=>gross_adj,wellbeing,gross,1980,4
...,...,...,...,...,...
26436,wellbeing_noun=>worse_adj,wellbeing,worse,2015,5
26437,wellbeing_noun=>worse_adj,wellbeing,worse,2016,1
26438,wellbeing_noun=>worse_adj,wellbeing,worse,2017,3
26439,wellbeing_noun=>worse_adj,wellbeing,worse,2018,11


In [5]:
# Unique adjectives in order of first appearance; only the labels are needed
targets: list[str] = target_with_count['adjective'].unique().tolist()
print(targets[:10], len(targets), sep='\n')

['gross', 'ongoing', 'hedonic', 'ecological', 'western', 'critical', 'health', 'somatic', 'marvelous', 'hygienic']
437


#### As Ngrams

In [6]:
# One regular expression per target: the target as first adjective, any word as second adjective
ngrams: list[str] = [r'{}_adj=>\w+_adj'.format(adjective) for adjective in targets]
print(ngrams[:10])

['gross_adj=>\\w+_adj', 'ongoing_adj=>\\w+_adj', 'hedonic_adj=>\\w+_adj', 'ecological_adj=>\\w+_adj', 'western_adj=>\\w+_adj', 'critical_adj=>\\w+_adj', 'health_adj=>\\w+_adj', 'somatic_adj=>\\w+_adj', 'marvelous_adj=>\\w+_adj', 'hygienic_adj=>\\w+_adj']


## Get NGram Links

Get Links<br>
<b>Set Language Here</b>

In [7]:
# Set the subcorpus here: 'eng', 'eng-fiction' or 'eng-us'.
# The index url and the link pattern are both derived from it, so it only has to be changed once.
CORPUS: str = 'eng'
NGRAM_BASE_URL: str = f'http://storage.googleapis.com/books/ngrams/books/20200217/{CORPUS}/'

links_url: str = f'{NGRAM_BASE_URL}{CORPUS}-0-ngrams_exports.html'  # url to list with links
# re.escape keeps the dots (and the dash of e.g. 'eng-fiction') literal inside the pattern
p_gzfile: re.Pattern = re.compile(re.escape(NGRAM_BASE_URL) + r'.+\.gz')  # pattern for links to gzip files

# Download the index page; the context manager closes the connection once the page is read
with urllib.request.urlopen(links_url) as response:
    soup: BeautifulSoup = BeautifulSoup(response.read(), 'html.parser')

# Collect (starting ngram of the file, link to the gzip file).
# href=True skips anchors without a href attribute instead of failing with a KeyError.
links: list[tuple[str, str]] = [
    (a.text, a['href'])
    for a in soup.find_all('a', href=True)
    if p_gzfile.search(a['href'])
]

print('\n'.join([' - '.join(link) for link in links[:5]]))
print(len(links))

!=>! - http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00000-of-00857.gz
$=>28,051 - http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00001-of-00857.gz
%=>Elective - http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00002-of-00857.gz
's=>Noodt - http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00003-of-00857.gz
(=>Jorre - http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00004-of-00857.gz
857


Assign ngrams to links

In [8]:
# Maps the link of a gzip file to the (lower-case) targets that have to be searched in it.
# The mapping is cached in assigned_links.pkl and the cache is NOT invalidated automatically:
# delete the file after changing the targets or the subcorpus.
if os.path.isfile('assigned_links.pkl'):
    assigned: dict[str, set[str]] = load_obj('assigned_links.pkl')
else:
    assigned = {}

    # links holds (starting ngram of the file, url) tuples in the order of the index page
    link_labels = [label for label, _ in links]
    link_links = [url for _, url in links]

    target: str
    for target in tqdm(targets):
        # The ngram might be all caps or capitalized. A set is used because the
        # variants can coincide (e.g. if the target is already capitalized).
        for mutated_target in {target, target.upper(), target.capitalize()}:
            # The variant lives in the file in front of the first file whose
            # starting ngram is already greater than the variant.
            # NOTE(review): only this one file is assigned per variant; entries that
            # continue into the following file would be missed — confirm this is acceptable.
            for i, link_label in enumerate(link_labels):
                if link_label > mutated_target:
                    # max() keeps i == 0 from wrapping around to the last file
                    link: str = link_links[max(i - 1, 0)]
                    break
            else:
                # no starting ngram is greater, so the variant can only be in the last file
                link = link_links[-1]
            # always store the lower-case target, as ngrams are checked against str.lower()
            assigned.setdefault(link, set()).add(target)

    save_obj(assigned, 'assigned_links.pkl')

first_key: str = list(assigned.keys())[0]
print(first_key)
print(assigned[first_key])

http://storage.googleapis.com/books/ngrams/books/20200217/eng/0-00554-of-00857.gz
{'greatest', 'greater', 'gross', 'great'}


# Download and Extraction

## Function

In [9]:
def download_extract(process_count: int, assigned_links: dict[str, set[str]]):
    """
    Download the assigned ngram files and extract the yearly counts of matching ngrams.

    Each file is downloaded into the working directory, streamed line by line and
    deleted again. A line is kept if its lower-cased ngram matches
    ``<target>_adj=><word>_adj`` for one of the targets assigned to that file; it
    contributes one row (ngram, year, count) per year entry.

    Progress is checkpointed after every file so that an interrupted run can resume:
    ``ngram_checkpoint_Process<process_count>.csv`` holds all rows collected so far and
    ``completed_files_Process<process_count>.pkl`` the names of the finished files.

    Side effect: after every file ALL files in the trash of the connected Google Drive
    (global ``my_drive``) are deleted.

    Args:
        process_count: Number of the worker; used in log messages and checkpoint file names.
        assigned_links: Maps the link of a gzip file to the lower-case targets to search in it.

    Returns:
        None. The results are written to the checkpoint files.
    """

    print(f'Process {process_count} started...')

    checkpoint_csv: str = f'ngram_checkpoint_Process{process_count}.csv'
    completed_pkl: str = f'completed_files_Process{process_count}.pkl'
    amt_files_to_dl: int = len(assigned_links)

    # load csv with ngram counts if it already exists, otherwise create empty
    if os.path.isfile(checkpoint_csv):
        ngram_df: pd.DataFrame = pd.read_csv(checkpoint_csv)
    else:
        ngram_df = pd.DataFrame({
            'ngram': pd.Series([], dtype='str'),
            'year': pd.Series([], dtype='int'),
            'count': pd.Series([], dtype='int')
        })

    if os.path.isfile(completed_pkl):
        completed_files: list[str] = load_obj(completed_pkl)
    else:
        completed_files = []

    # Loop over links; file_no is the position in the assignment, skipped files included
    for file_no, (link, targets) in enumerate(assigned_links.items(), start=1):
        ngramfile: str = link.split('/')[-1]
        if ngramfile in completed_files:
            continue

        # Targets are matched literally, so regex metacharacters in them are escaped.
        # NOTE(review): search() is not anchored, a target can also match as the
        # suffix of a longer first word — confirm this is intended.
        target_summarized: str = '(' + '|'.join(re.escape(target) for target in targets) + ')'
        p_target: re.Pattern = re.compile(fr'{target_summarized}_adj=>\w+_adj')
        linkstart: datetime = datetime.now()

        print(f'Process {process_count}: File: "{ngramfile}" (No. {file_no}/{amt_files_to_dl})')

        relevant_entries: list[list[str | int]] = []
        try:
            urllib.request.urlretrieve(link, ngramfile)
            with gzip.open(ngramfile, mode='rb') as infile:
                # iterate lazily: readlines() would hold the whole decompressed file in memory
                for line in infile:
                    splitline: list[str] = line.decode('utf-8').split('\t')
                    ngram: str = splitline[0].lower()
                    if p_target.search(ngram):  # None converts to false, match to true
                        # every further field is a comma separated year entry, of which
                        # the first two values are stored as year and count
                        for year in splitline[1:]:
                            relevant_entries.append([ngram, *map(int, year.split(',')[:2])])
        finally:
            # remove the downloaded file, also if the download or the parsing failed
            if os.path.isfile(ngramfile):
                os.remove(ngramfile)

        new_ngrams = pd.DataFrame.from_records(relevant_entries, columns=["ngram", "year", "count"])
        ngram_df = pd.concat([ngram_df, new_ngrams], ignore_index=True)
        ngram_df.to_csv(checkpoint_csv, index=False)
        print(f'Process {process_count}: {ngramfile} finished in {datetime.now() - linkstart}')

        completed_files.append(ngramfile)
        save_obj(completed_files, completed_pkl)
        gc.collect()

        # Careful: Removes all files in trash (f"title = '{ngramfile}' and trashed=true")
        # The clean-up is best effort: a Drive error must not abort the extraction.
        viable_errors = (FileNotFoundError, googleapiclient.errors.HttpError, pydrive.files.ApiRequestError, pydrive.settings.InvalidConfigError)
        try:
            trashed_files = my_drive.ListFile({'q': "trashed=true"}).GetList()
        except viable_errors as e:
            print(f'Process {process_count}: could not list the Drive trash ({e})')
            trashed_files = []
        for file_to_remove in trashed_files:
            try:
                file_to_remove.Delete()
            except viable_errors:
                print(f"Error in Removing file {ngramfile} in Process {process_count}")
                # retry once, then give up on this trash entry
                try:
                    file_to_remove.Delete()
                except viable_errors as e:
                    print(f'Process {process_count}: giving up on a trashed file ({e})')

## Run Download and extraction

In [12]:
NUM_PROCESSES = 4

# Split the sorted links into NUM_PROCESSES consecutive shares of n_shares links each;
# the last share additionally takes the remainder of the division
n_shares: int
rest: int
assigned_list: list[str] = sorted(assigned)
n_shares, rest = divmod(len(assigned_list), NUM_PROCESSES)
ngram_shares: list[list[str]] = [
    assigned_list[n_shares * share_no:n_shares * (share_no + 1)]
    for share_no in range(NUM_PROCESSES - 1)
]
ngram_shares.append(assigned_list[n_shares * (NUM_PROCESSES - 1):])

# One worker per share; the workers are numbered starting with 1
processes = []
for process_num, share in enumerate(ngram_shares):
    share_links = set(share)
    process_assignment = {
        link: link_targets
        for link, link_targets in assigned.items()
        if link in share_links
    }
    worker = mp.Process(target=download_extract, args=(process_num + 1, process_assignment))
    processes.append(worker)

# Start all workers first, then wait until every one of them has finished
for process in processes:
    process.start()

for process in processes:
    process.join()

print('_' * 30 + '\n\nDONE')

Process 1 started...
Process 2 started...
Process 3 started...
Process 4 started...
______________________________

DONE


# Join Process Checkpoints

In [11]:
# Collect the checkpoint files of all processes; sorted so that the row order of
# the joined file does not depend on the arbitrary order of os.listdir()
files = sorted(f for f in os.listdir() if 'ngram_checkpoint' in f)
if not files:
    raise FileNotFoundError('No ngram_checkpoint files found in the working directory')

# DataFrame.append was removed in pandas 2.0: read every checkpoint and concatenate once
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

df.to_csv('./data/ngram_dependencies_adjectives.csv', index=False)

# Then: move checkpoint files