In [1]:
import numpy as np
import pandas as pd
import os, sys
import glob

from collections import Counter
import matplotlib.pyplot as plt

In [2]:
# Both sibling repositories are expected two directory levels above the
# notebook's current working directory.
repos = os.path.join(os.getcwd(), *([os.pardir] * 2))
gutenberg_repo_path, gutenberg_analysis_repo = (
    os.path.join(repos, repo_name)
    for repo_name in ('gutenberg', 'gutenberg-analysis')
)

In [3]:
## import internal helper functions
# The helper modules are imported straight from each repository's `src`
# folder, so that folder is appended to sys.path right before the imports
# that follow it. Keep this append-then-import order.
# NOTE(review): presumably data_io lives in gutenberg-analysis/src and
# metaquery / jsd live in gutenberg/src — confirm against the repositories.
analysis_src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(analysis_src_dir)
from data_io import get_book

gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query
from jsd import jsdalpha

# Paths etc. you should set

In [4]:
# Selects the data source: True reads from the static database location,
# False reads from the local checkout of the gutenberg repo.
USE_STATIC_DB = True

In [5]:
# Location of the static database. Set the GUTENBERG_STATIC_DATABASE
# environment variable to point at your own copy; the previously hard-coded
# location is kept as the fallback so existing setups keep working unchanged.
GUTENBERG_STATIC_DATABASE = os.environ.get(
    'GUTENBERG_STATIC_DATABASE',
    '/Users/dean/Documents/GradSchool/TheoryOfMachineLearning/gutenberg_static_database',
)

## You probably don't need to change this

In [6]:
# gutenberg_info_fold is the directory that plays the role of the gutenberg
# repo's top level. When using the static dataset, lay it out with the same
# structure (metadata/metadata.csv is read from it just below).
#
# The flag is tested by truthiness, matching the `if not USE_STATIC_DB` check
# used further down, so both checks always agree on which branch is active.
if USE_STATIC_DB:
    gutenberg_info_fold = GUTENBERG_STATIC_DATABASE
    # Forwarded to meta_query(filter_exist=...) when the metadata is loaded.
    filter_exist = False
else:
    gutenberg_info_fold = gutenberg_repo_path
    filter_exist = True

metadata_filepath = os.path.join(gutenberg_info_fold, 'metadata', 'metadata.csv')

# Load the Metadata

In [7]:
# Build the metadata query helper from the CSV chosen above and preview
# the first rows of its underlying DataFrame.
mq = meta_query(filter_exist=filter_exist, path=metadata_filepath)

mq.df.head(n=5)



Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,language_set
0,PG0,,,,,,,set(),Text,
1,PG1,The Declaration of Independence of the United ...,"Jefferson, Thomas",1743.0,1826.0,['en'],604.0,"{'United States -- History -- Revolution, 1775...",Text,{en}
2,PG2,The United States Bill of Rights: The Ten Orig...,United States,,,['en'],158.0,"{'Civil rights -- United States -- Sources', '...",Text,{en}
3,PG3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,['en'],28.0,{'Presidents -- United States -- Inaugural add...,Text,{en}
4,PG4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",1809.0,1865.0,['en'],55.0,{'Consecration of cemeteries -- Pennsylvania -...,Text,{en}


## Let's add line counts

In [8]:
# Line counts are only added when working from the gutenberg repo checkout;
# this step is skipped when the static database is used.
if not USE_STATIC_DB:
    mq.add_line_count()

# Distribution of Languages

In [9]:
# Note: Built-in method doesn't properly handle multiple languages
# NOTE(review): the totals it returns (e.g. 'en': 47120) are higher than the
# number of books whose language set is exactly {en} (46972), so books with
# several languages seem to be counted once per language — confirm what the
# note above refers to.
mq.get_lang_counts()

Counter({'en': 47120,
         'fr': 2892,
         'fi': 1903,
         'de': 1680,
         'nl': 787,
         'it': 724,
         'es': 601,
         'pt': 550,
         'zh': 441,
         'el': 220,
         'sv': 186,
         'hu': 183,
         'eo': 118,
         'la': 116,
         'da': 68,
         'tl': 60,
         'ca': 33,
         'pl': 31,
         'ja': 22,
         'no': 19,
         'cy': 12,
         'cs': 10,
         'ru': 9,
         'is': 7,
         'fur': 7,
         'bg': 6,
         'he': 6,
         'enm': 6,
         'te': 6,
         'ang': 4,
         'sr': 4,
         'af': 4,
         'nai': 3,
         'nah': 3,
         'ilo': 3,
         'ceb': 3,
         'grc': 3,
         'ro': 2,
         'myn': 2,
         'ga': 2,
         'fy': 2,
         'mi': 2,
         'nav': 2,
         'arp': 2,
         'gla': 2,
         'brx': 2,
         'ko': 1,
         'sa': 1,
         'ale': 1,
         'yi': 1,
         'lt': 1,
         'kha': 1,
        

## Let's see the different language combinations

In [10]:
# One row per distinct language set: the set itself, how many languages it
# contains, and how many books carry exactly that set.
language_set_counts = mq.df['language_set'].value_counts()
lang_info = [
    (language_set, len(language_set), n_books)
    for language_set, n_books in language_set_counts.items()
]
lang_df = pd.DataFrame(lang_info, columns=['Languages', 'Num Languages', 'Num Books'])
lang_df

Unnamed: 0,Languages,Num Languages,Num Books
0,{en},1,46972
1,{fr},1,2864
2,{fi},1,1903
3,{de},1,1644
4,{nl},1,782
...,...,...,...
102,"{mi, en}",2,1
103,{et},1,1
104,"{de, ang}",2,1
105,{fa},1,1


In [11]:
# lang_df has one row per distinct language set, so this counts how many
# language combinations exist for each set size — not how many books.
lang_df['Num Languages'].value_counts()

Num Languages
1    56
2    49
3     2
Name: count, dtype: int64

## Let's look only at books with more than one language

In [12]:
# Keep only the language sets that contain at least two languages.
# Note: this rebinds lang_df, so the cells below see the multi-language table.
language_set_counts = mq.df['language_set'].value_counts()
more_than_one_lang = [
    (language_set, len(language_set), n_books)
    for language_set, n_books in language_set_counts.items()
    if len(language_set) >= 2
]
lang_df = pd.DataFrame(more_than_one_lang, columns=['Languages', 'Num Languages', 'Num Books'])
lang_df.head()

Unnamed: 0,Languages,Num Languages,Num Books
0,"{de, en}",2,29
1,"{la, en}",2,23
2,"{eo, en}",2,19
3,"{es, en}",2,16
4,"{en, fr}",2,16


In [13]:
# Language combinations ranked by number of books, most common first.
lang_df[['Languages', 'Num Books']].sort_values('Num Books', ascending=False)

Unnamed: 0,Languages,Num Books
0,"{de, en}",29
1,"{la, en}",23
2,"{eo, en}",19
3,"{es, en}",16
4,"{en, fr}",16
5,"{zh, en}",7
6,"{enm, en}",3
7,"{en, cy}",3
8,"{ang, en}",3
9,"{it, en}",3


Which language combinations contain more than two languages?

In [14]:
# Rows whose language set holds three or more languages.
lang_df.loc[lang_df['Num Languages'].gt(2)]

Unnamed: 0,Languages,Num Languages,Num Books
15,"{es, en, fr}",3,2
17,"{es, tl, en}",3,2


## Let's look only at books with one language

In [15]:
# Language sets with exactly one language, paired with their book counts.
language_set_counts = mq.df['language_set'].value_counts()
one_lang = [
    (language_set, n_books)
    for language_set, n_books in language_set_counts.items()
    if len(language_set) == 1
]
one_lang_df = pd.DataFrame(one_lang, columns=['language', 'numBooks'])
one_lang_df.head(10)

Unnamed: 0,language,numBooks
0,{en},46972
1,{fr},2864
2,{fi},1903
3,{de},1644
4,{nl},782
5,{it},720
6,{es},577
7,{pt},548
8,{zh},434
9,{el},216


# Let's Make Graphs

Note: This needs to be adapted to handle multiple language sets

In [16]:
# (language set, number of books) pairs for every distinct language set,
# in the order produced by value_counts().
lang_info = list(mq.df['language_set'].value_counts().items())
lang_info

[({'en'}, 46972),
 ({'fr'}, 2864),
 ({'fi'}, 1903),
 ({'de'}, 1644),
 ({'nl'}, 782),
 ({'it'}, 720),
 ({'es'}, 577),
 ({'pt'}, 548),
 ({'zh'}, 434),
 ({'el'}, 216),
 ({'sv'}, 185),
 ({'hu'}, 182),
 ({'eo'}, 97),
 ({'la'}, 86),
 ({'da'}, 67),
 ({'tl'}, 57),
 ({'ca'}, 32),
 ({'pl'}, 29),
 ({'de', 'en'}, 29),
 ({'en', 'la'}, 23),
 ({'ja'}, 22),
 ({'no'}, 19),
 ({'en', 'eo'}, 19),
 ({'en', 'es'}, 16),
 ({'en', 'fr'}, 16),
 ({'cy'}, 9),
 ({'ru'}, 9),
 ({'cs'}, 9),
 ({'en', 'zh'}, 7),
 ({'is'}, 7),
 ({'fur'}, 6),
 ({'he'}, 6),
 ({'te'}, 6),
 ({'bg'}, 6),
 ({'sr'}, 4),
 ({'af'}, 4),
 ({'en', 'enm'}, 3),
 ({'cy', 'en'}, 3),
 ({'ang', 'en'}, 3),
 ({'en', 'it'}, 3),
 ({'en', 'grc'}, 3),
 ({'enm'}, 3),
 ({'en', 'nah'}, 2),
 ({'gla'}, 2),
 ({'el', 'la'}, 2),
 ({'fr', 'nl'}, 2),
 ({'fr', 'la'}, 2),
 ({'en', 'es', 'fr'}, 2),
 ({'arp'}, 2),
 ({'ilo'}, 2),
 ({'brx', 'en'}, 2),
 ({'en', 'es', 'tl'}, 2),
 ({'fy'}, 2),
 ({'ro'}, 2),
 ({'ga'}, 2),
 ({'la', 'nl'}, 2),
 ({'ceb'}, 2),
 ({'en', 'nav'}, 2),
 (

In [17]:
# Per-language book counts; the cells below rely on its most_common() ordering.
lang_count = mq.get_lang_counts()

In [18]:
# Split the per-language counts into two aligned arrays ordered from the most
# to the least common language: arr_l holds the language codes, arr_n the
# matching book counts.
ranked_langs = lang_count.most_common()
arr_n = np.array([n_books for _, n_books in ranked_langs])
arr_l = np.array([lang_code for lang_code, _ in ranked_langs])

In [19]:
# Full per-language ranking, most common first (the same order as arr_n / arr_l).
lang_count.most_common()

[('en', 47120),
 ('fr', 2892),
 ('fi', 1903),
 ('de', 1680),
 ('nl', 787),
 ('it', 724),
 ('es', 601),
 ('pt', 550),
 ('zh', 441),
 ('el', 220),
 ('sv', 186),
 ('hu', 183),
 ('eo', 118),
 ('la', 116),
 ('da', 68),
 ('tl', 60),
 ('ca', 33),
 ('pl', 31),
 ('ja', 22),
 ('no', 19),
 ('cy', 12),
 ('cs', 10),
 ('ru', 9),
 ('is', 7),
 ('fur', 7),
 ('bg', 6),
 ('he', 6),
 ('enm', 6),
 ('te', 6),
 ('ang', 4),
 ('sr', 4),
 ('af', 4),
 ('nai', 3),
 ('nah', 3),
 ('ilo', 3),
 ('ceb', 3),
 ('grc', 3),
 ('ro', 2),
 ('myn', 2),
 ('ga', 2),
 ('fy', 2),
 ('mi', 2),
 ('nav', 2),
 ('arp', 2),
 ('gla', 2),
 ('brx', 2),
 ('ko', 1),
 ('sa', 1),
 ('ale', 1),
 ('yi', 1),
 ('lt', 1),
 ('kha', 1),
 ('ia', 1),
 ('kld', 1),
 ('oc', 1),
 ('nap', 1),
 ('gl', 1),
 ('br', 1),
 ('iu', 1),
 ('bgs', 1),
 ('csb', 1),
 ('rmr', 1),
 ('sl', 1),
 ('oji', 1),
 ('ar', 1),
 ('et', 1),
 ('fa', 1)]

In [20]:
###########
## Setup ##
###########
# Figure geometry is derived from the LaTeX column width so that the saved
# figure fits the document, then pushed into matplotlib's rcParams together
# with the font sizes and axis styling used by every plot below.

# number of pt for column in latex-document
fig_width_pt = 510  # single-column:510, double-column: 246; Get this from LaTeX using \showthe\columnwidth
# NOTE(review): 1/72.27 is the pt -> inch factor; the extra 1.1 makes the figure
# 10% wider than the column — presumably deliberate, confirm.
inches_per_pt = 1.1/72.27 # Convert pt to inches
# (sqrt(5)-1)/2 ~= 0.618, i.e. golden-ratio proportions for the figure.
width_vs_height = (np.sqrt(5)-1.0)/2.0 # Ratio of height/width [(np.sqrt(5)-1.0)/2.0]
fig_width = fig_width_pt*inches_per_pt  # width in inches
fig_height = width_vs_height*fig_width  # height in inches
fig_size = [fig_width,fig_height]

# here you can set the parameters of the plot (fontsizes,...) in pt
# NOTE(review): the 'backend' entry requests the non-interactive PostScript
# backend; whether it takes effect depends on whether a backend has already
# been selected in this kernel — confirm this is intended in a notebook.
params = {'backend': 'ps',
          'axes.titlesize':16,
          'axes.labelsize': 14,
#          'text.fontsize': 12,
          'legend.fontsize': 12,
#           'figtext.fontsize': 12,
          'xtick.labelsize': 12,
          'ytick.labelsize': 12,
          
 #         'text.usetex': True,
 #         'ps.usedistiller' : 'xpdf',
          'figure.figsize': fig_size,
#          'text.latex.unicode':True,
#          'text.latex.preamble': [r'\usepackage{bm}'],
          
          # ticks point outwards, away from the plotting area
          'xtick.direction':'out',
          'ytick.direction':'out',
          
          # hide the right and top axes spines
          'axes.spines.right' : False,
          'axes.spines.top' : False
         }
plt.rcParams.update(params)

In [23]:
## Bar chart of book counts for the most common languages only
## (the remaining languages could be shown in an inset).

# Number of top-ranked languages to plot. In the ranking shown above, the first
# 22 languages are the ones with at least 10 books; use len(arr_n) to plot all.
indmax = 22

f, ax = plt.subplots()

x = np.arange(indmax)
y, z = arr_n[:indmax], arr_l[:indmax]
ax.bar(x, y, width=0.5, label='No. books')

ax.set_xlabel(r'Language')
ax.set_title('Book Count by Language')
ax.set_xticks(x)
ax.set_xticklabels(z, rotation=45, ha='right')

# The counts span several orders of magnitude, hence the log-scaled y axis.
ax.set_ylim(1, 10**5)
ax.set_yscale('log')

# Only the target path is prepared here; the file is written in a separate cell.
fname_save = 'basicstats-language.png'
path_save = os.path.join(gutenberg_info_fold, 'figures')
os.makedirs(path_save, exist_ok=True)
filename = os.path.join(path_save, fname_save)

In [24]:
# Save through the figure object created in the plotting cell instead of
# plt.savefig(): pyplot saves whatever the "current" figure is, and with the
# inline notebook backend the figure from the previous cell is closed once
# that cell finishes, so plt.savefig() here can write an empty image.
f.savefig(filename)

# Looking at Token Count
Skipped for now: some books are missing from the static dataset.

In [None]:
def _count_tokens_in_file(counts_path):
    """Return the summed token count stored in one `<id>_counts.txt` file.

    The second whitespace-separated field of every line is read as an integer
    and the values are added up. Files with at most one line contribute 0,
    which keeps the `len(x) > 1` guard of the earlier inline version.
    NOTE(review): that guard also drops files holding a single valid line —
    confirm this is intended.
    """
    with open(counts_path) as counts_file:
        count_lines = counts_file.readlines()
    if len(count_lines) <= 1:
        return 0
    return sum(int(line.split()[1]) for line in count_lines)


# Directory holding one `<id>_counts.txt` file per book.
path_text = os.path.abspath(os.path.join(gutenberg_info_fold,'data','counts'))

# list_n_tokens[i] is the total token count over all books of language arr_l[i].
list_n_tokens = []
# Number of (language, book) lookups whose counts file does not exist; a book
# is looked up once for every language it is returned for.
missing_books = 0
for lang_code in arr_l:
    print(lang_code)
    n_tokens = 0
    # Start from the unfiltered metadata before narrowing to one language.
    mq.reset()
    mq.filter_lang(lang_code)
    for pg_id in mq.get_ids():
        # Local names are chosen so the notebook-level `f` (the figure) and
        # `filename` (the figure path) are not overwritten by this cell.
        counts_path = os.path.join(path_text, '%s_counts.txt' % (pg_id))
        if not os.path.exists(counts_path):
            missing_books += 1
            continue
        n_tokens += _count_tokens_in_file(counts_path)
    list_n_tokens.append(n_tokens)

# Undo the last language filter so later cells see the full metadata again.
mq.reset()


In [25]:
# Add graphing when have the full dataset