## Importing libraries

In [94]:
# Importing libraries.
import os
import requests
import re
import string
import nltk
import pandas as pd

from tqdm import tqdm
from ebooklib import epub
from collections import Counter
from textblob import TextBlob
from lingua import Language, LanguageDetectorBuilder

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\afbru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\afbru\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\afbru\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

## Global variables

In [3]:
# Load data.
directory = 'data' 
file_name = 'Hesse, Herman - Siddhartha, eine indische Dichting.epub'
file_path = os.path.join('data', file_name)

## Import data

In [4]:
# Import the book.
book = epub.read_epub(file_path)



## Create a list of all the unique words in the book

In [5]:
def create_unique_words_list(book):
    # Variable in which the raw text of the book will be stored.
    text = ''
    
    # Loop through book items.
    for item in book.get_items():
        if isinstance(item, epub.EpubHtml): # Get all EpubHTML items out of the book. 
            text += item.get_content().decode('utf-8') # Add it to the text variable.

    # Clean the text of HTML related element.
    cleaned_text = re.sub('<.*?>', '', text)
    
    # Remove all punctuation marks.
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))

    # Turn it into a list.
    word_list = cleaned_text.split()
    print(f'The book contains {len(word_list)} words.')
    
    # Get unique words.
    unique_words = list(set(word_list))
    print(f'There are {len(unique_words)} unique words.')
    
    return word_list, unique_words

In [6]:
word_list, unique_words = create_unique_words_list(book)

The book contains 37418 words.
There are 6415 unique words.


In [7]:
unique_words

['brennendem',
 'flocht',
 'Hörenden',
 'zwingest',
 'holten',
 'Heiliger',
 'Kinde',
 'geplagt',
 'Widerstreit',
 'Feld',
 'zweifelte',
 'ordnete',
 'Ruhig',
 'erbricht',
 'Lebens',
 'gewußte',
 'leichtgläubig',
 'Plötzlich',
 'Vedas',
 'welch',
 'reiner',
 'Schicksal',
 'Öl',
 'solches',
 'daran',
 'Gewinn',
 'Tracht',
 'SamanaJahren',
 'undurchdringliche',
 'langweilig',
 'folgen',
 'schreiten',
 'einzulassen',
 'one',
 'Reiswein',
 'bist',
 'Buddhaschaft',
 'Affenvolk',
 'walks',
 'bereit',
 'Laufen',
 '3',
 'dafür',
 'berichtete',
 'zögern',
 'weiterreisen',
 'regulating',
 'ehrwürdig',
 'certain',
 'vorgezogen',
 'an',
 'eingehe',
 'apply',
 'they',
 'lauten',
 'antreten',
 'Verständnis',
 'our',
 'schlagen',
 'Wesentliche',
 'BRAHMANEN',
 'trüben',
 'klein',
 'OF',
 'Letzten',
 'viele',
 'redet',
 'KamaswamiMenschen',
 'fee',
 'allein',
 'verstrickt',
 'Seufzend',
 'lehrten',
 'rechttun',
 'ziehe',
 'hoch',
 'bewegte',
 'Zahlen',
 'rasch',
 'Verfolgung',
 'hielten',
 'tax',
 'va

## Delete words that are not German

In [8]:
def detect_languages(word_list):
    # Possible languages that are present in the book.
    languages = [Language.ENGLISH, Language.GERMAN]
    detector = LanguageDetectorBuilder.from_languages(*languages).build()

    german_words = []
    
    # Loop through all words.
    for word in word_list:
        # Obtain confidence intervals.
        confidence_values = detector.compute_language_confidence_values(word)
        
        # Loop through confidence intervals.
        for language, value in confidence_values:
            if language.name == 'GERMAN' and value >= 0.5: # Statement should be adjusted if there are more than 2
                # languages present in the book, but this is unlikely.
                german_words.append(word)
        
    print(f'There were {len(word_list)} unique words present in the book.')
    print(f'After language detection, there are {len(german_words)} German words left.')
    
    return german_words

In [9]:
# This might take some time.
unique_words = detect_languages(unique_words)

There were 6415 unique words present in the book.
After language detection, there are 5449 German words left.


In [10]:
unique_words

['brennendem',
 'flocht',
 'Hörenden',
 'zwingest',
 'holten',
 'Heiliger',
 'Kinde',
 'geplagt',
 'Widerstreit',
 'Feld',
 'zweifelte',
 'ordnete',
 'Ruhig',
 'erbricht',
 'Lebens',
 'gewußte',
 'leichtgläubig',
 'Plötzlich',
 'Vedas',
 'welch',
 'reiner',
 'Schicksal',
 'Öl',
 'solches',
 'daran',
 'Gewinn',
 'Tracht',
 'SamanaJahren',
 'undurchdringliche',
 'langweilig',
 'folgen',
 'schreiten',
 'einzulassen',
 'one',
 'Reiswein',
 'bist',
 'Buddhaschaft',
 'Affenvolk',
 'bereit',
 'Laufen',
 'dafür',
 'berichtete',
 'zögern',
 'weiterreisen',
 'ehrwürdig',
 'vorgezogen',
 'eingehe',
 'lauten',
 'antreten',
 'Verständnis',
 'schlagen',
 'Wesentliche',
 'BRAHMANEN',
 'trüben',
 'klein',
 'Letzten',
 'viele',
 'redet',
 'KamaswamiMenschen',
 'allein',
 'verstrickt',
 'Seufzend',
 'lehrten',
 'rechttun',
 'ziehe',
 'hoch',
 'bewegte',
 'Zahlen',
 'rasch',
 'Verfolgung',
 'hielten',
 'lächelnden',
 'Erntezeiten',
 'Dr',
 'rühren',
 'jung',
 'faden',
 'Kleinen',
 'woher',
 'Zuschauer',


## Print the most occuring words in the book

In [11]:
def print_most_common_words(word_list, number_of_elements):
    # Use the Counter method to count the occurrences of each word in the word_list
    word_counts = Counter(word_list)

    # Get the number_of_elements most common words from the word_counts
    most_common_words = word_counts.most_common(number_of_elements)

    # Print the header for the output
    print(f'The top {number_of_elements} most occurent words are:\n')

    # Loop through the most_common_words and print each word and its count
    for word, count in most_common_words:
        print(f'{word}: {count}')

In [12]:
print_most_common_words(word_list, 10)

The top 10 most occurent words are:

und: 1254
er: 810
der: 667
zu: 546
in: 501
die: 479
nicht: 411
den: 385
Siddhartha: 373
war: 369


## Categorize all the words into nouns, verbs, adjectives etc.

In [29]:
def categorize_words(unique_words):
    # Use nltk to categorize each word in the list
    tagged_words = nltk.pos_tag(unique_words)
    
    # Convert the tagged words into a pandas dataframe
    df = pd.DataFrame(tagged_words, columns=['word', 'type'])
    
    # Sort the dataframe first by the type of word and then alphabetically by the word
    df = df.sort_values(by=['type', 'word'], ascending=[False, True])
    
    # Reset the index of the dataframe
    df = df.reset_index(drop=True)
    
    # Return both the tagged words and the sorted dataframe
    return tagged_words, df

In [30]:
tagged_words, df = categorize_words(unique_words)
tagged_words

[('brennendem', 'NN'),
 ('flocht', 'NN'),
 ('Hörenden', 'NNP'),
 ('zwingest', 'NNP'),
 ('holten', 'NN'),
 ('Heiliger', 'NNP'),
 ('Kinde', 'NNP'),
 ('geplagt', 'NN'),
 ('Widerstreit', 'NNP'),
 ('Feld', 'NNP'),
 ('zweifelte', 'NNP'),
 ('ordnete', 'JJ'),
 ('Ruhig', 'NNP'),
 ('erbricht', 'NN'),
 ('Lebens', 'NNP'),
 ('gewußte', 'NN'),
 ('leichtgläubig', 'NN'),
 ('Plötzlich', 'NNP'),
 ('Vedas', 'NNP'),
 ('welch', 'NN'),
 ('reiner', 'NN'),
 ('Schicksal', 'NNP'),
 ('Öl', 'NNP'),
 ('solches', 'VBZ'),
 ('daran', 'JJ'),
 ('Gewinn', 'NNP'),
 ('Tracht', 'NNP'),
 ('SamanaJahren', 'NNP'),
 ('undurchdringliche', 'NN'),
 ('langweilig', 'NN'),
 ('folgen', 'NN'),
 ('schreiten', 'VBD'),
 ('einzulassen', 'CC'),
 ('one', 'CD'),
 ('Reiswein', 'NNP'),
 ('bist', 'NN'),
 ('Buddhaschaft', 'NNP'),
 ('Affenvolk', 'NNP'),
 ('bereit', 'NN'),
 ('Laufen', 'NNP'),
 ('dafür', 'VBZ'),
 ('berichtete', 'JJ'),
 ('zögern', 'JJ'),
 ('weiterreisen', 'NN'),
 ('ehrwürdig', 'NN'),
 ('vorgezogen', 'NN'),
 ('eingehe', 'NN'),
 ('lau

## Print how much words there are in each category

In [15]:
def number_of_words_in_category(tagged_words):
    # Create an empty dictionary to store the frequency count of each category
    frequency_count = {}

    # Loop through each word and its category in tagged_words
    for _, value in tagged_words:
        # Check if the category is already present in the frequency_count dictionary
        if value in frequency_count:
            # If yes, increment its count by 1
            frequency_count[value] += 1
        else:
            # If not, add the category to the dictionary with a count of 1
            frequency_count[value] = 1
            
    # Sort the frequency_count dictionary in descending order based on the count of each category
    frequency_count = {k: v for k, v in sorted(frequency_count.items(), key=lambda item: item[1], reverse=True)}
    
    # Print the total number of unique words in the book and the count of each category
    print(f'In total there are {len(tagged_words)} unique words present in the book. Of these there are: \n')
    for i,j in frequency_count.items():
        print(i,j)
        
    return frequency_count

In [16]:
frequency_count = number_of_words_in_category(tagged_words)

In total there are 5449 unique words present in the book. Of these there are: 

NNP 2388
NN 1739
JJ 456
VBZ 206
VBP 164
VBD 134
NNS 132
FW 47
RB 47
VB 25
JJS 21
VBN 21
CD 14
IN 10
PRP$ 9
MD 8
CC 6
JJR 5
PRP 4
WRB 4
RBR 2
VBG 2
RP 2
WP 1
NNPS 1
DT 1


## Print category meanings

In [17]:
for i,_ in frequency_count.items():
    nltk.help.upenn_tagset(i)

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminat

## Translate the German words into Dutch

In [33]:
# Obtain German nouns 
german_nouns = df[df.type == 'NNP'].reset_index(drop=True)
german_nouns.head()

Unnamed: 0,word,type
0,AM,NNP
1,Aas,NNP
2,Abend,NNP
3,Abendhunger,NNP
4,Abendstunde,NNP


In [97]:
def translate_words(df, source_language='de', target_language='nl'):
    # Initialize an empty list to store the translated words
    translated_words = []
      
    # Loop through each word in the 'word' column of the input DataFrame, using tqdm to display a progress bar
    for word in tqdm(df['word'], desc='Translating words'):
        # Create a TextBlob object from the current word
        blob = TextBlob(word)
        
        # Attempt to translate the word from the source language to the target language
        try:
            translated_word = blob.translate(from_lang=source_language, to=target_language)
        except:
            # If the translation fails, assign an empty string to the translated word
            translated_word = ''
        
        # Convert the translated word to a string and add it to the list of translated words
        translated_words.append(str(translated_word))
      
    # Add a new column to the input DataFrame containing the translated words, and reset the index of the DataFrame
    return df.assign(translation=translated_words).reset_index(drop=True)

In [99]:
full_df = translate_words(german_nouns)
full_df

Translating words: 100%|███████████████████████████████████████████████████████████| 2388/2388 [38:34<00:00,  1.03it/s]


Unnamed: 0,word,type,translation
0,AM,NNP,BIJ DE
1,Aas,NNP,
2,Abend,NNP,Avond
3,Abendhunger,NNP,Hongerig
4,Abendstunde,NNP,Avond
...,...,...,...
2383,üblich,NNP,gewoonlijk
2384,übrig,NNP,overlaten
2385,übrigen,NNP,op de andere
2386,übte,NNP,beoefend


In [100]:
full_df.to_csv('hesse.csv')