## Importing libraries

In [None]:
# Importing libraries.
import html
import os
import re
import string
import unicodedata
from collections import Counter

import nltk
import pandas as pd
import requests
from ebooklib import epub
from lingua import Language, LanguageDetectorBuilder
from textblob import TextBlob
from tqdm import tqdm

In [None]:
# Fetch the NLTK resources this notebook relies on.
# 'punkt' is not referenced directly in this notebook; presumably TextBlob needs it — TODO confirm.
nltk.download('punkt')
# Model used by the nltk part-of-speech tagging calls further down.
nltk.download('averaged_perceptron_tagger')
# Tag descriptions printed by nltk.help.upenn_tagset further down.
nltk.download('tagsets')

## Global variables

In [None]:
# Location of the EPUB file to analyse.
directory = 'data'
file_name = 'Hesse, Herman - Siddhartha, eine indische Dichting.epub'
# Build the path from `directory` so that changing the folder above
# actually takes effect (it was previously hard-coded a second time).
file_path = os.path.join(directory, file_name)

## Import data

In [None]:
# Read the EPUB at file_path with ebooklib; `book` is passed to
# create_unique_words_list below.
book = epub.read_epub(file_path)

## Create a list of all the unique words in the book

In [None]:
def create_unique_words_list(book):
    """Extract all words of an EPUB book plus the list of distinct words.

    The HTML documents of the book are concatenated, markup is removed,
    HTML entities are decoded, punctuation is deleted and the remaining
    text is split on whitespace. Case is preserved. Both counts are
    printed.

    Args:
        book: Object returned by ``epub.read_epub``; only its
            ``get_items()`` method is used.

    Returns:
        A ``(word_list, unique_words)`` tuple. ``word_list`` holds every
        word in reading order; ``unique_words`` holds each distinct word
        once, in arbitrary (set) order.
    """
    # Gather the markup of every HTML document in the book.
    markup = ''.join(
        item.get_content().decode('utf-8')
        for item in book.get_items()
        if isinstance(item, epub.EpubHtml)
    )

    # Drop whole <head>/<style>/<script> elements: their contents (titles,
    # CSS, scripts) are not book text. The lookbehind skips self-closing
    # tags so that they cannot swallow text up to a later closing tag.
    cleaned_text = re.sub(
        r'<(head|style|script)\b[^>]*(?<!/)>.*?</\1\s*>',
        ' ',
        markup,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Block-level tags separate words, so turn them into a space; otherwise
    # "</p><p>" would glue the last and first word of two paragraphs.
    cleaned_text = re.sub(
        r'</?(?:p|div|br|hr|li|tr|td|th|h[1-6]|blockquote)\b[^>]*>',
        ' ',
        cleaned_text,
        flags=re.IGNORECASE,
    )

    # Remaining (inline) tags are removed without a separator so that a word
    # that is only partly wrapped in a tag stays in one piece. `[^>]*` also
    # matches tags that span several lines.
    cleaned_text = re.sub(r'<[^>]*>', '', cleaned_text)

    # Decode entities such as &nbsp; or &auml; before punctuation is
    # stripped; otherwise "&nbsp;" would end up as the word "nbsp".
    cleaned_text = html.unescape(cleaned_text)

    # Delete ASCII punctuation and every Unicode punctuation character
    # (typographic quotes, guillemets, dashes, ellipsis, ...).
    unwanted = ''.join(
        char
        for char in set(cleaned_text)
        if char in string.punctuation
        or unicodedata.category(char).startswith('P')
    )
    cleaned_text = cleaned_text.translate(str.maketrans('', '', unwanted))

    # Turn it into a list.
    word_list = cleaned_text.split()
    print(f'The book contains {len(word_list)} words.')

    # Get unique words.
    unique_words = list(set(word_list))
    print(f'There are {len(unique_words)} unique words.')

    return word_list, unique_words

In [None]:
# Extract every word of the book and the list of distinct words.
word_list, unique_words = create_unique_words_list(book)

In [None]:
# Show the distinct words as the cell output.
unique_words

## Delete words that are not German

In [None]:
def detect_languages(word_list):
    """Keep only the words that the language detector rates as German.

    A lingua detector restricted to English and German scores every word.
    A word is kept when its German confidence is at least 0.5; that
    threshold would need adjusting if more than two languages were
    configured. The number of words before and after filtering is printed.

    Args:
        word_list: Sized iterable of word strings.

    Returns:
        List of the words rated as German, in input order.
    """
    candidate_languages = (Language.ENGLISH, Language.GERMAN)
    detector = LanguageDetectorBuilder.from_languages(*candidate_languages).build()

    def is_german(candidate):
        # True when the detector's German confidence reaches the threshold.
        scores = detector.compute_language_confidence_values(candidate)
        return any(
            lang.name == 'GERMAN' and score >= 0.5
            for lang, score in scores
        )

    german_words = [word for word in word_list if is_german(word)]

    print(f'There were {len(word_list)} unique words present in the book.')
    print(f'After language detection, there are {len(german_words)} German words left.')

    return german_words

In [None]:
# Keep only the words rated as German; this overwrites unique_words.
# This might take some time.
unique_words = detect_languages(unique_words)

In [None]:
# Show the remaining (German) words as the cell output.
unique_words

## Print the most frequently occurring words in the book

In [None]:
def print_most_common_words(word_list, number_of_elements):
    """Print the most frequent words of ``word_list`` with their counts.

    Args:
        word_list: Iterable of words; duplicates are what gets counted.
        number_of_elements: How many of the most frequent words to print.

    Returns:
        None. A header line, an empty line and one ``word: count`` line
        per entry are written to standard output.
    """
    # Count every word and keep only the requested number of top entries.
    top_entries = Counter(word_list).most_common(number_of_elements)

    # Header first, then one line per (word, count) pair.
    print(f'The top {number_of_elements} most occurent words are:\n')
    for entry in top_entries:
        print(f'{entry[0]}: {entry[1]}')

In [None]:
# Print the ten most frequent words, counted over all words of the book.
print_most_common_words(word_list, 10)

## Categorize all the words into nouns, verbs, adjectives etc.

In [None]:
def categorize_words(unique_words):
    """Tag every word with a part-of-speech label and tabulate the result.

    Args:
        unique_words: Iterable of word strings.

    Returns:
        A ``(tagged_words, df)`` tuple. ``tagged_words`` is a list of
        ``(word, tag)`` tuples in input order. ``df`` is a DataFrame with
        the columns ``word`` and ``type``, sorted by ``type`` descending
        and then by ``word`` ascending, with a fresh 0..n-1 index.
    """
    # Tag each word as its own one-token sentence. Tagging the whole list
    # in one call treats it as a single sentence, so the tagger uses the
    # neighbouring (unrelated, arbitrarily ordered) words as context and a
    # word's tag can change with the order of the list.
    # NOTE(review): the NLTK tagger defaults to its English model, so tags
    # assigned to German words are only approximate — confirm this is
    # acceptable for the analysis.
    tagged_sentences = nltk.pos_tag_sents([[word] for word in unique_words])
    tagged_words = [pair for sentence in tagged_sentences for pair in sentence]

    # Convert the tagged words into a pandas dataframe
    df = pd.DataFrame(tagged_words, columns=['word', 'type'])

    # Sort the dataframe first by the type of word and then alphabetically by the word
    df = df.sort_values(by=['type', 'word'], ascending=[False, True])

    # Reset the index of the dataframe
    df = df.reset_index(drop=True)

    # Return both the tagged words and the sorted dataframe
    return tagged_words, df

In [None]:
# Tag the remaining words; keep both the raw (word, tag) list and the
# sorted dataframe, then show the raw list as the cell output.
tagged_words, df = categorize_words(unique_words)
tagged_words

## Print how many words there are in each category

In [None]:
def number_of_words_in_category(tagged_words):
    """Count how many words carry each tag and print a summary.

    Args:
        tagged_words: Sized iterable of ``(word, tag)`` pairs.

    Returns:
        A dict mapping each tag to the number of words carrying it,
        ordered from the most to the least frequent tag. Tags with equal
        counts keep the order in which they were first seen.
    """
    # Count the tags; the words themselves are irrelevant here.
    tag_counts = Counter(tag for _, tag in tagged_words)

    # most_common() sorts by count in descending order and keeps first-seen
    # order for ties; dict() preserves that order.
    frequency_count = dict(tag_counts.most_common())

    # Print the total number of unique words in the book and the count of each category
    print(f'In total there are {len(tagged_words)} unique words present in the book. Of these there are: \n')
    for tag, count in frequency_count.items():
        print(tag, count)

    return frequency_count

In [None]:
# Count the words per tag and print the summary.
frequency_count = number_of_words_in_category(tagged_words)

## Print category meanings

In [None]:
# Print the description of every tag that occurs in frequency_count.
# Only the keys are needed, so iterate the dict directly.
for tag in frequency_count:
    nltk.help.upenn_tagset(tag)

## Translate the German words into Dutch

In [None]:
# Keep only the rows tagged 'NNP' (the Penn Treebank tag for singular
# proper nouns) and show the first rows.
# NOTE(review): the tags come from NLTK's default (English) tagger, so
# treating NNP rows as "German nouns" presumably relies on German nouns
# being capitalised — confirm.
german_nouns = df[df.type == 'NNP'].reset_index(drop=True)
german_nouns.head()

In [None]:
def translate_words(df, source_language='de', target_language='nl'):
    """Translate the ``word`` column of ``df`` and add the result as a column.

    Translation is best effort: a word whose translation fails gets an
    empty string. If any word failed, the number of failures is printed.

    Args:
        df: DataFrame with a ``word`` column of strings.
        source_language: Language code of the words (default ``'de'``).
        target_language: Language code to translate into (default ``'nl'``).

    Returns:
        A new DataFrame with the columns of ``df`` plus ``translation``,
        with a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    translated_words = []
    failed = 0

    # Loop through each word in the 'word' column, using tqdm to display a progress bar
    for word in tqdm(df['word'], desc='Translating words'):
        # Create a TextBlob object from the current word
        blob = TextBlob(word)

        # NOTE(review): newer TextBlob releases reportedly no longer provide
        # translate(); in that case every word fails here — verify the
        # installed version.
        try:
            translated_word = blob.translate(from_lang=source_language, to=target_language)
        except Exception:
            # Best effort: keep going with an empty translation. `Exception`
            # (not a bare except) so that Ctrl-C still stops the loop.
            translated_word = ''
            failed += 1

        # Convert the translated word to a string and add it to the list
        translated_words.append(str(translated_word))

    # Make failures visible instead of dropping them silently.
    if failed:
        print(f'{failed} of {len(translated_words)} words could not be translated.')

    # Add the translations as a new column and reset the index
    return df.assign(translation=translated_words).reset_index(drop=True)

In [None]:
# Translate the selected words with the default language pair of
# translate_words ('de' -> 'nl') and show the result as the cell output.
full_df = translate_words(german_nouns)
full_df

In [None]:
# Write the translated table to hesse.csv in the working directory.
# NOTE(review): with pandas' default index=True the row index is written
# as an extra, unnamed first column — confirm that this is wanted.
full_df.to_csv('hesse.csv')