In [None]:
# Remove the stopwords and find out the top 15 words used most in each novel

import re
import os
from collections import Counter

stopwords = set()
#add stopwords file
with open("english_stopwords.txt", encoding = "utf8") as stopword_file:
    for line in stopword_file:
        word = line.strip()
        stopwords.add(word)
# remove stopwords and gengerate words
def count_words(filepath):
    with open(filepath, 'r', encoding='utf8') as file:
        text = file.read().lower()
    words = re.findall(r'\b\w+\b', text)
    filtered_words = []
    for word in words:
        if word not in stopwords and len(word) > 3:
            filtered_words.append(word)
    word_counts = Counter(filtered_words)
    return word_counts
       
for file in os.scandir("plain_text_hesse"):    
    
    if file.is_file():
        word_counts = count_words(file.path)
        print(f"Most common words in {file.name}:")
        for word, count in word_counts.most_common(15):
            print(f"{word}:{count}")

Most common words in beneath_the_wheel.txt:
hans:303
could:139
would:135
time:134
like:100
felt:90
good:86
back:84
father:82
heilner:75
boys:74
even:73
well:73
away:72
home:71
Most common words in demian.txt:
like:156
world:150
time:143
demian:142
could:135
would:125
said:123
know:108
felt:107
something:103
gutenberg:97
mother:97
much:90
life:90
project:88
Most common words in gertrude.txt:
would:174
time:171
could:146
good:120
muoth:119
life:109
like:108
said:108
thought:98
music:96
looked:94
well:93
know:93
felt:90
long:86
Most common words in if_the_war_goes_on.txt:
world:210
people:127
good:112
time:106
life:99
every:84
years:83
great:83
even:79
must:79
would:78
many:77
like:73
friends:72
long:71
Most common words in in_sight_of_chaos.txt:
every:46
dostoevsky:28
world:26
must:23
karamazoff:21
europe:20
thing:18
time:17
karamazoffs:17
people:17
downfall:16
like:16
good:16
even:15
european:14
Most common words in knulp.txt:
knulp:212
said:111
little:83
good:74
like:74
know:66
could:6

In [None]:
#Use TF-IDF find the most weight of important for each novel

import os
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
nlp = spacy.load("en_core_web_sm")

# remove stopwords and gengerate words
stopwords = set()
#add stopwords file
with open("english_stopwords.txt", encoding = "utf8") as stopword_file:
    for line in stopword_file:
        word = line.strip()
        stopwords.add(word)
        
def preprocess_text(filepath):
    with open(filepath, 'r', encoding='utf8') as file:
        text = file.read().lower()
    try:
        doc = nlp(text)
    except ValueError:
        nlp.max_length = len(text) + 1
        doc = nlp(text)
        
    filtered_words = []    
    for token in doc:
        if (token.lemma_ not in stopwords and token.ent_type_ == "" and token.pos_ in ("NOUN", "VERB", "ADJ") and len(token.lemma_) > 3):
            filtered_words.append(token.lemma_)
    return " ".join(filtered_words)  # list of text


corpus = []
doc_names = []
 
for file in os.scandir("plain_text_hesse"):
    if file.is_file() and file.name.endswith(".txt"):
        doc_names.append(file.name)
        preprocessed = preprocess_text(file.path)
        corpus.append(preprocessed)

        

# use TF-IDF vectorization
vectorizer = TfidfVectorizer()
vectorized_corpus = vectorizer.fit_transform(corpus)
document_term_matrix = pd.DataFrame(vectorized_corpus.toarray(), index=doc_names, columns=vectorizer.get_feature_names_out())
# print the top 5 weighted for each doc
for doc_name, doc_row in document_term_matrix.iterrows():
    print(f"Doc [{doc_name}] weights")
    # sort terms and get top 5
    for word, weight in doc_row.sort_values(ascending=False).head(10).items():
        print(word, weight)






## Doc [beneath_the_wheel.txt] weights
heilner 0.27122478631037056
time 0.20255014569068294
come 0.16703810716049827
look 0.15388550029746692
feel 0.15388550029746692
hand 0.14974959079356884
college 0.1336538995228618
good 0.13021080794401046
vicar 0.12697120454671873
make 0.12363450451249478

## Doc [demian.txt] weights
know 0.24113417970768597
time 0.20216299914886804
come 0.19363805340162663
look 0.19242020400916357
world 0.18998450522423743
feel 0.17537031251468072
dream 0.1631918185900501
gutenberg 0.15778785193486727
think 0.1473597764880303
make 0.14370622831064114

## Doc [gertrude.txt] weights
muoth 0.335540131670315
time 0.23991771176578772
look 0.20235483770144722
know 0.18660266470672376
gertrude 0.18130887638751825
think 0.17690901978689397
feel 0.1550983187172769
good 0.15146320187234072
come 0.13328761764765984
life 0.13086420641770238

## Doc [if_the_war_goes_on.txt] weights
world 0.29249643999433605
time 0.2089260285673829
good 0.1880334257106446
people 0.18524774532

In [21]:
# Find out the Top similarity novel by using cosine_similarity
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# remove stopwords and gengerate words
stopwords = set()
#add stopwords file
with open("english_stopwords.txt", encoding = "utf8") as stopword_file:
    for line in stopword_file:
        word = line.strip()
        stopwords.add(word)
        
def preprocess_text(filepath):
    with open(filepath, 'r', encoding='utf8') as file:
        text = file.read().lower()
    try:
        doc = nlp(text)
    except ValueError:
        nlp.max_length = len(text) + 1
        doc = nlp(text)
        
    filtered_words = []    
    for token in doc:
        if (token.lemma_ not in stopwords and token.ent_type_ == "" and token.pos_ in ("NOUN", "VERB", "ADJ") and len(token.lemma_) > 3):
            filtered_words.append(token.lemma_)
    return " ".join(filtered_words)

filenames = []
documents = []
for file in os.scandir("plain_text_hesse"):
    if file.is_file() and file.name.endswith(".txt"):
        filenames.append(file.name)
        preprocessed = preprocess_text(file.path)
        documents.append(preprocessed)
        
        
# get trigrams with TF-IDF weighting       
vectorizer = TfidfVectorizer(ngram_range=(3,3))
tfidf_matrix = vectorizer.fit_transform(documents)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=filenames, columns=vectorizer.get_feature_names_out())

print("TF-IDF trigram matrix finish'")


# cosine similarity measures similarity between document.
cosine_sim = cosine_similarity(tfidf_matrix)

# convert the cosine similarity matrix
cosine_sim_df = pd.DataFrame(cosine_sim, index=filenames, columns=filenames)


#print Top similar documents for each
print("Document similarities:")
for i, filename in enumerate(filenames):
    # Top 3 similar documents
    similar_docs = cosine_sim_df.iloc[i].sort_values(ascending=False)[1:4]  
    print(f"Document: {filename}")
    for doc, score in similar_docs.items():
        print(f"{doc}:{score:.4f}")

TF-IDF trigram matrix finish'
Document similarities:
Document: beneath_the_wheel.txt
knulp.txt:0.0005
the_glass_bead_game.txt:0.0004
demian.txt:0.0004
Document: demian.txt
siddhartha.txt:0.0905
rosshalde.txt:0.0008
steppenwolf.txt:0.0008
Document: gertrude.txt
knulp.txt:0.0009
demian.txt:0.0005
peter_camenzind.txt:0.0005
Document: if_the_war_goes_on.txt
the_glass_bead_game.txt:0.0031
siddhartha.txt:0.0006
knulp.txt:0.0004
Document: in_sight_of_chaos.txt
if_the_war_goes_on.txt:0.0002
peter_camenzind.txt:0.0002
demian.txt:0.0002
Document: knulp.txt
gertrude.txt:0.0009
rosshalde.txt:0.0008
beneath_the_wheel.txt:0.0005
Document: narziss_and_goldmund.txt
the_glass_bead_game.txt:0.0079
steppenwolf.txt:0.0005
rosshalde.txt:0.0005
Document: peter_camenzind.txt
the_glass_bead_game.txt:0.0031
steppenwolf.txt:0.0005
gertrude.txt:0.0005
Document: rosshalde.txt
the_glass_bead_game.txt:0.0038
steppenwolf.txt:0.0009
siddhartha.txt:0.0008
Document: siddhartha.txt
demian.txt:0.0905
the_journey_to_the_e