In [None]:
# Add each document's plain text to the metadata CSV file as a new "text" column
import os
import pandas as pd

# Inputs: the metadata table and the folder holding the plain-text files.
csv_file = "metadata of hesse project.csv"
text_folder = "plain_text_hesse"
df = pd.read_csv(csv_file)

# Read every .txt file into a {filename: content} lookup.
text_data = {}
for file in os.listdir(text_folder):
    if file.endswith(".txt"):  # Ensure it's a text file
        file_path = os.path.join(text_folder, file)
        with open(file_path, "r", encoding="utf-8") as f:
            text_data[file] = f.read().strip()  # Store filename as key and content as value

# Add a new column to the dataframe by matching the filename.
# Rows without a matching file get an empty string directly, instead of going
# through NaN -> "nan" -> "" string juggling (which would also wipe a document
# whose entire content happened to be the literal text "nan").
df["text"] = df["filename"].map(lambda name: text_data.get(name, ""))

# Surface metadata rows that did not match any text file so that silent gaps
# in the corpus are visible to whoever runs the notebook.
missing = df.loc[~df["filename"].isin(list(text_data)), "filename"].tolist()
if missing:
    print(f"Warning: no text file found for {len(missing)} row(s): {missing}")

# Save the updated CSV file
df.to_csv("hesse_file.csv", index=False)
print("Merge complete! Updated file saved as 'hesse_file.csv'.")

In [None]:
# Create a new CSV file containing the metadata plus a column with only the NOUN words
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

def extract_nouns(text):
    """Return the lemmas of all NOUN tokens in ``text`` joined by single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (for example the float NaN that
        pandas produces for an empty CSV cell) and blank strings are treated
        as an empty document.

    Returns
    -------
    str
        Space-separated noun lemmas in document order; "" when there are none.
    """
    # Empty cells round-trip through CSV as NaN; passing that to spaCy raises,
    # so treat anything that is not real text as an empty document.
    if not isinstance(text, str) or not text.strip():
        return ""

    # spaCy refuses texts longer than nlp.max_length. Raise the limit up front
    # rather than catching ValueError, which would also hide unrelated errors.
    if len(text) > nlp.max_length:
        nlp.max_length = len(text) + 1

    doc = nlp(text)  # Process text with spaCy
    return " ".join(token.lemma_ for token in doc if token.pos_ == "NOUN")

# Load the merged metadata + text table written by the previous step.
df = pd.read_csv("hesse_file.csv")

# Empty text cells are read back from CSV as NaN (a float), which the noun
# extractor cannot process; substitute an empty string before applying it.
df['filtered_text'] = df['text'].fillna("").apply(extract_nouns)
df.to_csv('hesse_file_noun.csv', index=False)

print("Processing complete! Filtered text saved in 'hesse_file_noun.csv'.")

Processing complete! Filtered text saved in 'hesse_file_noun.csv'.


In [12]:
#NMF topic model 
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Tunable settings for this cell.
N_TOPICS = 20      # number of NMF components (topics)
N_TOP_WORDS = 15   # how many top-weighted words to print per topic

df = pd.read_csv('hesse_file_noun.csv')

# A document with no extracted nouns is stored as an empty CSV cell and read
# back as NaN, which TfidfVectorizer rejects; treat it as an empty document.
documents = df['filtered_text'].fillna('')

print('Vectorizing the text data', end=' ', flush=True)
# token_pattern keeps only tokens of 4+ word characters, which removes short
# problem words (presumably scanning artifacts - confirm against the corpus).
#
# Rule-of-thumb settings for min_df / max_df:
#   Small datasets (<100 documents):
#     min_df=1 (removes no words) or min_df=2 (removes words that appear in only one document)
#     max_df=0.8 (removes extremely common words)
#   Medium datasets (100-10,000 documents):
#     min_df=0.01 (1%)  -> removes very rare words
#     max_df=0.5 (50%)  -> removes overly common words
#   Large datasets (>10,000 documents):
#     min_df=0.02 (2%) or higher
#     max_df=0.4 (40%) to filter frequent terms
vectorizer = TfidfVectorizer(min_df=1, max_df=0.8, stop_words='english', token_pattern=r'\b\w{4,}\b')
vectorized_data = vectorizer.fit_transform(documents)
print('done', flush=True)

print('Fitting the NMF model', end=' ', flush=True)
# random_state is fixed so the factorization is reproducible across runs.
nmf = NMF(n_components=N_TOPICS, random_state=1)
doc_topic_distribution_nmf = nmf.fit_transform(vectorized_data)
print('done', flush=True)

# One row per topic, one column per vocabulary term; print the heaviest terms.
topic_word_df = pd.DataFrame(nmf.components_, columns=vectorizer.get_feature_names_out())
for topic, topic_row in topic_word_df.iterrows():
    top_15_words = ','.join(topic_row.sort_values(ascending=False).head(N_TOP_WORDS).index)
    print(f'Topic {topic}: {top_15_words}')

# Save the document-topic weights (rows in the same order as the input CSV).
df_topic_distribution = pd.DataFrame(doc_topic_distribution_nmf)
df_topic_distribution.to_csv("document_topic_matrix.csv", index=False)


Vectorizing the text data done
Fitting the NMF model done
Topic 0: student,pupil,conversation,subject,position,week,passion,dignity,history,circle,stage,condition,atmosphere,instruction,harmony
Topic 1: cloister,city,scholar,abbot,monk,horse,journeyman,workshop,knight,maid,forest,today,penance,priest,mund
Topic 2: teaching,ferryman,forest,goal,merchant,grove,monk,boat,donation,city,dear,ferry,offering,oneness,trademark
Topic 3: karamazoff,morality,downfall,culture,instinct,epileptic,mankind,ment,primeval,hysteria,ture,unconscious,dostoevsky,sult,pronouncement
Topic 4: east,servant,ment,goal,tion,leader,official,faith,bench,violin,accuser,judgment,document,archive,hall
Topic 5: elite,player,meditation,monastery,assignment,culture,pupil,festival,today,scholar,hierarchy,herdsman,magister,tutor,petition
Topic 6: vicar,fishing,cider,examination,headache,shoemaker,holiday,week,apple,college,aunt,press,apprentice,pupil,forest
Topic 7: tanner,hospital,roof,week,lady,coat,journeyman,tailor,cide

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Number of nearest neighbours to report for each document.
TOP_N_SIMILAR = 3

doc_topic_matrix = pd.read_csv("document_topic_matrix.csv")
metadata_df = pd.read_csv("hesse_file_noun.csv")

# The topic matrix carries no identifiers, so its rows are paired with the
# metadata purely by position; fail early with a clear message if they differ.
if len(metadata_df) != len(doc_topic_matrix):
    raise ValueError(
        f"Row count mismatch: {len(metadata_df)} metadata rows vs "
        f"{len(doc_topic_matrix)} document-topic rows"
    )

# Build labels from the metadata loaded in this cell, not from a `df` variable
# left over from an earlier cell, so the cell also works on a fresh kernel.
metadata = [
    f"{filename} - {title}"
    for filename, title in zip(metadata_df["filename"], metadata_df["title"])
]

# Compute cosine similarity matrix
print("Computing cosine similarity...")
similarity_matrix = cosine_similarity(doc_topic_matrix)

# Store similarity in a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=metadata, columns=metadata)
n_docs = len(metadata)
for i, filename in enumerate(metadata):
    # Exclude the document itself by position. Skipping the first sorted entry
    # is not reliable: an all-zero topic vector has self-similarity 0, and
    # ties can place another document ahead of the document itself.
    other_positions = [j for j in range(n_docs) if j != i]
    similar_docs = (
        similarity_df.iloc[i]
        .iloc[other_positions]
        .sort_values(ascending=False)
        .head(TOP_N_SIMILAR)
    )
    print(f"\nDocument: {filename}")
    for doc, score in similar_docs.items():
        print(f"{doc}: {score:.4f}")


Computing cosine similarity...

Document: beneath_the_wheel.txt - Beneath  The Wheel
the_glass_bead_game.txt - The Glass Bead Game: 0.0066
demian.txt - Demian: 0.0013
peter_camenzind.txt - Peter Camenzind: 0.0000

Document: demian.txt - Demian
the_glass_bead_game.txt - The Glass Bead Game: 0.0153
beneath_the_wheel.txt - Beneath  The Wheel: 0.0013
peter_camenzind.txt - Peter Camenzind: 0.0000

Document: gertrude.txt - Gertrude
the_glass_bead_game.txt - The Glass Bead Game: 0.0000
demian.txt - Demian: 0.0000
beneath_the_wheel.txt - Beneath  The Wheel: 0.0000

Document: if_the_war_goes_on.txt - If The War Goes On
the_glass_bead_game.txt - The Glass Bead Game: 0.0000
steppenwolf.txt - Steppenwolf: 0.0000
rosshalde.txt - Rosshalde: 0.0000

Document: in_sight_of_chaos.txt - In  Sight of Chaos
the_glass_bead_game.txt - The Glass Bead Game: 0.0000
the_journey_to_the_east.txt - The Journey To The East: 0.0000
beneath_the_wheel.txt - Beneath  The Wheel: 0.0000

Document: knulp.txt - Knulp
peter_

'# Save to CSV for further analysis\nsimilarity_df.to_csv("document_similarity_matrix.csv")\nprint("Cosine similarity analysis complete! Saved as \'document_similarity_matrix.csv\'.")'