In [1]:
##########################################################################
# Author: Christopher Thomas Goodwin
# Creation Date: 2024.05.16
# Summary: Uses sklearn to explore the data of the NSHWE Stimmungs- und Lageberichte files
#          using TF-IDF and then K-Means clustering
##########################################################################

In [7]:
#### Get all file names

from pathlib import Path

files = [] # holds file paths of all text files

for file in Path("../../data/text").glob("*.txt"):
    files.append(file.parent / file.name)

n_files = len(files)
print(n_files) # 750
print(files[0]) # should have full path

750
../../data/text/1943.02.04 - Meldungen aus dem Reich Cleaned.txt


In [10]:
#### convert all text files into strings
docs = [] # holds string version of each file (i.e. each text file converted into one string and stored here

for file in files:
    with open(file) as f:
        contents = f.read()
    docs.append(contents)
    f.close()

In [17]:
#### run TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

german_stop_words = stopwords.words('german')
additional_stop_words = ["volk", "volksgemeinschaft", "1939", "1940", "1941", "1942", "1943", "1944", "1945", "deutsch", "bevölkerung", "ii", "iii", "iv", "v", "vi", "einzelmeldungen", "volksgenossen", "sei", "seien", "worden", "meldungen", "deutsche", "deutschen", "wegen", "wurde", "gif", "pro", "kg", "minusbox", "images", "rm"]

for i in range(0, 1946):
    additional_stop_words.append(str(i))

german_stop_words.extend(additional_stop_words)

vectorizer = TfidfVectorizer(max_df=.90, min_df=5, stop_words=german_stop_words, use_idf=True, norm=None)
transformed_documents = vectorizer.fit_transform(docs)
transformed_documents_array = transformed_documents.toarray()
print(len(transformed_documents_array)) # should match the number above to make sure we transformed all documents



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cgoodwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


750


In [25]:
#### output TF-IDF scores
import pandas as pd

Path("data/tf-idf_output").mkdir(parents=True, exist_ok=True) # make directory if it doesn't already exist

output_filenames = [str(file).replace(".txt", ".csv").replace("../../data/text", "data/tf-idf_output") for file in files]

for counter, doc in enumerate(transformed_documents_array):
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    
    one_doc_as_df.to_csv(output_filenames[counter])


data/tf-idf_output/1942.09.25 - Meldungen aus dem Reich Cleaned.csv
