<a href="https://colab.research.google.com/github/BryceRodgers7/CoTorch/blob/main/DuneBookTopics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# STEP 1: download RAW DATA
# begin by downloading the first 3 dune books .txt
import json
import pandas as pd
import requests

# text will usually be ~1 sentence per line, some blank lines, no page numbers or chapter titles
url = 'https://raw.githubusercontent.com/ganesh-k13/shell/master/test_search/www.glozman.com/TextPages/Frank%20Herbert%20-%20Dune.txt'
# Without a timeout requests waits indefinitely, so an unresponsive host would hang this cell.
response = requests.get(url, timeout=30)
# `lines` stays empty when the download fails, so later cells still find the name defined.
lines = []
if response.status_code == 200:
    # Split the text into lines (splitlines only drops the line terminators;
    # leading/trailing spaces inside each line are kept)
    lines = response.text.splitlines()
    print(f'downloaded {len(lines)} lines')
else:
    print('Failed to download file')

print('\n\nFinished STEP 1: Download Data')

downloaded 8608 lines


Finished STEP 1: Download Data


In [30]:
# STEP 2: construct RAW DOCUMENTS
# Several options on what to consider a document for training & testing purposes.. all 3 books together as well as each book on its own. 7100 unique words.
#   First book's TF matrix is either 21 chapters, or ~3000 lines/sentences

# re-combine the lines into chapter-sized documents
def combine_lines_into_chapters(lines, chapter_indexes):
    """Join consecutive lines into one space-separated document per boundary.

    Each entry of ``chapter_indexes`` is a 1-based line number that ends the
    current document (that line is excluded) and starts the next one. The
    first document always starts at line 1, so callers building several
    corpuses drop document 0 to keep them distinct. Lines from the last
    boundary onward are not returned.
    """
    boundaries = list(chapter_indexes)
    # Document k opens at line 1 (hard-coded) for k == 0, otherwise at boundary k-1.
    openings = [1] + boundaries[:-1]
    return [
        " ".join(lines[first - 1:last - 1])
        for first, last in zip(openings, boundaries)
    ]

def print_chapters(chapters):
  """Print each document under a ``Chapter <index>:`` heading, followed by a blank line."""
  for number, text in enumerate(chapters):
      # heading, body and an empty item, newline-separated: same three lines as three prints
      print(f"Chapter {number}:", text, "", sep="\n")


# chapter start lines (manually derived), one list per book
chp_start_idx_bk1 = [11, 184, 310, 422, 571, 635, 729, 888, 1010, 1065, 1201, 1278, 1514, 1601, 1651, 2031, 2604, 2780, 2832, 3043, 3252]
chp_start_idx_bk2 = [3260, 3334, 3539, 3762, 3976, 4226, 4351, 4510, 4606, 4784, 4907, 5101, 5318, 5616, 5765, 6010]
chp_start_idx_bk3 = [6018, 6162, 6279, 6460, 6621, 6743, 7021, 7236, 7396, 7552, 7689, 8055]
# trilogy boundaries: the three per-book lists back to back, as a new list
chp_start_idx_all_bks = [*chp_start_idx_bk1, *chp_start_idx_bk2, *chp_start_idx_bk3]
print(f'chp_start_idx_all_bks is {chp_start_idx_all_bks}')
print(f'chp_start_idx_all_bks length is {len(chp_start_idx_all_bks)}')

# appendix start lines
ap1_start_idx_bka, ap2_start_idx_bka = [8060], [8136]
ap3_start_idx_bka, ap4_start_idx_bka = [8241], [8263]
trm_start_idx_bk1 = [8282]


# create one corpus per book: combine all lines between chapter boundaries into documents
dune_bk1_docs = combine_lines_into_chapters(lines, chp_start_idx_bk1)
dune_bk2_docs = combine_lines_into_chapters(lines, chp_start_idx_bk2)
dune_bk3_docs = combine_lines_into_chapters(lines, chp_start_idx_bk3)

# create corpus for initial dune trilogy
dune_trl_docs = combine_lines_into_chapters(lines, chp_start_idx_all_bks)

# Remove title page docs
print('removing title-page docs...')
# combine_lines_into_chapters always opens its first document at line 1, so
# document 0 of every per-book corpus is whatever precedes that book's first
# chapter boundary; drop it.
dune_bk1_docs = dune_bk1_docs[1:]
dune_bk2_docs = dune_bk2_docs[1:]
dune_bk3_docs = dune_bk3_docs[1:]

# In the trilogy corpus the matching documents sit where each book's boundary
# list begins. Deriving the positions from the list lengths (0, 21 and 37 with
# the current lists) keeps them right if a chapter boundary is added or removed.
title_page_positions = {
    0,
    len(chp_start_idx_bk1),
    len(chp_start_idx_bk1) + len(chp_start_idx_bk2),
}
dune_trl_docs = [
    doc for position, doc in enumerate(dune_trl_docs)
    if position not in title_page_positions
]

# The trilogy corpus must now line up chapter-for-chapter with the three books,
# because later cells compare them by position.
assert len(dune_trl_docs) == len(dune_bk1_docs) + len(dune_bk2_docs) + len(dune_bk3_docs)

# print each chapter/document on its own line
# print_chapters(dune_bk1_docs)
# print_chapters(dune_bk2_docs)
# print_chapters(dune_bk3_docs)
# print_chapters(dune_trl_docs)
print('\n\nFinished STEP 2: Constructing Documents')

chp_start_idx_all_bks is [11, 184, 310, 422, 571, 635, 729, 888, 1010, 1065, 1201, 1278, 1514, 1601, 1651, 2031, 2604, 2780, 2832, 3043, 3252, 3260, 3334, 3539, 3762, 3976, 4226, 4351, 4510, 4606, 4784, 4907, 5101, 5318, 5616, 5765, 6010, 6018, 6162, 6279, 6460, 6621, 6743, 7021, 7236, 7396, 7552, 7689, 8055]
chp_start_idx_all_bks length is 49
removing title-page docs...


Finished STEP 2: Constructing Documents


In [31]:
# STEP 3: sanitize CORPUS
# including the following operations: lowercase, remove punctuation,
# remove stop-words, remove non-alphanumeric, stem, lemmatize, post-process

import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages used below (stop-word list, WordNet)
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Shared text-normalisation helpers: English stop-word set, Porter stemmer, WordNet lemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

fixed_words = 0
# fix_words = {}
pre_fixwords = {'--': ' '}

# Post-processing table: processed token -> replacement token
fix_words = {
    'abruptli': 'abrupt',
    'accident': 'accid',
    'accusingli': 'accus',
    'administr': 'administ',
    'aliaoftheknif': 'alia',
    'amplifi': 'amplif',
    'ancestri': 'ancestr',
    'assumpt': 'assum',
    'caladanin': 'caladan',
    'cinnamonheavi': 'cinnamon',
    'wormsign': 'worm',
}

def pre_fix_words(document):
  """Placeholder hook: currently returns ``document`` unchanged."""
  # do something with pre_fixwords
  return document

def remove_punctuations(document):
  """Return ``document`` with every ``string.punctuation`` character deleted."""
  # A translation table mapping a character to None deletes that character.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return document.translate(deletion_table)

def remove_stopwords(document):
  """Drop every whitespace-separated word found in ``stop_words``; re-join the rest with single spaces."""
  survivors = filter(lambda token: token not in stop_words, document.split())
  return " ".join(survivors)

def remove_special_characters(document):
  """Replace each run of characters outside ``[a-zA-Z0-9]`` with a single space.

  This covers everything the previous three substitutions did: mapping
  non-alphanumerics to spaces, collapsing whitespace runs, and turning a
  double dash into a space. The double-dash step could never match, because
  the first substitution had already removed every dash. Raw-string patterns
  avoid the invalid ``\\s`` escape the old non-raw literal relied on.
  """
  return re.sub(r'[^a-zA-Z0-9]+', ' ', document)

def stem_words(document):
  """Run ``stemmer.stem`` over each whitespace-separated word and re-join with single spaces."""
  stems = map(stemmer.stem, document.split())
  return " ".join(stems)

def lemmatize_words(document):
  """Run ``lemmatizer.lemmatize`` over each whitespace-separated word and re-join with single spaces."""
  lemmas = []
  for token in document.split():
    lemmas.append(lemmatizer.lemmatize(token))
  return " ".join(lemmas)

def fix_words_nuke(document):
  """Swap each word that is a key of ``fix_words`` for its mapped replacement.

  Every word is emitted with a space in front of it, so a non-empty result
  starts with a space; an input without words yields an empty string.
  """
  pieces = [f' {fix_words.get(word, word)}' for word in document.split()]
  return "".join(pieces)

def preprocess_document(document):
    """Normalise one raw document and return the processed text.

    Steps, in order: lowercase, strip punctuation, drop stop words, replace
    remaining non-alphanumerics, stem, lemmatize, then apply the ``fix_words``
    replacements. Prints how many words the final step changed.
    """
    document = document.lower()
    document = remove_punctuations(document)
    document = remove_stopwords(document)
    document = remove_special_characters(document)
    document = stem_words(document)
    document = lemmatize_words(document)
    processed_document = fix_words_nuke(document)

    # The counter used to be initialised to 0 and never updated, so the message
    # always claimed 0 fixes. Count the tokens that actually changed instead.
    # NOTE(review): assumes every fix_words replacement is a single token, so
    # tokens before and after line up one-to-one - revisit if multi-word
    # replacements are ever added.
    fixed_words = sum(
        before != after
        for before, after in zip(document.split(), processed_document.split())
    )
    print(f' .. done! Fixed {fixed_words} words in this document')

    return processed_document

def preprocess_corpus(corpus, name):
    """Preprocess every document of ``corpus`` in order and return the results as a list.

    ``name`` is only used in the progress messages printed for each document
    and once at the end.
    """
    def _announce_and_clean(i, document):
        # progress prefix stays on the same line; preprocess_document finishes it
        print(f'preprocessing corpus {name} document {i+1}..', end = '')
        return preprocess_document(document)

    preprocessed_corpus = [
        _announce_and_clean(i, document) for i, document in enumerate(corpus)
    ]
    print(f'finished preprocessing {name}!\n\n')
    return preprocessed_corpus


# Preprocess the three per-book corpora and then the trilogy corpus, in that order
(dune_bk1_preprocessed,
 dune_bk2_preprocessed,
 dune_bk3_preprocessed,
 dune_trl_preprocessed) = [
    preprocess_corpus(docs, label)
    for docs, label in (
        (dune_bk1_docs, 'dune book 1'),
        (dune_bk2_docs, 'dune book 2'),
        (dune_bk3_docs, 'dune book 3'),
        (dune_trl_docs, 'dune trilogy'),
    )
]

# Print dune book 1 preprocessed corpus
for i, document in enumerate(dune_bk1_preprocessed):
    print(f"Document {i+1}: {document}")

print('\n\nFinished STEP 3: Sanitize Data')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


preprocessing corpus dune book 1 document 1.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 2.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 3.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 4.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 5.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 6.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 7.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 8.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 9.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 10.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 11.. .. done! Fixed 0 words in this document
preprocessing corpus dune book 1 document 12.. .. do

In [33]:
# STEP 4: create TDF MATRIX
import numpy as np


def create_tdf_matrix(preprocessed_corpus):
  """Count term occurrences per document.

  Args:
    preprocessed_corpus: sequence of strings, one per document; terms are
      the whitespace-separated tokens of each string.

  Returns:
    ``[tdf_matrix, vocabulary]`` where ``vocabulary`` is the sorted list of
    distinct terms and ``tdf_matrix`` is an integer array with one row per
    document and one column per vocabulary term.
  """
  tokenized_docs = [doc.split() for doc in preprocessed_corpus]

  # create the vocabulary too
  vocabulary = sorted({term for tokens in tokenized_docs for term in tokens})

  # term -> column lookup; list.index() would rescan the vocabulary for every
  # token, which makes the fill quadratic on a multi-thousand-term vocabulary
  term_to_column = {term: column for column, term in enumerate(vocabulary)}

  # start with empty matrix
  tdf_matrix = np.zeros((len(tokenized_docs), len(vocabulary)), dtype=int)

  # scan corpus, fill matrix
  for doc_idx, tokens in enumerate(tokenized_docs):
      for term in tokens:
          tdf_matrix[doc_idx, term_to_column[term]] += 1

  # Print the tdf matrix
  print(f'dtf matrix is {tdf_matrix.shape[0]} x {tdf_matrix.shape[1]}:')

  # below is OPTIONAL output to view the post-processed corpus
  # highlight possible duplicates, manually verify atomicity of data
  # print('\n\nhighlighting possible duplicates:\n')
  # lastWord = 'abc'
  # dups = 0
  # for word in vocabulary:
  #   if (lastWord in word):
  #     dups+=1
  #     print(f'{lastWord} is a subset of {word}')
  #   lastWord = word

  # print(f'num possible dups is {dups}')
  # # alter the matrix to combine like-terms

  # # print full vocabulary to see terms near the actual duplicates
  # print('\n\n\n********* FULL VOCABULARY *********\n\n\n')
  # i = 0
  # for word in vocabulary:
  #   if i%8==0:
  #     print(f'{word} ')
  #   else:
  #     print(f'{word} ', end='')
  #   i+=1

  # return [matrix, vocabulary]
  return [tdf_matrix, vocabulary]


# Each result is a [matrix, vocabulary] pair; keep the pair and unpack it into its two names.
dune_bk1_tdf_result = create_tdf_matrix(dune_bk1_preprocessed)
dune_bk1_tdf_matrix, dune_bk1_vocab = dune_bk1_tdf_result

dune_bk2_tdf_result = create_tdf_matrix(dune_bk2_preprocessed)
dune_bk2_tdf_matrix, dune_bk2_vocab = dune_bk2_tdf_result

dune_bk3_tdf_result = create_tdf_matrix(dune_bk3_preprocessed)
dune_bk3_tdf_matrix, dune_bk3_vocab = dune_bk3_tdf_result

dune_trl_tdf_result = create_tdf_matrix(dune_trl_preprocessed)
dune_trl_tdf_matrix, dune_trl_vocab = dune_trl_tdf_result

print('Finished Step 4: Create TDF Matrix')

dtf matrix is 20 x 5466:
dtf matrix is 15 x 4878:
dtf matrix is 11 x 3748:
dtf matrix is 46 x 8153:


In [42]:
# STEP 5: create TF-IDF MATRIX
# get top scores for each document

# input is a numpy array where rows represent documents and columns represent terms
def create_tfidf_matrix_from_tdf_matrix(tdf_matrix):
  """Turn a document-by-term count matrix into TF-IDF scores of the same shape.

  TF is each count divided by its row (document) total. IDF is
  ``log(num_documents / document_frequency)`` per column (term), where the
  document frequency is the number of rows with a non-zero count.

  A document with no terms gets an all-zero row, and a term that occurs in no
  document gets an IDF of 0, instead of the NaN/inf a plain division yields.
  """
  # Compute TF matrix (Term Frequency)
  row_totals = np.sum(tdf_matrix, axis=1, keepdims=True)
  # an empty document has only zero counts, so dividing by 1 leaves its row at 0
  row_totals[row_totals == 0] = 1
  tf_matrix = tdf_matrix / row_totals

  # Compute IDF vector (Inverse Document Frequency)
  num_documents = tdf_matrix.shape[0]
  document_frequency = np.count_nonzero(tdf_matrix, axis=0)
  idf_vector = np.zeros(document_frequency.shape, dtype=float)
  term_is_used = document_frequency > 0
  idf_vector[term_is_used] = np.log(num_documents / document_frequency[term_is_used])

  # idf_vector broadcasts across rows: every document uses the same per-term weight
  tfidf_matrix = tf_matrix * idf_vector

  return tfidf_matrix

# get a list of the top-scoring words for each document (list of lists)
def get_top_tfidf_scores(tfidf_matrix, vocabulary, top_n=10):
  """Return, per document row, the terms with the ``top_n`` largest TF-IDF scores.

  Args:
    tfidf_matrix: 2-D numpy array, one row per document, one column per term.
    vocabulary: sequence mapping a column index to its term.
    top_n: how many terms to keep per document (default 10). When a row has
      fewer columns than this, all of its terms are returned.

  Returns:
    A list with one list of terms per document. Terms within a document are
    in the order ``np.argpartition`` yields them, not sorted by score.
    Diagnostic details are printed for every document.
  """
  top_scores = []
  for doc_idx in range(tfidf_matrix.shape[0]):
    doc_tfidf_scores = tfidf_matrix[doc_idx]
    # non-zero term count
    nztc = np.count_nonzero(doc_tfidf_scores)
    print(f'\n\n\ndocument {doc_idx} has {nztc} non-zero terms')

    # Find indices of the largest values. argpartition raises when asked for
    # more elements than the row holds, so cap the request at the row length.
    keep = max(0, min(top_n, doc_tfidf_scores.size))
    if keep == 0:
      largest_indices = np.array([], dtype=int)
    else:
      largest_indices = np.argpartition(doc_tfidf_scores, -keep)[-keep:]
    print(f'indices for largest scores are {largest_indices}')

    # Get the largest values
    largest_values = doc_tfidf_scores[largest_indices]
    print(f'largest scores themselves are ... {largest_values}')

    doc_top_scores = [vocabulary[idx] for idx in largest_indices]
    for term in doc_top_scores:
      print(f'{term}, ', end = '')
    top_scores.append(doc_top_scores)
  return top_scores

def compare_top_scores(scores1, scores2, offset):
  """Print, per document of ``scores1``, how its terms differ from ``scores2[i+offset]``.

  The comparison is set-based (symmetric difference), so ordering and
  duplicates inside a term list do not matter.
  """
  for i, scores in enumerate(scores1):
    counterpart = scores2[i+offset]
    delta = list(set(scores) ^ set(counterpart))
    if not delta:
      print(f'\n *** document {i} has same top scores in both corpuses *** ')
      print(f'they were {counterpart}')
      continue
    print(f'\ndocument {i} has delta {delta}')
    print(f'individual book scores were {scores}')
    print(f'trilogy scores were {counterpart}')

# create a TF-IDF matrix for each book plus the trilogy
# also collect, for each corpus, a list holding the top 10 words of each document
dune_bk1_tfidf_matrix = create_tfidf_matrix_from_tdf_matrix(dune_bk1_tdf_matrix)
dune_bk1_top_scores = get_top_tfidf_scores(dune_bk1_tfidf_matrix, dune_bk1_vocab)

dune_bk2_tfidf_matrix = create_tfidf_matrix_from_tdf_matrix(dune_bk2_tdf_matrix)
dune_bk2_top_scores = get_top_tfidf_scores(dune_bk2_tfidf_matrix, dune_bk2_vocab)

dune_bk3_tfidf_matrix = create_tfidf_matrix_from_tdf_matrix(dune_bk3_tdf_matrix)
dune_bk3_top_scores = get_top_tfidf_scores(dune_bk3_tfidf_matrix, dune_bk3_vocab)

dune_trl_tfidf_matrix = create_tfidf_matrix_from_tdf_matrix(dune_trl_tdf_matrix)
dune_trl_top_scores = get_top_tfidf_scores(dune_trl_tfidf_matrix, dune_trl_vocab)

# The trilogy corpus holds book 1's chapters first, then book 2's, then book 3's,
# so chapter i of a book is trilogy document i + (number of chapters in the earlier books).

# compare top scores from bk1 to the same chapters in the trilogy (offset 0)
print('\n\n\nDune BOOK 1 top scores comparison:')
compare_top_scores(dune_bk1_top_scores, dune_trl_top_scores, 0)

# compare top scores from bk2 to the same chapters in the trilogy (skip book 1's chapters)
print('\n\n\nDune BOOK 2 top scores comparison:')
offset = len(dune_bk1_top_scores)
compare_top_scores(dune_bk2_top_scores, dune_trl_top_scores, offset)


# compare top scores from bk3 to the same chapters in the trilogy (skip book 1's and book 2's chapters)
print('\n\n\ndune book 3 top scores comparison:')
offset += len(dune_bk2_top_scores)
compare_top_scores(dune_bk3_top_scores, dune_trl_top_scores, offset)


print('\n\nFinished STEP 5: Create TF-IDF Matrix\n\n')




document 0 has 933 non-zero terms
indices for largest scores are [3097 3858 5381 3857 2571 2326 3317  513 2038 2559]
largest scores themselves are ... [0.00396086 0.00397131 0.00585273 0.00608881 0.00424426 0.00421569
 0.00633457 0.01116282 0.01980431 0.01980431]
needl, reverend, withdraw, rever, jessica, human, pain, box, gom, jabbar, 


document 1 has 809 non-zero terms
indices for largest scores are [3935 4430  264 3105 2020 5031 1725 1727 3434  329]
largest scores themselves are ... [0.00462217 0.00516209 0.00557384 0.01032418 0.01190308 0.01376557
 0.01548627 0.02512873 0.05823928 0.03256407]
rumbl, spin, attempt, nephew, globe, uncl, feyd, feydrautha, piter, baron, 


document 2 has 553 non-zero terms
indices for largest scores are [5416 4755 4043 5353 5143 2001 3480 2571 1354 3858]
largest scores themselves are ... [0.00597557 0.00656443 0.00777439 0.00777439 0.00777439 0.01392247
 0.00656443 0.0101819  0.00998966 0.00839452]
worth, tear, seagul, willow, usul, girl, poem, jes

In [None]:
# STEP 5 (alternate): create TF-IDF MATRIX from corpus


from sklearn.feature_extraction.text import TfidfVectorizer

# returns [tfidf_matrix, vectorizer]
def createTfidfMatrix(corpus):
  """Fit a fresh ``TfidfVectorizer`` on ``corpus`` and return it together with the transformed matrix."""
  fitted_vectorizer = TfidfVectorizer()
  # fit_transform learns the vocabulary and produces the TF-IDF matrix in one call
  return [fitted_vectorizer.fit_transform(corpus), fitted_vectorizer]

# tfidf_vectorizer[0] = matrix, [1] = vectorizer
def printTopTerms(tfidf_vectorizer, top_n=10):
  """Collect the ``top_n`` highest-scoring feature names for every document.

  Despite the name, nothing is printed; the terms are returned.

  Args:
    tfidf_vectorizer: ``[tfidf_matrix, vectorizer]`` pair as returned by
      ``createTfidfMatrix``. Each matrix row must support ``.toarray()`` and
      the vectorizer must provide ``get_feature_names_out()``.
    top_n: number of terms to keep per document (default 10).

  Returns:
    A list with one list per document, holding its feature names ordered by
    descending TF-IDF score. Ties keep feature-name order.
  """
  tfidf_matrix = tfidf_vectorizer[0]
  vectorizer = tfidf_vectorizer[1]
  corpus_top_scores = []

  # Feature names (words) are the same for every row, so fetch them once
  # instead of once per document.
  feature_names = vectorizer.get_feature_names_out()

  for doc_idx in range(tfidf_matrix.shape[0]):
    doc_tfidf_scores = tfidf_matrix[doc_idx].toarray()[0]

    # Pair each feature with its score and sort by score, highest first
    # (sorted() is stable, so equal scores stay in feature order)
    ranked_features = sorted(
        zip(feature_names, doc_tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    corpus_top_scores.append([feature for feature, _score in ranked_features[:top_n]])
  return corpus_top_scores

def _report_top_score_deltas(book_label, book_scores, trilogy_scores, offset):
  """Print how each chapter's top terms differ between a book corpus and the trilogy corpus.

  Chapter ``i`` of the book is compared with ``trilogy_scores[i+offset]``, and
  that same entry is the one printed (the book 2 and book 3 reports used to
  print ``trilogy_scores[i]``, i.e. a book 1 chapter).
  """
  for i, scores in enumerate(book_scores):
    trilogy_entry = trilogy_scores[i+offset]
    delta = list(set(scores) ^ set(trilogy_entry))
    if (len(delta) > 0):
      print(f'\ndocument {i} has delta {delta}')
      print(f'{book_label} scores were {scores}')
      print(f'trilogy scores were {trilogy_entry}')
    else:
      print(f'\ndocument {i} has same top scores in both corpuses')


# NOTE: this cell reassigns dune_bk1_top_scores / dune_bk2_top_scores /
# dune_bk3_top_scores, replacing the values computed by the hand-rolled STEP 5.

print('creating dune book 1 tfidf matrix..', end='')
# create TF-IDF matrix for Dune book 1 corpus
dune_bk1_tfidf_vectorizer = createTfidfMatrix(dune_bk1_preprocessed)
# collect book 1 top terms per chapter
dune_bk1_top_scores = printTopTerms(dune_bk1_tfidf_vectorizer)
print('complete')

print('creating dune book 2 tfidf matrix..', end='')
# create TF-IDF matrix for Dune book 2 corpus
dune_bk2_tfidf_vectorizer = createTfidfMatrix(dune_bk2_preprocessed)
# collect book 2 top terms per chapter
dune_bk2_top_scores = printTopTerms(dune_bk2_tfidf_vectorizer)
print('complete')

print('creating dune book 3 tfidf matrix..', end='')
# create TF-IDF matrix for Dune book 3 corpus
dune_bk3_tfidf_vectorizer = createTfidfMatrix(dune_bk3_preprocessed)
# collect book 3 top terms per chapter
dune_bk3_top_scores = printTopTerms(dune_bk3_tfidf_vectorizer)
print('complete')

print('creating dune Trilogy tfidf matrix..', end='')
# create TF-IDF matrix for Dune Trilogy corpus
dune_all_bks_tfidf_vectorizer = createTfidfMatrix(dune_trl_preprocessed)
# collect trilogy top terms per chapter
dune_all_bks_top_scores = printTopTerms(dune_all_bks_tfidf_vectorizer)
print('complete')

# The trilogy corpus holds book 1's chapters first, then book 2's, then book 3's,
# so each book is compared against the trilogy starting at a running offset.
print('\n\n\ndune book 1 top scores comparison:')
_report_top_score_deltas('book 1', dune_bk1_top_scores, dune_all_bks_top_scores, 0)

offset = len(dune_bk1_top_scores)
print('\n\n\ndune book 2 top scores comparison:')
_report_top_score_deltas('book 2', dune_bk2_top_scores, dune_all_bks_top_scores, offset)

offset += len(dune_bk2_top_scores)
print('\n\n\ndune book 3 top scores comparison:')
_report_top_score_deltas('book 3', dune_bk3_top_scores, dune_all_bks_top_scores, offset)

print('\n\nFinished STEP 5 (alternate): Create TF-IDF Matrix\n\n')

'''
BOOK 1
ANALYSIS


I have seen the movie, but I have not read the books.
This is my attempt to link well-known scenes in the movie,
to chapters in book 1, by analyzing all 20 chapters' top TF-IDF scores.


document 0 (chapter 1) is clearly the 'gom jabbar' chapter, the top 10 include:
  'gom' 'jabbar' 'paul' 'old' 'woman' 'box'...
  'gom' and 'jabbar' are not in any other top-10

document 3 is clearly the 'shield-training' chapter, its top 10 include:
  'paul' 'halleck' 'rapier' 'gurney' 'table' 'mood' 'lad'...
  'mood' and 'rapier' are not in any other top-10
  only one other chapter has a weapon (word) in its top-10

document 8 is clearly the hunter-seeker attack chapter, its top 10 words include:
  'paul' 'room' 'headboard' 'hunterseek(er)' 'bed' 'seeker' 'oper(ator)'
  'headboard' and 'hunterseek(er)' and 'seeker' never show in any other top-10.

document 15 is the sabotaged spice-harvester chapter, its top 10 words include:
  'kyne(s)' 'duke' 'halleck' 'crawler' 'worm' 'paul' 'sand'
  this is the first chapter where 'worm' or 'crawler' or 'sand' are in the top 10


Nearly every chapter's top-10 included multiple character names along with 'said'
A good take-away is that - like the movie - the book is filled with dialogue and not 'action'

Although spice is vital to understanding Dune, no chapter or scene is 'about' spice.
'spice' and 'melange' are never in the top-10,
this is because it is not a physical 'subject' in any chapter.
In the movie it appears as minor dialogue,
plus one 'filmbook' scene (the most prominent of the three).
  There are 3 'filmbook' scenes in the movie,
  the shortest being ~10 seconds and the longest ~1 minute.

Thus if we look granularly enough,
we should be able to find a document where 'spice' scores high on TF-IDF!

'''

creating dune book 1 tfidf matrix..complete
creating dune book 2 tfidf matrix..creating dune book 3 tfidf matrix..creating dune Trilogy tfidf matrix..


dune book 1 top scores comparison:

document 0 has delta ['hand', 'pain']
book 1 scores were ['gom', 'jabbar', 'paul', 'mother', 'said', 'jessica', 'hand', 'old', 'woman', 'box']
trilogy scores were ['paul', 'said', 'mother', 'gom', 'jabbar', 'pain', 'old', 'jessica', 'box', 'woman']

document 1 has delta ['duke', 'nephew']
book 1 scores were ['piter', 'baron', 'feydrautha', 'said', 'feyd', 'uncl', 'globe', 'mentat', 'nephew', 'know']
trilogy scores were ['piter', 'baron', 'said', 'feydrautha', 'feyd', 'globe', 'mentat', 'uncl', 'duke', 'know']

document 2 has delta ['tell', 'poem']
book 1 scores were ['jessica', 'girl', 'mother', 'reverend', 'paul', 'old', 'dream', 'woman', 'said', 'tell']
trilogy scores were ['jessica', 'girl', 'paul', 'reverend', 'old', 'mother', 'woman', 'dream', 'said', 'poem']

document 3 has same top scores in b

"\nBOOK 1\nANALYSIS\n\n\nI have seen the movie, but I have not read the books.\nThis is my attempt to link well-known scenes in the movie,\nto chapters in book 1, by analyzing all 20 chapters' top TF-IDF scores.\n\n\ndocument 0 (chapter 1) is clearly the 'gom jabbar' chapter, the top 10 include:\n  'gom' 'jabbar' 'paul' 'old' 'woman' 'box'...\n  'gom' and 'jabbar' are not in any other top-10\n\ndocument 3 is clearly the 'shield-training' chapter, its top 10 include:\n  'paul' 'halleck' 'rapier' 'gurney' 'table' 'mood' 'lad'...\n  'mood' and 'rapier' are not in any other top-10\n  only one other chapter has a weapon (word) in its top-10\n\ndocument 8 is clearly the hunter-seeker attack chapter, its top 10 words include:\n  'paul' 'room' 'headboard' 'hunterseek(er)' 'bed' 'seeker' 'oper(ator)'\n  'headboard' and 'hunerseek(er)' and 'seeker' never show in any other top-10.\n\ndocument 15 is the sabotaged spice-harvester chapter, its top 10 words include:\n  'kyne(s)' 'duke' 'halleck' 'cra

In [None]:
# STEP 6: PERFORM KMEANS TOPIC ANALYSIS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dune_bk1_preprocessed)

# K-means clustering
k = 4  # Number of clusters
top_terms_per_cluster = 5  # How many terms to list for each cluster
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'),
# so relying on the default gives different clusters depending on the installed
# version (and a FutureWarning on the releases in between).
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(tfidf_matrix)

# Get the top terms for each cluster: per centroid, term indices sorted by
# descending weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i + 1}:")
    top_terms = [terms[ind] for ind in order_centroids[i, :top_terms_per_cluster]]
    print(", ".join(top_terms))

In [None]:
# HELPER STEP: SAVE MATRIX TO A FILE AND DOWNLOAD

from IPython.display import HTML
import base64

def create_download_link( df, title = "Download CSV file", filename = "data.csv"):
    """Return an ``HTML`` anchor that downloads ``df`` as a CSV file.

    The CSV text from ``df.to_csv()`` is base64-encoded and embedded in a
    ``data:text/csv`` URI; ``title`` is the link text and ``filename`` the
    suggested download name.
    """
    payload = base64.b64encode(df.to_csv().encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=payload,title=title,filename=filename))

# `df` was never defined anywhere in this notebook, so the original call raised
# NameError on a fresh kernel. Build the frame to export explicitly.
# NOTE(review): assumes the book 1 term-document count matrix is the one meant
# by "SAVE MATRIX" - swap in another matrix/vocabulary pair if not.
dune_bk1_tdf_df = pd.DataFrame(dune_bk1_tdf_matrix, columns=dune_bk1_vocab)
create_download_link(dune_bk1_tdf_df, filename="dune_bk1_tdf.csv")