<a href="https://colab.research.google.com/github/GVSU-CIS635/term-project-proposal-plato/blob/main/CIS635_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Thematic and Argument Structure Analysis of Plato’s Republic Using Data Mining Techniques

Tanishq Daniel, Trevor Ouma, Nate Miller

## **Where is the .txt file, is it stored locally on your machine or you uploaded it here?**


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [45]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# TODO remove introductory content, stop words and punctiation that are not relevant
def preprocess_text(text_data):
    documents = text_data.split('\n\n')
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    cleaned_texts = []
    for document in documents:
        tokens = word_tokenize(document.lower())
        cleaned_tokens = [word for word in tokens if word not in stop_words and word not in punctuation]
        cleaned_texts.append(' '.join(cleaned_tokens))

    return cleaned_texts

In [57]:
#TODO remove introductory content, stop words and punctiation that are not relevant but data is split by "books"
def preprocess_text_by_book(text_data):
    stop_words = set(stopwords.words('english'))

    document_books = text_data.split("BOOK")
    relevant_books = document_books[12:22]

    books_data = []
    for i, book_content in enumerate(relevant_books[:10], 1):
        book_content = re.sub(r'\b[MCDXLVI]+\b', '', book_content)
        book_content = re.sub(r'[^\w\s]', '', book_content).lower()
        book_data = {
            "book_number": i,
            "content": book_content.strip()
        }
        books_data.append(book_data)

    return books_data

In [58]:
books_data = preprocess_text_by_book(text_data)

for book in books_data:
    print(f"BOOK {book['book_number']} Content Preview:\n{book['content'][:200]}...\n")

BOOK 1 Content Preview:
went down yesterday to the piraeus with glaucon the son of ariston
that  might offer up my prayers to the goddess bendis the thracian
artemis and also because  wanted to see in what manner they would
...

BOOK 2 Content Preview:
with these words  was thinking that  had made an end of the
discussion but the end in truth proved to be only a beginning for
glaucon who is always the most pugnacious of men was dissatisfied at
thras...

BOOK 3 Content Preview:
such then  said are our principles of theologysome tales are to be
told and others are not to be told to our disciples from their youth
upwards if we mean them to honour the gods and their parents and...

BOOK 4 Content Preview:
here adeimantus interposed a question how would you answer socrates
said he if a person were to say that you are making these people
miserable and that they are the cause of their own unhappiness the
...

BOOK 5 Content Preview:
such is the good and true city or state and the good and tru

In [61]:
#Tokenize the words
def token_text_by_book(text_data):
    stop_words = set(stopwords.words('english'))

    document_books = text_data.split("BOOK")
    relevant_books = document_books[12:22]

    books_data_token = []
    for i, book_content in enumerate(relevant_books, 1):
        book_content = re.sub(r'\b[MCDXLVI]+\b', '', book_content)
        book_content = re.sub(r'[^\w\s]', '', book_content).lower()
        tokens = word_tokenize(book_content)
        clean_tokens = [word for word in tokens if word not in stop_words]
        book_data_token = {
            "book_number": i,
            "clean_tokens": clean_tokens
        }
        books_data_token.append(book_data_token)

    return books_data_token

In [62]:
token_data = token_text_by_book(text_data)

for book in token_data:
    print(f"BOOK {book['book_number']} Clean Tokens Preview:\n{book['clean_tokens'][:20]}...\n")

BOOK 1 Clean Tokens Preview:
['went', 'yesterday', 'piraeus', 'glaucon', 'son', 'ariston', 'might', 'offer', 'prayers', 'goddess', 'bendis', 'thracian', 'artemis', 'also', 'wanted', 'see', 'manner', 'would', 'celebrate', 'festival']...

BOOK 2 Clean Tokens Preview:
['words', 'thinking', 'made', 'end', 'discussion', 'end', 'truth', 'proved', 'beginning', 'glaucon', 'always', 'pugnacious', 'men', 'dissatisfied', 'thrasymachus', 'retirement', 'wanted', 'battle', 'said', 'socrates']...

BOOK 3 Clean Tokens Preview:
['said', 'principles', 'theologysome', 'tales', 'told', 'others', 'told', 'disciples', 'youth', 'upwards', 'mean', 'honour', 'gods', 'parents', 'value', 'friendship', 'one', 'another', 'yes', 'think']...

BOOK 4 Clean Tokens Preview:
['adeimantus', 'interposed', 'question', 'would', 'answer', 'socrates', 'said', 'person', 'say', 'making', 'people', 'miserable', 'cause', 'unhappiness', 'city', 'fact', 'belongs', 'none', 'better', 'whereas']...

BOOK 5 Clean Tokens Preview:
['good

In [None]:
# TODO tune hyperparameters, separate topics into themes and subthemes, return topics and paragraphs with high distribution
def extract_themes(text):
  # Create Document-Term Matrix with adjusted min_df and max_df
  vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
  term_matrix = vectorizer.fit_transform(text)

  # Fit LDA
  lda = LatentDirichletAllocation(n_components=5, random_state=42)
  lda.fit(term_matrix)

  # Display Topics
  terms = vectorizer.get_feature_names_out()
  def display_topics(model, feature_names, no_top_words):
      for topic_idx, topic in enumerate(model.components_):
          print(f"Topic {topic_idx}:")
          print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
  display_topics(lda, terms, 5)

  # Document-Topic Distribution
  distributions = lda.transform(term_matrix)
  for idx, distribution in enumerate(distributions):
      print(f"Document {idx} topic distribution: {distribution}")

In [None]:
import spacy
import pandas as pd
import re

nlp = spacy.load("en_core_web_sm")

conclusion_keywords = {"therefore", "thus", "hence", "consequently", "in conclusion"}
premise_keywords = {"because", "since", "for", "as", "insofar as", "inasmuch as"}

def extract_arguments(documents):
    arguments = []

    for doc in documents:
        processed_doc = nlp(doc)

        premises = []
        conclusions = []

        for sent in processed_doc.sents:
            sentence_text = sent.text.lower()

            if any(keyword in sentence_text for keyword in conclusion_keywords):
                conclusions.append(sentence_text)

            elif any(keyword in sentence_text for keyword in premise_keywords):
                premises.append(sentence_text)

        if premises or conclusions:
            arguments.append({
                "text": doc,
                "premises": premises,
                "conclusions": conclusions
            })

    argument_df = pd.DataFrame(arguments)
    print(argument_df.head(10))


In [None]:
def calculate_theme_complexity():
  print('calculate theme complexity')

In [None]:
def calculate_argument_complexity():
  print('calculate argument complexity')

In [None]:
def calculate_complexity():
  print('calculate complexity')

In [None]:
def calculate_correlation():
  print('calculate correlation')

In [None]:
def visualization():
  print('visualization')

In [None]:
def main(text_data):
    # Step 1: Preprocess the text
    documents = preprocess_text(text_data)

    # Step 2: Detect Themes
    extract_themes(documents)

    # Step 3: Detect Arguments
    extract_arguments(documents)

    # Step 4: Calculate Theme Complexity
    calculate_theme_complexity()

    # Step 5: Calculate Argument Complexity
    calculate_argument_complexity()

    # Step 6: Correlation Analysis
    calculate_correlation()

    # Step 7: Visualization
    visualization()

In [14]:
from urllib.request import urlopen

data = urlopen('https://raw.githubusercontent.com/GVSU-CIS635/Datasets/refs/heads/master/republic.txt')
html_response = data.read()
encoding = data.headers.get_content_charset('utf-8')
text_data = html_response.decode(encoding)

#main(text_data)
