First things first, we install and import all the packages that we are going to need later.

In [None]:
%%capture
!pip install nltk
!pip install numpy
!pip install scipy
!pip install --upgrade gensim
!pip install wordcloud
!pip install pdfminer.six
!pip install sklearn


In [None]:
%%capture
import pandas as pd
import os
import re
from wordcloud import WordCloud
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora
from gensim import models
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import sys
import io
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import pickle 
import numpy as np
import matplotlib.pyplot as plt
import six

Sometimes it is not possible to retrieve a .txt version of a literary text. The function below converts a .pdf into a .txt file (you may lose some formatting information in the process, but this should not be an issue for our task).

In [None]:
def pdfparser(data, output="outputname.txt"):
    """Extract the text of a PDF, print it and append it to a text file.

    Args:
        data: Path of the PDF file to read.
        output: Path of the text file the extracted text is appended to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once, after all pages; a PDF without pages yields "".
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    print(text)
    # Append mode: repeated calls accumulate text in the same output file.
    with open(output, "a", encoding="utf-8") as f:
        f.write(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])

Here we load the .txt files of the authors we are interested in into a Python-readable form, i.e. strings. We do so for all the authors and then merge the resulting strings into larger clusters (in this case, ancient, modern and contemporary philosophy).
Since the raw text is not very useful as it is, we use nltk to do some basic cleaning. The function below tokenizes the raw text and gets rid of some custom stopwords (e.g. the instances of the name "Socrates", which appears on almost every page of any of Plato's works).
We use WordCloud to get a first look at the textual information that we currently have.

In [None]:
# Paths of the plain-text sources, grouped by author.
Plato = ["/content/Plato Fedone.txt", "/content/Plato Teeteto.txt", "/content/plato menone.txt"]
Aristotele = ["/content/aristotele categorie.txt"]

Descartes = ["/content/descartes discorso sul metodo.txt", "/content/descartes principi di filosofia.txt"]
Hume = ["/content/hume trattato sulla natura umana.txt"]
Kant = ["/content/kant critica ragion pura.txt", "/content/kant prolegomeni.txt"]

Feldman = ["/content/Earl Conee, Richard Feldman - Evidentialism_ Essays in Epistemology.txt"]
Williamson = ["/content/Timothy Williamson - Knowledge and Its Limits.txt", "/content/Timothy Williamson - Tetralogue_ I'm Right, You're Wrong.txt"]
Sosa = ["/content/Ernest Sosa - A Virtue Epistemology I.txt", "/content/Ernest Sosa - A Virtue Epistemology II.txt", "/content/Ernest Sosa - Knowledge in Perspective.txt", "/content/Laurence BonJour, Ernest Sosa - Epistemic Justification.txt"]


def _read_works(paths):
    """Return the contents of the text files at *paths*, concatenated in order."""
    contents = []
    for path in paths:
        # The context manager closes each file as soon as it has been read.
        with open(path, "r", encoding="utf-8") as f:
            contents.append(f.read())
    return "".join(contents)


# One string per author holding all of that author's texts.
Plato_works = _read_works(Plato)
Kant_works = _read_works(Kant)
Williamson_works = _read_works(Williamson)
Sosa_works = _read_works(Sosa)
Feldman_works = _read_works(Feldman)
# The spelling "Aritotele_works" is kept because other cells refer to this name.
Aritotele_works = _read_works(Aristotele)
Descartes_works = _read_works(Descartes)
Hume_works = _read_works(Hume)

# One string per period: ancient, modern and contemporary philosophy.
Antichità = Plato_works + Aritotele_works
Modernità = Descartes_works + Kant_works + Hume_works
Oggi = Williamson_works + Sosa_works + Feldman_works

In [None]:
def clear_text(text):
    """Tokenize *text* and drop English and corpus-specific stop words.

    Args:
        text: Raw text to clean.

    Returns:
        A list of the word tokens (runs of letters, digits and underscores,
        original casing kept) whose lowercase form is not a stop word.
    """
    custom_stopwords = ['theodorus', 'simmias', 'cebes', 'protagoras', 'phaedo', 'anytus', 'roxana', 'bob', 'kp', '_i_', 'richard', 'boo', 'zac', 'tm', 'htm', 'SOCRATES:', 'socrates', 'theaetetus', 'meno', 'bob:', 'sarah','from', 'project', 're', 'edu', 'use', 'gutenberg', 'EBook', 'eBook', 'www', 'org', 'one', 'p', 'C', '1', 'c', 'www.gutenberg.org', 'E', 'F', "e", "f", 'ii', 'II']
    # Tokens are compared in lowercase, so the stop words are lowercased as
    # well (otherwise entries such as 'EBook' could never match). A set makes
    # each membership test O(1) instead of a scan of the whole list.
    stopwords = {word.lower() for word in nltk.corpus.stopwords.words("english")}
    stopwords.update(word.lower() for word in custom_stopwords)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    return [w for w in tokenizer.tokenize(text) if w.lower() not in stopwords]


In [None]:
def word_cloud(corpus, output):
    """Render a word cloud of *corpus* and save it as ``<output>_wordcloud.png``.

    Args:
        corpus: Text to visualise, as a single string.
        output: Prefix of the PNG file name.
    """
    cloud = WordCloud(width=1920, height=1080, background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
    cloud.generate(corpus)
    # to_file() renders the image itself, so no separate to_image() call is needed.
    cloud.to_file(output + '_wordcloud.png')

Use nltk for processing the texts, then use wordcloud to plot the most frequent words. For a more accurate result we also create a table with the 10 most common words for each author, including their raw frequencies.

In [None]:
# Replace each period's raw text with its cleaned tokens, re-joined by single spaces.
Antichità, Modernità, Oggi = (
    " ".join(clear_text(period_text))
    for period_text in (Antichità, Modernità, Oggi)
)

In [None]:
# Save one word-cloud image per period.
for period_text, file_prefix in (
    (Antichità, "antichità"),
    (Modernità, "modernità"),
    (Oggi, "oggi"),
):
    word_cloud(period_text, file_prefix)

In [None]:
def get_common_words(text, number):
    """Return the *number* most frequent tokens of *text* with their raw counts.

    The text is cleaned with ``clear_text`` first; the result is whatever
    ``nltk.FreqDist.most_common`` returns for the cleaned tokens.
    """
    tokens = clear_text(text)
    counts = nltk.FreqDist(tokens)
    top_words = counts.most_common(number)
    return top_words

In [None]:
# Ten most frequent words (with raw counts) per author, one table column per author.
author_works = {
    "Plato": Plato_works,
    "Aristotle": Aritotele_works,
    "Descartes": Descartes_works,
    "Hume": Hume_works,
    "Kant": Kant_works,
    "Sosa": Sosa_works,
    "Williamson": Williamson_works,
    "Feldman": Feldman_works,
}
lista_autori = [get_common_words(works, 10) for works in author_works.values()]
data = dict(zip(author_works, lista_autori))

df = pd.DataFrame(data)

# Export the table to a spreadsheet without the row index.
df.to_excel("authors_commonwords.xlsx", index=False)

# Disabled code: the table-rendering helper and its call are wrapped in a
# string literal below, so none of it is executed in this cell.
'''def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    if ax is None:
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    for k, cell in  six.iteritems(mpl_table._cells):
        cell.set_edgecolor(edge_color)
        if k[0] == 0 or k[1] < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
    return ax

render_mpl_table(df, header_columns=0, col_width=4.0)'''


"def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,\n                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',\n                     bbox=[0, 0, 1, 1], header_columns=0,\n                     ax=None, **kwargs):\n    if ax is None:\n        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])\n        fig, ax = plt.subplots(figsize=size)\n        ax.axis('off')\n\n    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)\n\n    mpl_table.auto_set_font_size(False)\n    mpl_table.set_fontsize(font_size)\n\n    for k, cell in  six.iteritems(mpl_table._cells):\n        cell.set_edgecolor(edge_color)\n        if k[0] == 0 or k[1] < header_columns:\n            cell.set_text_props(weight='bold', color='w')\n            cell.set_facecolor(header_color)\n        else:\n            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])\n    return ax\n\nrender_mpl_table(

In [None]:
# Fifteen most frequent words for each period, one table column per period.
period_texts = {"Ancient": Antichità, "Modern": Modernità, "Contemporary": Oggi}
lista_peirodi = [get_common_words(period_text, 15) for period_text in period_texts.values()]
data = dict(zip(period_texts, lista_peirodi))

df = pd.DataFrame(data)


def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    """Draw a DataFrame as a styled matplotlib table.

    Args:
        data: Table to draw; ``data.values`` fills the cells and
            ``data.columns`` the header row.
        col_width: Figure width allotted to each column (used only when
            ``ax`` is None).
        row_height: Figure height allotted to each row (used only when
            ``ax`` is None).
        font_size: Font size applied to the table.
        header_color: Face colour of the header cells.
        row_colors: Face colours cycled through by row index for body cells.
        edge_color: Edge colour of every cell.
        bbox: Bounding box passed on to ``ax.table``.
        header_columns: Number of leading columns styled like the header row.
        ax: Axes to draw on; when omitted a new figure is created and its
            axes are hidden.
        **kwargs: Extra keyword arguments forwarded to ``ax.table``.

    Returns:
        The axes that contain the table.
    """
    # NOTE: the list defaults above are only read (indexed or passed on), never mutated.
    if ax is None:
        # (columns, rows + 1 for the header row) scaled to the figure size.
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        _, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    # get_celld() is the public accessor for the {(row, column): cell} mapping.
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[row % len(row_colors)])
    return ax

# Draw the table held in `df`, using wider columns (4.0) than the helper's default (3.0).
render_mpl_table(df, header_columns=0, col_width=4.0)


The raw frequencies, however, are not that meaningful. For a better result, we perform the same task using TF-IDF. 

In [None]:
# Every source text file, listed author by author.
all_txt_files = [
    "/content/Plato Fedone.txt",
    "/content/Plato Teeteto.txt",
    "/content/plato menone.txt",
    "/content/aristotele categorie.txt",
    "/content/descartes discorso sul metodo.txt",
    "/content/descartes principi di filosofia.txt",
    "/content/hume trattato sulla natura umana.txt",
    "/content/kant critica ragion pura.txt",
    "/content/kant prolegomeni.txt",
    "/content/Timothy Williamson - Knowledge and Its Limits.txt",
    "/content/Timothy Williamson - Tetralogue_ I'm Right, You're Wrong.txt",
    "/content/Ernest Sosa - A Virtue Epistemology I.txt",
    "/content/Ernest Sosa - A Virtue Epistemology II.txt",
    "/content/Ernest Sosa - Knowledge in Perspective.txt",
    "/content/Laurence BonJour, Ernest Sosa - Epistemic Justification.txt",
    "/content/Earl Conee, Richard Feldman - Evidentialism_ Essays in Epistemology.txt",
]
n_files = len(all_txt_files)

# One document per author: each entry is the concatenation of that author's texts.
all_docs = [
    Plato_works,
    Aritotele_works,
    Descartes_works,
    Hume_works,
    Kant_works,
    Sosa_works,
    Williamson_works,
    Feldman_works,
]

In [None]:
# English stop words plus corpus-specific entries (speaker names, e-book boilerplate, stray letters).
stopwords = nltk.corpus.stopwords.words("english")
stopwords.extend(['gorgias', 'license', 'feldman', '2010', 'theodorus', 'simmias', 'cebes', 'protagoras', 'phaedo', 'anytus', 'roxana', 'bob', 'kp', '_i_', 'richard', 'boo', 'zac', 'tm', 'htm', 'SOCRATES:', 'socrates', 'theaetetus', 'meno', 'bob:', 'sarah','from', 'project', 're', 'edu', 'use', 'gutenberg', 'EBook', 'eBook', 'www', 'org', 'one', 'p', 'C', '1', 'c', 'www.gutenberg.org', 'E', 'F', "e", "f", 'ii', 'II'])
# TfidfVectorizer lowercases and tokenizes the documents before it filters
# stop words, so the stop words go through the same steps here (lowercase,
# tokens of two or more word characters). Otherwise entries such as 'EBook'
# can never match, and scikit-learn warns that the stop words are
# inconsistent with the preprocessing.
stopwords = sorted({
    token
    for word in stopwords
    for token in re.findall(r"(?u)\b\w\w+\b", word.lower())
})
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=stopwords, use_idf=True, norm=None)
transformed_documents = vectorizer.fit_transform(all_docs)

transformed_documents_as_array = transformed_documents.toarray()
# Sanity check: len(transformed_documents_as_array) should equal len(all_docs), i.e. one row per document.


  'stop_words.' % sorted(inconsistent))


In [None]:
# One output CSV per author, in the same order as the documents passed to the vectorizer.
output_filenames = ["Plato.csv", "Aristotle.csv", "Descartes.csv", "Hume.csv", "Kant.csv", "Sosa.csv", "Williamson.csv", "Feldman.csv"]

# The vocabulary is identical for every document, so it is fetched once, outside the loop.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# loop each item in transformed_documents_as_array, using enumerate to keep track of the current position
for counter, doc in enumerate(transformed_documents_as_array):
    # construct a dataframe of (term, score) rows, highest score first
    one_doc_as_df = (
        pd.DataFrame({'term': feature_names, 'score': doc})
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # output to a csv using the enumerated value for the filename
    one_doc_as_df.to_csv(output_filenames[counter])

In [None]:
# Per-author CSV files; each is expected to provide "term" and "score" columns.
lista_autori_tfidf = ["Plato.csv", "Aristotle.csv", "Descartes.csv", "Hume.csv", "Kant.csv", "Sosa.csv", "Williamson.csv", "Feldman.csv"]
data_tfidf = dict()

for csv_name in lista_autori_tfidf:
    # Keep only the first ten rows of the file.
    top_terms = pd.read_csv(csv_name).head(10)
    scored_terms = [
        (term, round(score, 3))
        for term, score in zip(top_terms["term"].tolist(), top_terms["score"].tolist())
    ]
    # Strip the ".csv" extension to obtain the column name.
    data_tfidf[csv_name[:-4]] = scored_terms

df = pd.DataFrame(data_tfidf)
df.to_excel("tfidf_table.xlsx", index=False)

# Disabled code: the table-rendering helper and its call are wrapped in a
# string literal below, so none of it is executed in this cell.
'''def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    if ax is None:
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    for k, cell in  six.iteritems(mpl_table._cells):
        cell.set_edgecolor(edge_color)
        if k[0] == 0 or k[1] < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
    return ax

render_mpl_table(df, header_columns=0, col_width=4.0)'''

"def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,\n                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',\n                     bbox=[0, 0, 1, 1], header_columns=0,\n                     ax=None, **kwargs):\n    if ax is None:\n        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])\n        fig, ax = plt.subplots(figsize=size)\n        ax.axis('off')\n\n    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)\n\n    mpl_table.auto_set_font_size(False)\n    mpl_table.set_fontsize(font_size)\n\n    for k, cell in  six.iteritems(mpl_table._cells):\n        cell.set_edgecolor(edge_color)\n        if k[0] == 0 or k[1] < header_columns:\n            cell.set_text_props(weight='bold', color='w')\n            cell.set_facecolor(header_color)\n        else:\n            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])\n    return ax\n\nrender_mpl_table(

In [None]:
# Text files of each period; only the ancient ones are loaded into Corpus below.
antichità = ["/content/Plato Fedone.txt", "/content/Plato Teeteto.txt", "/content/plato menone.txt", "/content/aristotele categorie.txt"]
modernità = ["/content/descartes discorso sul metodo.txt", "/content/descartes principi di filosofia.txt", "/content/hume trattato sulla natura umana.txt", "/content/kant critica ragion pura.txt", "/content/kant prolegomeni.txt"]
oggi = ["/content/Timothy Williamson - Knowledge and Its Limits.txt", "/content/Timothy Williamson - Tetralogue_ I'm Right, You're Wrong.txt", "/content/Ernest Sosa - A Virtue Epistemology I.txt", "/content/Ernest Sosa - A Virtue Epistemology II.txt", "/content/Ernest Sosa - Knowledge in Perspective.txt", "/content/Laurence BonJour, Ernest Sosa - Epistemic Justification.txt", "/content/Earl Conee, Richard Feldman - Evidentialism_ Essays in Epistemology.txt"]

# One list entry per file, holding the file's full text.
Corpus = []
for text in antichità:
    # The context manager closes each file as soon as it has been read.
    with open(text, "r", encoding="utf-8") as f:
        Corpus.append(f.read())

In [None]:
from collections import defaultdict
# Build the stop-word list: English stop words plus corpus-specific entries
stopwords = nltk.corpus.stopwords.words("english")
stopwords.extend(['us','may', ',', '.', 'theodorus', 'simmias', 'cebes', 'protagoras', 'phaedo', 'anytus', 'roxana', 'bob', 'kp', '_i_', 'richard', 'boo', 'zac', 'tm', 'htm', 'socrates:', 'socrates', 'theaetetus:', 'meno:', 'bob:', 'sarah','from', 'project', 're', 'edu', 'use', 'gutenberg', 'gutenberg-tm', 'EBook', 'eBook', 'www', 'org', 'one', 'p', 'C', '1', 'c', 'www.gutenberg.org', 'E', 'F', "e", "f", 'ii', 'II'])
# The documents are lowercased before filtering, so the stop words are
# lowercased too (entries such as 'EBook' could otherwise never match).
# A set makes each membership test O(1) instead of a scan of the whole list.
stopword_set = {word.lower() for word in stopwords}
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stopword_set]
         for document in Corpus]

# Count word frequencies across the whole corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]


In [None]:
# Build the gensim dictionary (token <-> integer id mapping) from the processed corpus.
dictionary = corpora.Dictionary(processed_corpus)

In [None]:
# Convert every processed document to its bag-of-words representation.
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]


In [None]:
# How many topics the model should extract.
num_topics = 7
# Train the LDA topic model on the bag-of-words corpus.
lda_options = dict(corpus=bow_corpus, id2word=dictionary, num_topics=num_topics)
lda_model = gensim.models.LdaMulticore(**lda_options)

In [None]:
# Apply the trained LDA model to the bag-of-words corpus (nothing is printed here).
doc_lda = lda_model[bow_corpus]

In [None]:
!pip install --upgrade pandas==1.2
# Pin pandas to version 1.2 for the visualisation step.
# NOTE(review): presumably a pyLDAvis compatibility requirement; confirm before changing the pin.

Requirement already up-to-date: pandas==1.2 in /usr/local/lib/python3.7/dist-packages (1.2.0)


In [None]:
#we need the following to plot the results
%%capture
!pip install pyldavis
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = './antichità_'+str(num_topics)
# Preparing the visualization data is time consuming. Set this flag to False
# to reuse the data pickled by a previous run instead of recomputing it.
RECOMPUTE_LDAVIS = True
if RECOMPUTE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# Load the prepared pyLDAvis data from disk. This runs whether or not the data
# was just recomputed, so LDAvis_prepared is always defined below.
# NOTE: only unpickle files written by this notebook; pickle is unsafe on untrusted data.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './antichità_'+ str(num_topics) +'.html')
LDAvis_prepared