### Import packages

In [None]:
import os
import json
import string
import pandas as pd
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Create documents corpus

In [None]:
# Directory that holds the plain-text documents making up the corpus.
corpus_dir = "./Literature-original"
# Raw string literal: `\.` must reach the regex engine as an escaped dot.
# In a normal string literal `\.` is an invalid escape sequence, which newer
# Python versions warn about.
corpus = PlaintextCorpusReader(corpus_dir, r".*\.txt") # type: ignore
files_names = corpus.fileids()
files_names

### Corpus documents preprocessing

In [None]:
# Map every corpus file id to the raw text of that file.
documents = {file_id: corpus.raw(file_id) for file_id in files_names}
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Token count of each document before any cleaning, stored under the "pre" key.
lengths = {
    doc_id: {"pre": len(word_tokenize(raw_text))}
    for doc_id, raw_text in documents.items()
}
print(json.dumps(lengths, indent=4, ensure_ascii=False))

In [None]:
# Porter stemmer instance; the preprocessing cell calls ps.stem() on each token.
ps = PorterStemmer()

In [None]:
# Build the stop-word set once. The original rebuilt the list for every single
# token and then did a linear scan over it for each membership test.
stop_words = set(stopwords.words('english'))

# NOTE: this overwrites `documents` in place, so re-run the cell that loads the
# raw texts before re-running this one; otherwise the already-cleaned text is
# processed a second time.
for file_name in documents:
    text = documents[file_name].lower()
    # Drop ASCII punctuation and digit characters in a single pass.
    text = "".join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )
    # Filter stop words BEFORE stemming: stemming first turns words such as
    # "this" or "was" into "thi" / "wa", which no longer match the stop-word
    # list and therefore survived the original filtering step.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    documents[file_name] = " ".join(ps.stem(word) for word in tokens)
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Record each document's token count after cleaning under the "post" key.
for doc_id, cleaned_text in documents.items():
    token_count = len(word_tokenize(cleaned_text))
    lengths[doc_id]['post'] = token_count
print(json.dumps(lengths, indent=4, ensure_ascii=False))

In [None]:
# Convert the per-document length dict into a DataFrame; orient='index' makes
# the outer dict keys (file names) the row index.
# NOTE(review): this rebinds `lengths` from a dict to a DataFrame, so the
# earlier cells that treat it as a dict cannot be re-run after this one without
# rebuilding the dict first.
lengths = pd.DataFrame.from_dict(lengths, orient='index')

In [None]:
# Tokens removed by preprocessing, as an absolute count ("diff") and as a
# fraction of the original token count ("pct").
tokens_removed = lengths['pre'].sub(lengths['post'])
lengths['diff'] = tokens_removed
lengths['pct'] = lengths['diff'].div(lengths['pre'])
lengths


### Create frequency matrix

In [None]:
# One row per document (index = file name) with the cleaned text in a single
# column named "content".
docs = pd.DataFrame.from_dict(documents, orient="index")
docs = docs.set_axis(['content'], axis="columns")
docs

In [None]:
# Term-count representation: fit a CountVectorizer on the cleaned text column
# and transform it in the same step. `cv` is kept because later cells read its
# vocabulary via get_feature_names_out().
cv = CountVectorizer()
matrix_tf = cv.fit_transform(docs['content'])
matrix_tf

In [None]:
# TF-IDF representation of the same cleaned text column, built with a separate
# TfidfVectorizer instance (fit and transform in one step).
tv = TfidfVectorizer()
matrix_tfidf = tv.fit_transform(docs['content'])
matrix_tfidf

In [None]:
# Sparsity of the term-count matrix: one minus the ratio of stored entries to
# the total number of cells (rows * columns).
n_rows, n_cols = matrix_tf.shape
stored_entries = matrix_tf.getnnz()
sparsity_tf = 1-(stored_entries/(n_rows*n_cols))
sparsity_tf

### Directories for results

In [None]:
# Output folders for the artefacts this notebook writes.
result_dirs = (
    "./wordclouds",
    "./topic_modelling",
    "./topic_modelling/topics",
    "./topic_modelling/documents",
    "./clustering",
    "./ngrams",
)
# os.makedirs(..., exist_ok=True) is a no-op for an existing directory and
# creates missing parents, replacing the repeated exists()/mkdir() pairs (which
# could race between the check and the creation). Unlike the original, it
# raises FileExistsError if a path exists but is a regular file.
for result_dir in result_dirs:
    os.makedirs(result_dir, exist_ok=True)

### Wordclouds

In [None]:
# Shared WordCloud generator, reused for every document in the loop below.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    # Fixed typo: "stealblue" is not a valid colour name; "steelblue" is.
    # NOTE(review): per the wordcloud docs the contour is presumably only drawn
    # when a mask is supplied, so the typo was latent - confirm.
    contour_color="steelblue",
)

In [23]:
# Render one word cloud per document and save it as ./wordclouds/<name>.png.
for index, row in docs.iterrows():
    # splitext strips only the trailing extension. The original
    # index.replace("txt", "png") rewrote every "txt" occurring in the name.
    doc_name = os.path.splitext(index)[0]
    output_path = './wordclouds/{}.png'.format(doc_name)
    # Make sure the target folder exists (also covers an index that contains a
    # sub-directory component).
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    wordcloud.generate(row['content'])
    # Explicit figure/axes instead of the implicit pyplot state, so exactly
    # this figure is saved and then closed.
    fig, ax = plt.subplots()
    ax.imshow(wordcloud)
    ax.axis("off")
    ax.set_title(doc_name)
    fig.savefig(output_path)
    plt.close(fig)

### Topic modelling

In [24]:
def plot_top_words(model, feature_names, n_top_words, title, size, figsize=(30, 15)):
    """Plot the highest-weighted words of every topic as horizontal bar charts.

    One subplot is drawn per row of ``model.components_`` and the figure is
    saved to ``./topic_modelling/topics/<title>.png``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of word weights per topic).
    feature_names : sequence of str
        Word for each column of ``components_``; a list or an array both work.
    n_top_words : int
        Number of top-weighted words shown per topic.
    title : str
        Figure title; also used as the output file name.
    size : tuple of (int, int)
        Subplot grid as (rows, columns).
    figsize : tuple of (float, float), optional
        Figure size in inches; defaults to the previously hard-coded (30, 15).

    Raises
    ------
    ValueError
        If the grid described by ``size`` has fewer cells than there are topics.
    """
    colors = ['forestgreen', 'palevioletred', 'darkcyan', 'plum','darkkhaki','indianred','darkseagreen','tan','maroon','darkorchid' ]
    n_rows, n_cols = size
    n_topics = len(model.components_)
    if n_rows * n_cols < n_topics:
        raise ValueError(
            f"A {n_rows}x{n_cols} grid cannot hold {n_topics} topics"
        )

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last n_top_words indices are the heaviest.
        top_features_ind = topic.argsort()[-n_top_words:]
        # Element-wise lookup works for plain lists as well as numpy arrays.
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        # Cycle through the palette so more than len(colors) topics do not fail.
        ax.barh(top_features, weights, height=0.7, color=colors[topic_idx % len(colors)])
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Hide grid cells that have no topic to show.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The title only needs to be set once, not on every loop iteration.
    fig.suptitle(title, fontsize=40)
    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    fig.savefig(f"./topic_modelling/topics/{title}.png")
    plt.close(fig)


In [25]:
def plot_documents(model, matrix, n_topics, title, doc_names=None):
    """Plot each document's topic weights as one stacked horizontal bar.

    The figure is saved to ``./topic_modelling/documents/<title>.png``.

    Parameters
    ----------
    model : object
        Fitted model whose ``transform(matrix)`` returns one row per document
        and one column per topic.
    matrix : array-like or sparse matrix
        Document-term matrix passed to ``model.transform``.
    n_topics : int
        Number of topic columns returned by ``model.transform``.
    title : str
        Figure title; also used as the output file name.
    doc_names : sequence of str, optional
        File names used as bar labels (extension stripped). Defaults to the
        notebook-level ``files_names`` list, as the original implementation did.
    """
    colors = ['forestgreen', 'palevioletred', 'darkcyan', 'plum','darkkhaki','indianred','darkseagreen','tan','maroon','darkorchid' ]
    if doc_names is None:
        # Backward-compatible default: the corpus file list defined in the notebook.
        doc_names = files_names

    docs_topics = pd.DataFrame(
        model.transform(matrix),
        # 1-based labels so they match the "Topic N" titles of plot_top_words.
        columns=[f"Topic {x + 1}" for x in range(n_topics)],
    )
    docs_topics.index = [os.path.splitext(file_name)[0] for file_name in doc_names]

    # The original called plt.Figure(figsize=...), which only builds a detached
    # Figure object that is discarded, so the requested size was never applied.
    fig, ax = plt.subplots(figsize=(7, 4))

    # Running left edge of the next bar segment for every document.
    left = 0.0
    for i, col in enumerate(docs_topics.columns):
        widths = docs_topics[col].to_numpy()
        # Cycle through the palette so more than len(colors) topics do not fail.
        ax.barh(docs_topics.index, widths, left=left, label=col, color=colors[i % len(colors)])
        left = left + widths

    ax.set_title(title)
    # The bars already carry labels; without a legend they were never shown.
    ax.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
    # bbox_inches="tight" keeps the outside legend and long labels in the image.
    fig.savefig(f"./topic_modelling/documents/{title}.png", bbox_inches="tight")
    plt.close(fig)



In [26]:
# Topic-modelling settings (presumably passed to plot_top_words /
# plot_documents by later cells - confirm).
n_topics = 10
n_top_words = 20
# Subplot grid as (rows, columns).
size = (2,5)

# Vocabulary of the fitted CountVectorizer.
features_names = cv.get_feature_names_out()

### Clustering

### N-grams