# HARRY POTTER TEXT MINING DIY

In [None]:
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
from wordcloud import WordCloud
from string import punctuation
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer # Sentiment analysis
import pprint # to print dictionaries
from nrclex import NRCLex

In [None]:
from func.proc import read_hp

# Load the seven books; the file names differ only by the book number.
book1, book2, book3, book4, book5, book6, book7 = (
    read_hp(f'Book{number}.txt') for number in range(1, 8)
)

I concatenate all seven books into a single string and convert it to lower case.

In [None]:
# Concatenate the seven books in reading order and lower-case the result in one step.
books = (book1 + book2 + book3 + book4 + book5 + book6 + book7).lower()

I start managing my text:
1. create my own tokenizer:
    1. consider all the words
    2. consider common 

From the resulting token list I keep only the words, discarding punctuation and numbers.

In [None]:
from func.proc import CustomTokenizer

custom_tokenizer = CustomTokenizer()

# Every item produced by the tokenizer is indexable: position 0 holds the
# token text and position 1 its tag. Only items tagged 'WORD' are kept.
hp_tokens = [
    entry[0]
    for entry in custom_tokenizer.tokenize(books)
    if entry[1] == 'WORD'
]

Now that I have the tokens from the books, I remove the stop words (taken from a common dictionary and extended by us using some frequency statistics).

In [None]:
# stop_words.txt = stopwords tidytext from R
# Read one stop word per line, skipping blank lines and "Page |" markers.
# The encoding is explicit so the result does not depend on the platform default.
with open("stop_words.txt", 'r', encoding="utf-8") as file:
    hp_stop_w = [
        line.strip()
        for line in file
        if not (line.startswith("Page |") or line.strip() == '')
    ]

# `hp_stop_w` stays a list because it is passed on to other helpers later;
# the set is only used here to make each membership test O(1).
hp_stop_set = set(hp_stop_w)

# Drop the stop words and the tokens that start with an apostrophe
# (both filters applied in a single pass).
hp_tokens_sw = [
    token
    for token in hp_tokens
    if token not in hp_stop_set and not token.startswith("'")
]

In [None]:
# Frequency table of the tokens that survived stop-word removal.
hp_counter = Counter(hp_tokens_sw)

N = 45  # number of top-ranked words shown in the chart
top_words = hp_counter.most_common(N)

plt.figure(figsize=(15, 3))
plt.title(f"{N} Most frequent words in the Harry Potter series")
# zip(*pairs) turns [(word, count), ...] into (words, counts) for plt.bar.
plt.bar(*zip(*top_words), color="gold")
plt.xticks(rotation="vertical")
plt.show()

In [None]:
# WordCloud.generate takes one text string, so the tokens are re-joined with spaces.
hp_text = " ".join(hp_tokens_sw)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(hp_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from PIL import Image

# Image used as the shape of the word cloud, loaded as a NumPy array.
MASK = np.array(Image.open("Sorting_Hat.png"))
MAX_WORDS = 200
MAX_FONT_SIZE = 500
RELATIVE_SCALING = 0.7

# NOTE(review): the wordcloud docs say width/height are ignored when a mask
# is supplied; they are kept here unchanged — confirm whether they are needed.
cloud_options = dict(
    width=500,
    height=300,
    mask=MASK,
    max_words=MAX_WORDS,
    background_color="white",
    max_font_size=MAX_FONT_SIZE,
    relative_scaling=RELATIVE_SCALING,
)

# Build the cloud directly from the word counts rather than from raw text.
hp = WordCloud(**cloud_options).generate_from_frequencies(hp_counter)

plt.figure(figsize=(10, 5))
plt.imshow(hp, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# English Snowball stemmer, reused for the per-book stemming further down.
stemmer = SnowballStemmer("english")
# Stem every token of the whole series, keeping the original order.
hp_stem = list(map(stemmer.stem, hp_tokens_sw))

## ORA LO FACCIAMO PER LIBRO 

In [None]:
# The seven book texts in series order; index 0 is book 1, index 6 is book 7.
hp_books = [book1, book2, book3, book4, book5, book6, book7]

In [None]:
from func.proc import preproc

# Pre-processed tokens of each book, keyed "book_1" ... "book_7".
books_hp_token = {
    f"book_{number}": preproc(text, custom_tokenizer, stop_words = hp_stop_w)
    for number, text in enumerate(hp_books, start=1)
}

# Stemmed version of the same tokens, under the same keys and in the same order.
books_hp_stem = {
    name: [stemmer.stem(token) for token in tokens]
    for name, tokens in books_hp_token.items()
}

## PLATONE

### SPIEGONE 

Inizialmente partiamo prendendo le 400 parole più frequenti all'interno del primo e del settimo libro (che saranno gli estremi del nostro ordinamento di libri) e di queste parole, terremo quelle che sono presenti anche negli altri 5 libri eliminando le parole che ai fini dell'analisi non ci sembrano rilevanti.

Dentro `books_hp_stem` abbiamo gli stemmi per ciascun libro, ora ci creiamo un dizionario che conti le parole del primo e del settimo

In [None]:
# Stem counts for the first and the last book, stored as dicts ordered from
# the most to the least frequent stem. Counter.most_common() gives that order
# and keeps equally frequent stems in first-seen order.
x = Counter(books_hp_stem['book_1'])
book11 = dict(x.most_common())
y = Counter(books_hp_stem['book_7'])
book77 = dict(y.most_common())

Ora quindi ci teniamo le prime 400 parole dal primo e dal settimo libro che sono quelli che mi definiranno l'ordinamento dei libri

In [None]:
# take the most common stems that are in book 1 and book 7
book11 = dict(list(book11.items())[:400])
book77 = dict(list(book77.items())[:400])
most_comm_17 = list(set(book11.keys()) & set(book77.keys()))

In [None]:
# Work on a separate list so that most_comm_17 itself is left untouched.
most_comm_tot = list(most_comm_17)

In [None]:
# Keep only the stems that occur in every book.
# Each book's stem list is turned into a set once, so every membership test
# is O(1) instead of a scan over the whole book.
for book_stems in books_hp_stem.values():
    stems_in_book = set(book_stems)
    most_comm_tot = [stem for stem in most_comm_tot if stem in stems_in_book]

We consider two initial populations, $W_1$ = *Harry Potter and the Philosopher's Stone* and $W_7$ = *Harry Potter and the Deathly Hallows*. Let $n_{i1}$ be the frequency of the *i*-th stem, for $i = 1, \dots, N_{stem}$, in book 1, let $N_1 = \sum_{i}n_{i1}$, and let $N_7$ be the corresponding sum for $W_7$. We denote by $\theta_{1i}$, for $i = 1, \dots, N_{stem}$, the probability that the *i*-th stem occurs in $W_1$, and by $\theta_{7i}$ the same quantity for $W_7$. The probability of observing a sample from $W_1$ or from $W_7$ follows a multinomial distribution. Using the log-likelihood ratio test to compare the two populations we get the expression:

$$ \sum_{i=1}^{N_{stem}} n_i\log\frac{\theta_{7i}}{\theta_{1i}}$$

We can see that every stem contributes its own score, namely:

$$ s_i = \log{\frac{\theta_{7i}}{\theta_{1i}}}$$



In [None]:
# Book 1: counts of the selected stems, sorted from most to least frequent,
# plus each count's share of the total.
counts_book_1 = Counter(books_hp_stem['book_1'])
filtered_dict = {stem: n for stem, n in counts_book_1.items() if stem in most_comm_tot}
xx = sorted(filtered_dict.items(), key=lambda pair: pair[1], reverse=True)
df_book = pd.DataFrame(xx, columns = ['Stems', 'Count_1'])
df_book['Freq_1'] = df_book['Count_1'] / df_book['Count_1'].sum()
df_book.head()

# Book 7: same quantities, aligned to the rows of df_book through the stem name.
counts_book_7 = Counter(books_hp_stem['book_7'])
filtered_dict = {stem: n for stem, n in counts_book_7.items() if stem in most_comm_tot}
xx = dict(sorted(filtered_dict.items(), key=lambda pair: pair[1], reverse=True))
count_7 = df_book['Stems'].map(xx)
df_book['Count_7'] = count_7
df_book['Freq_7'] = count_7 / count_7.sum()
df_book.head()


Now the idea is to extend the analysis to the other books, which represent new populations to be classified, and to use the total score of each book as a discriminant function. Thus let $W_k$ with $k = 2,3,4,5,6$ be the populations that represent the books from 2 to 6. For these populations, the same results apply as for $W_1$ and $W_7$. So for each $W_k$ with $k = 2,3,4,5,6$ we can calculate a measure that represents the overall score assigned to $W_k$. In particular, for each $W_k$, where $N_k = \sum_i n_{ik}$ and $n_{ik}$ is the frequency of the *i*-th stem in book $k$, one can calculate the average score

$$
\bar{s}_k = \frac{1}{N_k} \sum_{i=1}^{N_{stem}} n_{ik} \log \frac{\theta_{7i}}{\theta_{1i}} = \frac{1}{N_k} \sum_{i=1}^{N_{stem}} n_{ik}\, s_i, \quad k = 2, \dots, 6.
$$

To calculate the average scores we need to estimate the parameter vectors $\theta_1$ and $\theta_7$. Since both $N_1$ and $N_7$ are large, the probabilities $\hat{\theta}_{1i}$ and $\hat{\theta}_{7i}$ can be estimated with the corresponding observed frequencies. In this way we obtain, for each of the five books to be classified, an associated score that represents the positioning of that book. This measure can be interpreted in relative terms to understand which books are the furthest apart and which are the closest. To make inferences and evaluate the significance of the results obtained, we can also define the variance of $\bar{s}_k$, for which an unbiased estimate is


$$
\hat{V}(\bar{s}_k) = \frac{1}{N_k(N_k - 1)} \left( \sum_{i=1}^{N_{stem}} n_{ik} s_i^2 - \frac{1}{N_k} \left( \sum_{i=1}^{N_{stem}} n_{ik} s_i \right)^2 \right).
$$

Given that $N_k$ is large, the difference $\bar{s}_k - \bar{s}_{k'}$ between the average scores of two books $k$ and $k'$ will be approximately normally distributed, with variance equal to the sum of the corresponding variances; therefore, to test its significance, we can use the usual $t$-test.


In [None]:
from func.proc import hp_count

# Add a Count_k and a Freq_k column to df_book for each book k = 2..6.
for book_no in range(2, 7):
    col_count, col_freq = hp_count(f"book_{book_no}", books_hp_stem, most_comm_tot, df_book)
    df_book[f"Count_{book_no}"] = np.array(col_count)
    df_book[f"Freq_{book_no}"] = np.array(col_freq)

df_book.head()

In [None]:
from func.proc import hp_scores

# One entry per book 2..6, each holding that book's score.
scores = {
    f"book_{book_no}": {'score': hp_scores(df_book, book_no)}
    for book_no in range(2, 7)
}

# Re-order the dictionary from the highest to the lowest score.
sorted_scores = sorted(scores.items(), key=lambda pair: pair[1]['score'], reverse = True)
scores = dict(sorted_scores)

scores

In [None]:
from func.proc import hp_var

# Add the variance and the total stem count N to each book's entry.
for book_no in range(2, 7):
    entry = scores[f"book_{book_no}"]
    entry['Variance'] = hp_var(df_book, book_no)
    entry['N'] = sum(df_book[f"Count_{book_no}"])
scores

### t-test

In [None]:
from scipy import stats
from func.proc import t_test

# Compare book 2 with book 6; t_test returns the statistic and its p-value.
t_statistic, p_value = t_test(scores["book_2"], scores["book_6"])

print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value}")

In [None]:
# Pairwise t-tests between books 2..6 (every unordered pair i < j).
# t_test is called once per pair and its two results are unpacked, instead
# of repeating the whole computation for the statistic and for the p-value.
ttest = {}
for i in range(2, 7):
    for j in range(i + 1, 7):
        t_stat, p_val = t_test(scores[f"book_{i}"], scores[f"book_{j}"])
        ttest[f"test_{i}_{j}"] = {'t-stat': round(t_stat, 3),
                                  'pval': round(p_val, 3)}
ttest
        

Come detto, è plausibile che i libri risultino ordinati in modo apparentemente casuale: questo ordinamento è stato infatti ottenuto utilizzando parole scelte senza alcun criterio. Probabilmente l’insieme di parole scelto è molto generico e, considerando anche la vastità dei testi analizzati, vengono colti specifici costrutti lessicali, che non sono però legati alla trama e allo stile dei vari libri, ma sono il risultato dell’elevata dimensione dei libri.

Inoltre il numero di parole scelto per l’analisi potrebbe essere troppo esiguo in relazione alla dimensione del problema.
Abbiamo quindi replicato l’analisi dando una direzione più specifica al problema, cioè assumendo l’ipotesi di un’evoluzione dello stile narrativo dell’autrice che ci ha portato a svolgere questo progetto. Partendo da diversi elenchi di parole categorizzate da dizionari ontologici come indicative di sentimenti negativi (come paura, tristezza, violenza) abbiamo selezionato manualmente all’interno di W1 e W7 le più frequenti, anche in relazione al contesto della saga. Dopo aver tenuto solamente le parole presenti in tutti e sette i libri

## SIMILARITY 

We start from the stemmed words matrix:

In [None]:
# One document per book: its stems joined back into a space-separated string.
documents = list(map(" ".join, books_hp_stem.values()))

vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(documents)

# Dense document-term counts (one row per book, in dictionary order).
count_array = count_matrix.toarray()

# Row-normalised counts: every row is divided by its own total.
freq_matrix = count_array / count_array.sum(axis=1, keepdims=True)

In [None]:
# Rebuilds the document strings and the dense count matrix from scratch,
# re-binding `documents`, `vectorizer`, `count_matrix` and `count_array`.
documents = list(map(" ".join, books_hp_stem.values()))

vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(documents)
count_array = count_matrix.toarray()

In [None]:
# Dimensions of the dense document-term count matrix.
count_array.shape

In [None]:
# Total of the counts in each row of the matrix.
count_array.sum(axis=1)

In [None]:
# Number of stems stored for book 1.
len(books_hp_stem['book_1'])

In [None]:
from func.proc import cosine, centroidi, kmeans

# Initial centroids: rows 0 and 6 of the count matrix, i.e. the first and the
# last document when rows follow the order of `books_hp_stem`.
# Indexing with a list returns a copy, so later in-place changes to
# `count_array` do not alter these centroids.
k_centroids = count_array[[0,6]]

In [None]:
# Feed the books to k-means in random order WITHOUT shuffling `count_array`
# in place: an in-place shuffle loses the row -> book correspondence for this
# cell and for every later cell that reuses `count_array`.
# `shuffle_order[r]` is the row index in `count_array` (0 = first book) of
# the r-th row passed to kmeans, so results can be mapped back to the books.
shuffle_order = np.random.permutation(count_array.shape[0])
gruppi_KM = kmeans(count_array[shuffle_order], k_centroids)
# NOTE(review): the return format of kmeans is not visible here — if it
# reports row positions, translate them through `shuffle_order`.
gruppi_KM

In [None]:
# Display the k-means result computed in the previous cell again.
gruppi_KM

Alex, devi cercare di capire a quali libri appartengono gli elementi di ciascun gruppo (sperando che in ogni caso venga fuori 1,2,3 e 4,5,6,7).

# Spectral Clustering

To perform spectral clustering we first have to represent the data as a graph with vertices and edges, written $\mathcal{G} = \{V, E\}$. An intuitive way to do so is to use all the observations (books) as vertices and, as the weights connecting them, the distance between each pair of observations. In particular we will use the cosine distance $d(w_i, w_j)$ already defined. In this way all the vertices are connected, with weights given by the entries of the matrix $W$, where
$$
W = w_{ij}\quad\text{with}\quad w_{ij} = d(v_i, v_j).
$$
Moreover, we need to define the degree matrix 
$$
D = diag(d_i)
$$ 
with all empty off-diagonal entries, whereas the diagonal contains the degree of each node, which is the number of edges incident on it. We will refer to it as 
$$
d_i = \sum_{j = 1}^{n}w_{ij}.
$$

It is now necessary to define the Graph Laplacian $L = D - W$ and normalize it as follows: 
$$
L_{\text{sym}}:=D^{-\frac{1}{2}}LD^{-\frac{1}{2}} = I - D^{-\frac{1}{2}}WD^{-\frac{1}{2}}
$$

### Normalized Spectral clustering according to Ng, Jordan, and Weiss (2002)
Input: weight matrix $W\in \mathbb{R}^{n\times n}$, number $k$ of clusters to construct
* Compute the normalized Laplacian $L_{\text{sym}}$.
* Compute the first $k$ eigenvectors $u_1, \dots , u_k$ of $L_{\text{sym}}$ correspondent to the smallest $k$ eigenvalues.
* Let $U\in \mathbb{R}^{n\times k}$ be the matrix containing the vectors $u_1, \dots , u_k$ of $L_{\text{sym}}$ as columns.
* Form the matrix $T\in \mathbb{R}^{n\times k}$ from $U$ by normalizing the rows to norm 1, that is set $t_{ij} = u_{ij}/(\sum_k u_{ik}^2)^{1/2}$
* For $i = 1, \dots, n$, let $y_i \in \mathbb{R}^{k}$ be the vector corresponding to the $i$-th row of $T$.
* Cluster the points $(y_i)_{i = 1, \dots, n}$ with the $k$-means algorithm into clusters $C_1, \dots, C_k$.

Output: Clusters $A_1, \dots, A_k$ with $A_i = \{j\,|\,y_j\in C_i\}$.

In [None]:
from func.proc import spectral_cl

In [None]:
# Spectral clustering of the count matrix into 2 clusters.
# NOTE(review): if `count_array` was shuffled in place by an earlier cell, its
# rows no longer follow the book order — confirm before reading the result.
spectral_cl(count_array, 2)

# SENTIMENT ANALYSIS

In [None]:
# VADER sentiment analyzer instance, reused for every book below.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Dictionary keys of the books: "book_1", ..., one per entry of hp_books.
book_names = [f"book_{number}" for number in range(1, len(hp_books) + 1)]
print(book_names)

In [None]:
# Polarity scores of each book, computed on its tokens re-joined into one
# string. This re-binds the name `scores` to a new dictionary.
scores = {}
for b in book_names:
    scores[b] = analyzer.polarity_scores(" ".join(books_hp_token[b]))
    print(b)  # progress indicator

In [None]:
# Books ordered by increasing "neg" score, printed one block per book.
sorted_scores = dict(sorted(scores.items(), key=lambda pair: pair[1]["neg"]))
for name, polarity in sorted_scores.items():
    print(f"{name}: ")
    print(polarity)
    print()

In [None]:
# Books ordered by increasing "neu" score, printed one block per book.
sorted_scores = dict(sorted(scores.items(), key=lambda pair: pair[1]["neu"]))
for name, polarity in sorted_scores.items():
    print(f"{name}: ")
    print(polarity)
    print()

As we can see, the books are perfectly ordered by their negative score. Even though the overall mood does not simply get worse year after year, each book shows a growing presence of You-Know-Who and of the Death Eaters, sometimes compensated by the positive feelings and experiences of the main characters.

In [None]:
# For every book, add up the NRCLex affect frequencies of its tokens.
book_emotions = {}
keys = ["fear", "anger", "anticip", "trust", "surprise", "positive", "negative", "sadness", "disgust", "joy", "anticipation"]
for b in book_names:
    print(b)  # progress indicator
    # Start every emotion listed in `keys` at zero.
    emotion = dict.fromkeys(keys, 0)
    for t in books_hp_token[b]:
        for name, freq in NRCLex(t).affect_frequencies.items():
            emotion[name] += freq
    book_emotions[b] = emotion
print("done")

In [None]:
# Turn each book's emotion totals into shares of that book's overall total.
# The "anticip" key is left out of the result, but it is still part of the
# sum used as the denominator.
em_stand = {}
for book, totals in book_emotions.items():
    tot_em = sum(totals.values())
    em_stand[book] = {
        name: value / tot_em
        for name, value in totals.items()
        if name != "anticip"
    }

print(em_stand["book_1"].keys())

In [None]:
# trust, disgust
# watch out: negative doesn't give such a good order as with vader
# Books ordered by increasing share of "trust".
books_by_trust = sorted(em_stand, key=lambda name: em_stand[name]["trust"])
sorted_emotions = {name: em_stand[name] for name in books_by_trust}
print(sorted_emotions.keys())

In [None]:
# Dump the normalised emotion shares of every book.
print(em_stand)