In [1]:
#1st step: Preparing the data

import requests
import string
import pandas as pd

# A Popular History of the Art of Music
book_url = 'https://www.gutenberg.org/cache/epub/20293/pg20293.txt'
response = requests.get(book_url)
book1 = response.text
allowed_chars = string.ascii_letters + string.digits + string.whitespace
book1 = ''.join(c for c in book1 if c in allowed_chars)

# Famous composers and their works
book_url = 'https://www.gutenberg.org/cache/epub/57570/pg57570.txt'
response = requests.get(book_url)
book2 = response.text
allowed_chars = string.ascii_letters + string.digits + string.whitespace
book2 = ''.join(c for c in book2 if c in allowed_chars)

# Chopin and Other Musical Essays
book_url = 'https://www.gutenberg.org/cache/epub/18560/pg18560.txt'
response = requests.get(book_url)
book3 = response.text
allowed_chars = string.ascii_letters + string.digits + string.whitespace
book3 = ''.join(c for c in book3 if c in allowed_chars)

# Life of Mozart
book_url = 'https://www.gutenberg.org/cache/epub/43411/pg43411.txt'
response = requests.get(book_url)
book4 = response.text
allowed_chars = string.ascii_letters + string.digits + string.whitespace
book4 = ''.join(c for c in book4 if c in allowed_chars)

# Story-Lives of Great Musicians
book_url = 'https://www.gutenberg.org/cache/epub/19748/pg19748.txt'
response = requests.get(book_url)
book5 = response.text
allowed_chars = string.ascii_letters + string.digits + string.whitespace
book5 = ''.join(c for c in book5 if c in allowed_chars)


# Create our dataframes
book1_lines = book1.splitlines()

book1_df = pd.DataFrame({
    "line": book1_lines,
    "line_number": list(range(len(book1_lines)))
})

book2_lines = book2.splitlines()

book2_df = pd.DataFrame({
    "line": book2_lines,
    "line_number": list(range(len(book2_lines)))
})

book3_lines = book3.splitlines()

book3_df = pd.DataFrame({
    "line": book3_lines,
    "line_number": list(range(len(book3_lines)))
})

book4_lines = book4.splitlines()

book4_df = pd.DataFrame({
    "line": book4_lines,
    "line_number": list(range(len(book4_lines)))
})

book5_lines = book5.splitlines()

book5_df = pd.DataFrame({
    "line": book5_lines,
    "line_number": list(range(len(book5_lines)))
})

# We’ll want to know which content comes from which book
book1_df = book1_df.assign(book = 'A Popular History of the Art of Music')
book2_df = book2_df.assign(book = 'Famous composers and their works')
book3_df = book3_df.assign(book = 'Chopin and Other Musical Essays')
book4_df = book4_df.assign(book = 'Life of Mozart')
book5_df = book5_df.assign(book = 'Story-Lives of Great Musicians')

# Finally, we concatenate the books into one dataframe
books = [book1_df, book2_df, book3_df, book4_df, book5_df]
music_books_df = pd.concat(books)
music_books_df.head()

Unnamed: 0,line,line_number,book
0,The Project Gutenberg eBook of A Popular Histo...,0,A Popular History of the Art of Music
1,,1,A Popular History of the Art of Music
2,This ebook is for the use of anyone anywhere i...,2,A Popular History of the Art of Music
3,most other parts of the world at no cost and w...,3,A Popular History of the Art of Music
4,whatsoever You may copy it give it away or reu...,4,A Popular History of the Art of Music


In [2]:
# We split the data into words
# We first split the text column into a list of words
music_books_df['word'] = music_books_df['line'].str.split()

# Explode the words column to create a new row for each word (this creates a separate row for each word from the newly created words list)
music_books_df = music_books_df.explode('word')

# Reset the index of the dataframe (we want to index each word now)
music_books_df = music_books_df.reset_index(drop=True)

# For our investigations the line & line_number columns will not be necessary, so we will remove them
music_books_df = music_books_df[['book', 'word']]
music_books_df

Unnamed: 0,book,word
0,A Popular History of the Art of Music,The
1,A Popular History of the Art of Music,Project
2,A Popular History of the Art of Music,Gutenberg
3,A Popular History of the Art of Music,eBook
4,A Popular History of the Art of Music,of
...,...,...
609823,Story-Lives of Great Musicians,about
609824,Story-Lives of Great Musicians,new
609825,Story-Lives of Great Musicians,eBooks
609826,Story-Lives of Great Musicians,


In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

music_books_df = music_books_df[~music_books_df['word'].str.lower().isin(stopwords.words('english'))]
#music_books_df = bronte_books_df[~bronte_books_df['Word 2'].str.lower().isin(stopwords.words('english'))]
music_books_df=music_books_df.dropna()
music_books_df.head(20)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,book,word
1,A Popular History of the Art of Music,Project
2,A Popular History of the Art of Music,Gutenberg
3,A Popular History of the Art of Music,eBook
6,A Popular History of the Art of Music,Popular
7,A Popular History of the Art of Music,History
10,A Popular History of the Art of Music,Art
12,A Popular History of the Art of Music,Music
15,A Popular History of the Art of Music,ebook
19,A Popular History of the Art of Music,use
21,A Popular History of the Art of Music,anyone


In [4]:
# Let's count the occurrences of each word - this is a prerequisite for finding term frequency
count_df = music_books_df.groupby('word')['word'].count() # Group by word column, then only keep the word column and perform the counting

# Let's sort by term frequency
count_df_sorted = count_df.sort_values(ascending=False)

music_books_df.groupby(['word', 'book']).size().sort_values(ascending=False).reset_index(name='count')

Unnamed: 0,word,book,count
0,music,A Popular History of the Art of Music,678
1,p,Life of Mozart,666
2,Footnote,Life of Mozart,627
3,Mozart,Life of Mozart,597
4,music,Famous composers and their works,525
...,...,...,...
61043,invariable,A Popular History of the Art of Music,1
61044,invaluable,Story-Lives of Great Musicians,1
61045,Trio,Life of Mozart,1
61046,invalidity,Story-Lives of Great Musicians,1


In [5]:
count_df = music_books_df.groupby('word').agg({'book': 'first', 'word': 'count'})

count_df = count_df.rename(columns={'word': 'count'})

# Sorting values based on count column
count_df.sort_values('count', ascending=False)
count_df_1 = music_books_df.groupby(['word', 'book']).size().sort_values(ascending=False).reset_index(name='count') # How many appearances each word has in each book

count_df_2 = music_books_df.groupby(['book']).size().sort_values(ascending=False).reset_index(name='count') # How many words each book has


book_words = count_df_1.merge(count_df_2, on='book')

book_words = book_words.rename(columns={'count_x': 'word_appearances_in_book', 'count_y': 'book_total_word_count'}) # Give more meaningful names

book_words=book_words.assign(tf=book_words['word_appearances_in_book'] / book_words['book_total_word_count'])

book_words['rank'] = book_words.groupby('book')['word_appearances_in_book'].rank(ascending=False, method='dense')

import math
N=4

book_words = book_words.assign(idf=book_words.groupby(['word'])['book'].transform(lambda x:  math.log(N/len(x))))

book_words=book_words.assign(tf_idf=book_words['idf']*book_words['tf'])

book_words.tail()

Unnamed: 0,word,book,word_appearances_in_book,book_total_word_count,tf,rank,idf,tf_idf
61043,Triumph,Story-Lives of Great Musicians,1,52281,1.9e-05,119.0,0.287682,6e-06
61044,Triumphal,Story-Lives of Great Musicians,1,52281,1.9e-05,119.0,1.386294,2.7e-05
61045,Trieste,Story-Lives of Great Musicians,1,52281,1.9e-05,119.0,0.693147,1.3e-05
61046,invaluable,Story-Lives of Great Musicians,1,52281,1.9e-05,119.0,0.693147,1.3e-05
61047,invalidity,Story-Lives of Great Musicians,1,52281,1.9e-05,119.0,-0.223144,-4e-06


In [8]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
text = "I love this movie!"
sentiment = sia.polarity_scores(text)
print(sentiment)

{'neg': 0.0, 'neu': 0.308, 'pos': 0.692, 'compound': 0.6696}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [12]:
import re
book1_lines = book1.splitlines()

book1_df = pd.DataFrame({
    "line": book1_lines,
    "line_number": list(range(len(book1_lines)))
})

# We create a helper function to identify the chapter of each line
def line_is_chapter(dataframe):
    chapter_list = []
    curr_chapter = 0
    for index, row in dataframe.iterrows():
        if re.search("^chapter [\\divxlc]*$", row['line'], re.IGNORECASE):
            curr_chapter += 1
        chapter_list.append(curr_chapter)
    return chapter_list

curr_chapter = 0
# We add the chapter to our dataframe
book1_df = book1_df.assign(chapter = line_is_chapter(book1_df))
book1_df.head(1110)

# We will now transform our dataframe into one that has one word per row

# We first split the text column into a list of words
book1_df['word'] = book1_df['line'].str.split()

# Explode the words column to create a new row for each word (this creates a separate row for each word from the newly created words list)
book1_df = book1_df.explode('word')

# Reset the index of the dataframe (we want to index each word now)
book1_df = book1_df.reset_index(drop=True)
book1_df[10000:10010]

Unnamed: 0,line,line_number,chapter,word
10000,would justify were not the range of subjects i...,1553,3,subjects
10001,would justify were not the range of subjects i...,1553,3,indispensable
10002,would justify were not the range of subjects i...,1553,3,to
10003,would justify were not the range of subjects i...,1553,3,even
10004,would justify were not the range of subjects i...,1553,3,a
10005,summarized treatment of musical history so wid...,1554,3,summarized
10006,summarized treatment of musical history so wid...,1554,3,treatment
10007,summarized treatment of musical history so wid...,1554,3,of
10008,summarized treatment of musical history so wid...,1554,3,musical
10009,summarized treatment of musical history so wid...,1554,3,history


In [13]:
# Create list of nltk_sentiments
list_nltk_sentiments = ['positive' if sia.polarity_scores(str(word))['compound'] > 0 else 'negative' if sia.polarity_scores(str(word))['compound'] < 0 else 'neutral' for word in book1_df['word']]

In [14]:
#Perform sentiment analysis-NLTK VADER
!pip install plotly


import numpy as np
import plotly.graph_objects as go

# Convert sentiment values to numeric values
book1_df = book1_df.assign(nltk_sentiment = list_nltk_sentiments)

book1_df['sentiment_value'] = book1_df['nltk_sentiment'].map({'positive': 1, 'negative': -1, 'neutral': 0})
non_neutral_df = book1_df[book1_df['sentiment_value'] != 0]

# Calculate the average sentiment per chapter
avg_sentiment_per_chapter = non_neutral_df.groupby('chapter')['sentiment_value'].mean().reset_index()

# Create the interactive line plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=avg_sentiment_per_chapter['chapter'],
                         y=avg_sentiment_per_chapter['sentiment_value'],
                         mode='lines+markers',
                         name='Average Sentiment'))

fig.update_layout(title='Average Sentiment per Chapter',
                  xaxis_title='Chapter',
                  yaxis_title='Average Sentiment',
                  hovermode='x')

fig.show()

