In [1]:
import fitz 
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import spacy
from textblob import TextBlob

In [2]:
# Fetch the NLTK resources that the tokenizers and the stop-word filter rely on.
# 'punkt_tab' is the tokenizer data that NLTK 3.9+ looks up for
# sent_tokenize/word_tokenize; 'punkt' is kept so older NLTK releases keep
# working. Packages that are already installed are reported as up-to-date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def extract_text_from_pdf(pdf_path):
    """Read the text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A list of ``(page_number, page_text)`` tuples in page order.
        ``page_number`` is 1-based and ``page_text`` is whatever
        ``page.get_text()`` returns for that page.
    """
    # The context manager closes the document (and its underlying file) once
    # all pages are read, even if loading a page raises.
    with fitz.open(pdf_path) as document:
        return [
            (page_num + 1, document.load_page(page_num).get_text())
            for page_num in range(len(document))
        ]

# Stop-word sets keyed by language, filled lazily by _stop_words().
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Tokenize ``text``, lowercase the tokens and drop English stop words.

    Args:
        text: The raw string to tokenize with ``word_tokenize``.

    Returns:
        A list of lowercased tokens that are not in NLTK's English stop-word
        list. Nothing else is filtered, so punctuation tokens are kept.
    """
    # This function is applied once per sentence, so the stop-word set is
    # cached instead of being rebuilt from the corpus on every call.
    stop_words = _stop_words()
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered_tokens if token not in stop_words]

In [4]:
# The seven books, in reading order; the list position gives the book number.
pdf_files = [f'Harry Potter/HP{i}.pdf' for i in range(1, 8)]

# One record per sentence, tagged with the book and the 1-based page it came from.
rows = [
    {'Book_Number': book_number, 'Page_Number': page_num, 'Sentence_Text': sentence}
    for book_number, pdf_file in enumerate(pdf_files, start=1)
    for page_num, page_text in extract_text_from_pdf(pdf_file)
    for sentence in sent_tokenize(page_text)
]

df = pd.DataFrame(rows)

# Token list (lowercased, stop words removed) for every sentence.
df['Processed_Text'] = df['Sentence_Text'].apply(preprocess_text)

df.head()

Unnamed: 0,Book_Number,Page_Number,Sentence_Text,Processed_Text
0,1,5,\n \n \n \n \n \n \n \n \nHarry Potter \nAnd ...,"[harry, potter, sorcerer, ’, stone]"
1,1,6,\n \n \n \n \n \n \n \n \nALSO BY J. K. ROWLI...,"[also, j., k., rowling, harry, potter, sorcere..."
2,1,7,\nHarry Potter \nand the Sorcerer’s Stone \n ...,"[harry, potter, sorcerer, ’, stone, j., k., ro..."
3,1,8,"For Jessica, who loves stories \nfor Anne, who...","[jessica, ,, loves, stories, anne, ,, loved, ;..."
4,1,8,Text copyright © 1997 by J.K. Rowling \nIllust...,"[text, copyright, ©, 1997, j.k., rowling, illu..."


In [5]:
# Persist the sentence-level frame (row index included) under flask/data.
df.to_csv(path_or_buf='flask/data/processed_text.csv')

In [6]:
# Load spaCy's 'en_core_web_sm' pipeline once; `nlp` is called by extract_entities below.
nlp = spacy.load('en_core_web_sm')

# Optional shortcut, left disabled: reload a previously saved frame instead of
# re-parsing the PDFs.
# NOTE(review): this path does not match the 'flask/data/processed_text.csv'
# location the frame is written to earlier in the notebook — confirm which one
# is intended before enabling. Presumably list columns come back as strings
# after a CSV round trip; verify before relying on them.
# df = pd.read_csv('processed_text.csv', index_col=0)

In [7]:
def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and list the named entities it finds.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in
        ``doc.ents``, in the order spaCy reports them.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Attach the list of (text, label) entity tuples found in each sentence.
df['Entities'] = df['Sentence_Text'].map(extract_entities)

df.head()

Unnamed: 0,Book_Number,Page_Number,Sentence_Text,Processed_Text,Entities
0,1,5,\n \n \n \n \n \n \n \n \nHarry Potter \nAnd ...,"[harry, potter, sorcerer, ’, stone]","[(Harry Potter, PERSON), (Sorcerer’s Stone, ORG)]"
1,1,6,\n \n \n \n \n \n \n \n \nALSO BY J. K. ROWLI...,"[also, j., k., rowling, harry, potter, sorcere...","[(J. K. ROWLING, PERSON), (Harry Potter, PERSO..."
2,1,7,\nHarry Potter \nand the Sorcerer’s Stone \n ...,"[harry, potter, sorcerer, ’, stone, j., k., ro...","[(Harry Potter, PERSON), (the Sorcerer’s Stone..."
3,1,8,"For Jessica, who loves stories \nfor Anne, who...","[jessica, ,, loves, stories, anne, ,, loved, ;...","[(Jessica, PRODUCT), (Anne, PERSON), (Di, ORG)]"
4,1,8,Text copyright © 1997 by J.K. Rowling \nIllust...,"[text, copyright, ©, 1997, j.k., rowling, illu...","[(1997, DATE), (J.K. Rowling, ORG), (Mary Gran..."


In [8]:
def get_sentiment(text):
    """Score ``text`` with TextBlob.

    Returns:
        A ``(polarity, subjectivity)`` tuple taken from ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

# Keep the raw (polarity, subjectivity) tuple for every sentence ...
df['Sentiment'] = df['Sentence_Text'].map(get_sentiment)

# ... and also unpack it into two separate columns.
sentiment_pairs = list(df['Sentiment'])
df[['Polarity', 'Subjectivity']] = pd.DataFrame(sentiment_pairs, index=df.index)


df.head()

Unnamed: 0,Book_Number,Page_Number,Sentence_Text,Processed_Text,Entities,Sentiment,Polarity,Subjectivity
0,1,5,\n \n \n \n \n \n \n \n \nHarry Potter \nAnd ...,"[harry, potter, sorcerer, ’, stone]","[(Harry Potter, PERSON), (Sorcerer’s Stone, ORG)]","(0.0, 0.0)",0.0,0.0
1,1,6,\n \n \n \n \n \n \n \n \nALSO BY J. K. ROWLI...,"[also, j., k., rowling, harry, potter, sorcere...","[(J. K. ROWLING, PERSON), (Harry Potter, PERSO...","(0.0, 0.0)",0.0,0.0
2,1,7,\nHarry Potter \nand the Sorcerer’s Stone \n ...,"[harry, potter, sorcerer, ’, stone, j., k., ro...","[(Harry Potter, PERSON), (the Sorcerer’s Stone...","(0.0, 0.0)",0.0,0.0
3,1,8,"For Jessica, who loves stories \nfor Anne, who...","[jessica, ,, loves, stories, anne, ,, loved, ;...","[(Jessica, PRODUCT), (Anne, PERSON), (Di, ORG)]","(0.475, 0.5666666666666667)",0.475,0.566667
4,1,8,Text copyright © 1997 by J.K. Rowling \nIllust...,"[text, copyright, ©, 1997, j.k., rowling, illu...","[(1997, DATE), (J.K. Rowling, ORG), (Mary Gran...","(0.0, 0.0)",0.0,0.0


In [9]:
# Expand the per-sentence entity lists into one row per entity; rows that come
# from the same sentence keep that sentence's index label.
df_exploded = df.explode(column='Entities')
df_exploded.head()

Unnamed: 0,Book_Number,Page_Number,Sentence_Text,Processed_Text,Entities,Sentiment,Polarity,Subjectivity
0,1,5,\n \n \n \n \n \n \n \n \nHarry Potter \nAnd ...,"[harry, potter, sorcerer, ’, stone]","(Harry Potter, PERSON)","(0.0, 0.0)",0.0,0.0
0,1,5,\n \n \n \n \n \n \n \n \nHarry Potter \nAnd ...,"[harry, potter, sorcerer, ’, stone]","(Sorcerer’s Stone, ORG)","(0.0, 0.0)",0.0,0.0
1,1,6,\n \n \n \n \n \n \n \n \nALSO BY J. K. ROWLI...,"[also, j., k., rowling, harry, potter, sorcere...","(J. K. ROWLING, PERSON)","(0.0, 0.0)",0.0,0.0
1,1,6,\n \n \n \n \n \n \n \n \nALSO BY J. K. ROWLI...,"[also, j., k., rowling, harry, potter, sorcere...","(Harry Potter, PERSON)","(0.0, 0.0)",0.0,0.0
1,1,6,\n \n \n \n \n \n \n \n \nALSO BY J. K. ROWLI...,"[also, j., k., rowling, harry, potter, sorcere...","(Harry Potter, PERSON)","(0.0, 0.0)",0.0,0.0


In [10]:
# Persist the exploded (one entity per row) frame, row index included.
df_exploded.to_csv(path_or_buf='flask/data/final_data.csv')

In [11]:
# Slimmer copy of the exploded frame without the raw sentence and token columns.
text_columns = ['Sentence_Text', 'Processed_Text']
df_without_text = df_exploded.drop(columns=text_columns)
df_without_text.head()

Unnamed: 0,Book_Number,Page_Number,Entities,Sentiment,Polarity,Subjectivity
0,1,5,"(Harry Potter, PERSON)","(0.0, 0.0)",0.0,0.0
0,1,5,"(Sorcerer’s Stone, ORG)","(0.0, 0.0)",0.0,0.0
1,1,6,"(J. K. ROWLING, PERSON)","(0.0, 0.0)",0.0,0.0
1,1,6,"(Harry Potter, PERSON)","(0.0, 0.0)",0.0,0.0
1,1,6,"(Harry Potter, PERSON)","(0.0, 0.0)",0.0,0.0


In [12]:
# Persist the text-free frame, row index included.
df_without_text.to_csv(path_or_buf='flask/data/final_data_without_text.csv')

: 