# Importing Modules

In [28]:
import nltk
import re
import pandas as pd
import csv
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from nltk import word_tokenize, sent_tokenize
import json

In [29]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [30]:
# install the pdfminer.six library
!pip install pdfminer.six




# Defined function to extract the text from PDF

In [31]:
from pdfminer.high_level import extract_text
import os

def extract_text_from_pdf(pdf_path, output_txt_path):
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"The file {pdf_path} does not exist.")

    # Extract text from the PDF
    text = extract_text(pdf_path)

    # Write the extracted text to a text file
    with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

    print(f"Text extracted from {pdf_path} and saved to {output_txt_path}")


pdf_path = 'book.pdf'  # Path to your PDF file
output_txt_path = 'output.txt'  # Path where the text will be saved

extract_text_from_pdf(pdf_path, output_txt_path)




Text extracted from book.pdf and saved to output.txt


In [32]:
def delete_text_above_line(file_path, line_number):
    # Check if the line_number is valid
    if line_number <= 0:
        raise ValueError("Line number must be greater than 0.")

    # Read the text file and get all lines
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()  # Read all lines into a list

    # Determine the lines to keep
    if line_number > len(lines):
        raise ValueError("Line number exceeds the total number of lines in the file.")

    # Retain only the lines from the specified line number onward
    new_lines = lines[line_number - 1:]

    # Write the remaining lines back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(new_lines)

    print(f"All text above line {line_number} has been deleted from {file_path}.")

file_path = 'output.txt'  # Path to your text file
line_number = 410  # Line number from where you want to keep the text

delete_text_above_line(file_path, line_number)


All text above line 410 has been deleted from output.txt.


# Definied the function for chapter name extrction and paragraph extraction along with text analysis

In [33]:
# Define the text analysis function that takes a piece of text as input
def text_analysis(text):
    # Tokenize the text into words and sentences
    word_list = word_tokenize(text)
    sent_list = sent_tokenize(text)

    # Count the number of sentences
    sent_count = len(sent_list)

    # Define a list of punctuation marks
    punct = [".", "?", "!"]

    # Remove punctuation marks from the word list
    n = len(word_list)
    word_list = [word for word in word_list if word not in punct]

    # Count the number of words after removing punctuationS
    word_count = len(word_list)

    # Perform Part-of-Speech (POS) tagging on the word list
    pos_list = nltk.pos_tag(word_list, tagset="universal")

    # Create a dictionary to store the counts of different POS tags
    pos_dict = {}
    for i in pos_list:
        if i[1] not in pos_dict:
            pos_dict[i[1]] = 1
        else:
            pos_dict[i[1]] += 1

    # Convert the POS dictionary to JSON format
    pos_json = json.dumps(pos_dict)

    # Get English stopwords from NLTK corpus
    stop_word = nltk.corpus.stopwords.words("english")

    # Count the number of stopwords in the word list
    stop_count = 0
    for i in word_list:
        if i in stop_word:
            stop_count += 1

    # Return a dictionary containing various text analysis metrics
    return {
        "Word_count": word_count,
        "Sentance_Count": sent_count,
        "Stop_Words": stop_count,
        "Post_INFO": pos_json,
    }

In [34]:
# Function to extract chapters from text
def extract_chapters(text):
    lines = text.split('\n')
    chapters = {}

    current_chapter = None
    for line in lines:
        if chapter_pattern.match(line.strip()):
            # If this line matches a chapter name, start a new chapter
            current_chapter = line.strip()
            chapters[current_chapter] = []
        elif current_chapter:
            # If there's a current chapter, add this line to its content
            chapters[current_chapter].append(line.strip())

    # Remove empty lines from each chapter
    for chapter in chapters:
        chapters[chapter] = [line for line in chapters[chapter] if line]

    return chapters

In [35]:
# Function to extract paragraphs from chapter content
def extract_paragraphs(chapter_content):
    paragraphs = []
    current_paragraph = []

    def is_paragraph_break(line):
        sentence_ending_delimiters = ['.', '!', '?']
        return any(line.strip().endswith(d) for d in sentence_ending_delimiters)

    for line in chapter_content:
        if line:
            current_paragraph.append(line)
            if is_paragraph_break(line):
                # If this is a paragraph break, join the lines to form a paragraph
                paragraphs.append(" ".join(current_paragraph))
                current_paragraph = []
        else:
            continue

    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    return paragraphs


# Function to generate CSV file

In [36]:
# Main function to parse the text and generate CSV data
def parse_book_structure(txt_file_path, csv_output_path):
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    chapters = extract_chapters(text)  # Extract the chapters

    data = []
    serial_number = 0

    for chapter_name, chapter_content in chapters.items():
        paragraphs = extract_paragraphs(chapter_content)  # Extract paragraphs within this chapter
        paragraph_counter = 0  # Reset the paragraph count for each chapter

        for paragraph in paragraphs:
            paragraph_counter += 1
            serial_number += 1
            para_analysis = text_analysis(paragraph)

            # Store the information in a dictionary
            data.append({
                'serial_no': serial_number,
                'chapter_name': chapter_name,
                'paragraph_number': paragraph_counter,
                'paragraph': paragraph,
                'Word_Count': para_analysis['Word_count'],
                'Sentance_Count': para_analysis['Sentance_Count'],
                'Stop_Words': para_analysis['Stop_Words'],
                'Post_INFO': para_analysis['Post_INFO']
            })

    # Create a DataFrame and write to CSV
    df = pd.DataFrame(data)
    df.to_csv(csv_output_path, index=False, quoting=csv.QUOTE_ALL)

    print(f"Data extracted and saved to {csv_output_path}")


In [37]:
# List of chapter names to identify
chapter_names = [
    "One", "Two", "Three", "Four", "Five",
    "Six", "Seven", "Eight", "Nine", "Ten",
    "Eleven", "Twelve", "Thirteen", "Fourteen", "Fifteen",
    "Sixteen", "Seventeen", "Eighteen", "Nineteen", "Twenty",
    "Twenty One", "Epilogue", "Epilogue II"
]

# Regular expression to identify specific chapter names
chapter_pattern = re.compile(r'^\s*({})\s*$'.format('|'.join(chapter_names)), re.IGNORECASE)


txt_file_path = 'output.txt'  # Path to your input text file
csv_output_path = 'Curated_Book_data_anlysis.csv'  # Path to your output CSV file

parse_book_structure(txt_file_path, csv_output_path)


Data extracted and saved to Curated_Book_data_anlysis.csv


# Data Matrices plots using Plotly

In [38]:
df = pd.read_csv("Curated_Book_data_anlysis.csv")

In [39]:
df

Unnamed: 0,serial_no,chapter_name,paragraph_number,paragraph,Word_Count,Sentance_Count,Stop_Words,Post_INFO
0,1,One,1,"India vs South Africa 4th ODI, Vadodra 17 Marc...",44,3,15,"{""NOUN"": 15, ""NUM"": 4, ""."": 3, ""ADP"": 3, ""ADV""..."
1,2,One,2,`Huh?' I said. We were in Ishaan's house Ishaa...,60,8,20,"{""."": 7, ""NOUN"": 15, ""PRON"": 5, ""VERB"": 12, ""A..."
2,3,One,3,Nobody moves for the next five overs.' I look...,70,8,23,"{""NOUN"": 22, ""ADP"": 9, ""DET"": 6, ""ADJ"": 4, ""NU..."
3,4,One,4,"'The khakra's crispy,' Omi said. Ishaan g...",38,3,10,"{""PRT"": 2, ""NOUN"": 11, ""."": 3, ""VERB"": 5, ""ADP..."
4,5,One,5,The crowd clapped as Tendulkar made his exit. ...,39,4,12,"{""DET"": 2, ""NOUN"": 14, ""VERB"": 5, ""ADP"": 5, ""P..."
...,...,...,...,...,...,...,...,...
2330,2331,Epilogue II,5,The next morning I woke up early. I had an SMS...,13,2,4,"{""DET"": 2, ""ADJ"": 2, ""NOUN"": 3, ""PRON"": 2, ""VE..."
2331,2332,Epilogue II,6,"doc approves ali 2 play, fingers X. pls pray, ...",60,5,16,"{""NOUN"": 18, ""VERB"": 10, ""ADV"": 2, ""NUM"": 6, ""..."
2332,2333,Epilogue II,7,ish bowls 2 ali.,4,1,0,"{""ADJ"": 1, ""VERB"": 1, ""NUM"": 1, ""NOUN"": 1}"
2333,2334,Epilogue II,8,ali moves fwd & turns.,5,1,0,"{""NOUN"": 3, ""VERB"": 1, ""CONJ"": 1}"


In [40]:
df.head(1)

Unnamed: 0,serial_no,chapter_name,paragraph_number,paragraph,Word_Count,Sentance_Count,Stop_Words,Post_INFO
0,1,One,1,"India vs South Africa 4th ODI, Vadodra 17 Marc...",44,3,15,"{""NOUN"": 15, ""NUM"": 4, ""."": 3, ""ADP"": 3, ""ADV""..."


In [41]:
import pandas as pd
import json

# Convert POS tag JSON strings to dictionaries
df['Post_INFO'] = df['Post_INFO'].apply(lambda x: json.loads(x))

# Display the first few rows to ensure data is loaded and parsed correctly
df.head()


Unnamed: 0,serial_no,chapter_name,paragraph_number,paragraph,Word_Count,Sentance_Count,Stop_Words,Post_INFO
0,1,One,1,"India vs South Africa 4th ODI, Vadodra 17 Marc...",44,3,15,"{'NOUN': 15, 'NUM': 4, '.': 3, 'ADP': 3, 'ADV'..."
1,2,One,2,`Huh?' I said. We were in Ishaan's house Ishaa...,60,8,20,"{'.': 7, 'NOUN': 15, 'PRON': 5, 'VERB': 12, 'A..."
2,3,One,3,Nobody moves for the next five overs.' I look...,70,8,23,"{'NOUN': 22, 'ADP': 9, 'DET': 6, 'ADJ': 4, 'NU..."
3,4,One,4,"'The khakra's crispy,' Omi said. Ishaan g...",38,3,10,"{'PRT': 2, 'NOUN': 11, '.': 3, 'VERB': 5, 'ADP..."
4,5,One,5,The crowd clapped as Tendulkar made his exit. ...,39,4,12,"{'DET': 2, 'NOUN': 14, 'VERB': 5, 'ADP': 5, 'P..."


In [42]:
import plotly.express as px
import pandas as pd


fig = px.scatter(
    df,
    x='Word_Count',
    y='Sentance_Count',
    color='chapter_name',
    title='Word Count vs. Sentence Count by Chapter',
    labels={'Word_Count': 'Word Count', 'Sentence_Count': 'Sentence Count'},
    hover_data=['paragraph_number']  # Display paragraph numbers when hovering
)

fig.update_traces(
    text=df['paragraph_number'],  # Show paragraph numbers as text
    textposition='top center'  # Position text
)

fig.show()


In [43]:
fig = px.box(df, y='Stop_Words', color='chapter_name',
             title='Distribution of Stop Word Count by Chapter')
fig.show()


In [44]:
from collections import Counter

# Aggregate all POS tags from all paragraphs
pos_counter = Counter()
for pos_dict in df['Post_INFO']:
    pos_counter.update(pos_dict.keys())

# Create a DataFrame for the bar chart
pos_df = pd.DataFrame.from_dict(pos_counter, orient='index', columns=['count'])
pos_df = pos_df.reset_index().rename(columns={'index': 'POS Tag'})

fig = px.bar(pos_df, x='POS Tag', y='count', title='POS Tag Frequency')
fig.show()


In [47]:
import plotly.express as px
import pandas as pd

# Group by chapter name and count unique paragraph numbers
paragraph_counts = df.groupby('chapter_name')['paragraph_number'].nunique().reset_index()

# Create the bar plot to show the number of paragraphs in each chapter
fig = px.bar(paragraph_counts, x='chapter_name', y='paragraph_number',
             title='Number of Paragraphs in Each Chapter',
             labels={'chapter_name': 'Chapter Name', 'paragraph_number': 'Number of Paragraphs'})
fig.show()
