In [None]:
# Install required libraries
!pip install PyPDF2
!pip install transformers
!pip install openpyxl

# Step 1: Import libraries
from transformers import pipeline
from PyPDF2 import PdfReader
import pandas as pd
import re


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Step 2: Define a simple sentence tokenizer
def simple_sent_tokenize(text):
    """Split ``text`` into sentences on '.', '!' and '?'.

    The terminating punctuation is dropped, surrounding whitespace is stripped
    and empty fragments are discarded. A period that directly follows one of
    the honorifics "Mr", "Mrs", "Ms", "Dr" or "Prof" is NOT treated as a
    sentence end, so "Mrs. Dalloway" stays inside a single sentence instead of
    being cut into "Mrs" and "Dalloway ...".

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty, stripped sentence strings (possibly empty).
    """
    # Each negative lookbehind is fixed-width, as required by the ``re``
    # module; ``\b`` keeps words that merely end in these letters splittable.
    boundary = (
        r'(?<!\bMr)(?<!\bMrs)(?<!\bMs)(?<!\bDr)(?<!\bProf)'
        r'[.!?]'
    )
    fragments = re.split(boundary, text)
    return [fragment.strip() for fragment in fragments if fragment.strip()]

# Step 3: Extract text from a PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Pages are joined with a newline so that the last word of one page is not
    fused with the first word of the next.

    Args:
        file_path: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        The concatenated page texts; an empty string if no page yields text.
    """
    reader = PdfReader(file_path)
    # ``or ""`` is defensive: it keeps the join safe should a page yield no
    # text object at all (e.g. an image-only page) — TODO confirm whether the
    # installed PyPDF2 version can actually return None here.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

In [None]:
# Step 4: Upload PDF files (uses the Colab file picker, so this cell is Colab-only)
from google.colab import files
print("Upload Mrs. Dalloway PDF:")
mrs_dalloway_file = files.upload()
# The upload result is a mapping whose keys are used below as the file path.
# If nothing was uploaded (e.g. the dialog was cancelled) the mapping is empty,
# so fail with a clear message instead of an opaque IndexError.
if not mrs_dalloway_file:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")
# Only the first uploaded file is analysed.
mrs_dalloway_path = list(mrs_dalloway_file.keys())[0]



Upload Mrs. Dalloway PDF:


Saving White_Teeth.pdf to White_Teeth.pdf


In [None]:
# Step 5: Extract text from PDFs
mrs_dalloway_text = extract_text_from_pdf(mrs_dalloway_path)


# Step 6: Initialize sentiment analysis pipeline
# The model and revision are pinned to exactly what the un-parameterised call
# resolved to when this notebook was last run, so results stay reproducible
# even if the library's default checkpoint changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [None]:
# Step 7: Define custom sentiment analysis function with text cleaning
def clean_text(text):
    """Reduce ``text`` to printable ASCII with single-space word separation.

    Every run of whitespace (newlines, tabs, ...) is first collapsed to one
    space so that words separated only by a line break are not glued together.
    Then every character outside the printable ASCII range 0x20-0x7E is
    removed. Note that this drops control characters AND all non-ASCII
    characters (accented letters, curly quotes, dashes, ...).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing spaces stripped.
    """
    # Whitespace must be normalised before the strip below, which would
    # otherwise delete '\n' and '\t' without leaving a separator behind.
    spaced = re.sub(r'\s+', ' ', text)
    return re.sub(r'[^\x20-\x7E]', '', spaced).strip()

def custom_sentiment_analysis(text, max_sentences=500):
    """Score the leading sentences of ``text`` with ``sentiment_pipeline``.

    The text is split with ``simple_sent_tokenize``; each of the first
    ``max_sentences`` sentences is cleaned with ``clean_text`` and classified.
    Sentences that are empty after cleaning are skipped.

    Args:
        text: The full text to analyse.
        max_sentences: Upper bound on how many sentences are considered, to
            keep the run time manageable. Pass ``None`` to analyse all of them.

    Returns:
        A list of dicts with the keys "Sentence", "Negative Sentiment" and
        "Positive Sentiment". The score reported by the pipeline is stored
        under the key matching the predicted label; the other key is set to 0
        (an approximation, not the complementary probability).
    """
    results = []
    for sentence in simple_sent_tokenize(text)[:max_sentences]:
        clean_sentence = clean_text(sentence)
        if not clean_sentence.strip():
            # Nothing printable survived cleaning; scoring it would only add
            # a meaningless row to the output.
            continue
        # truncation=True stops an unusually long "sentence" from exceeding
        # the model's maximum input length and aborting the whole run.
        prediction = sentiment_pipeline(clean_sentence, truncation=True)[0]
        label = prediction["label"]
        score = prediction["score"]
        results.append({
            "Sentence": clean_sentence,
            "Negative Sentiment": score if "NEGATIVE" in label else 0,
            "Positive Sentiment": score if "POSITIVE" in label else 0,
        })
    return results

# Step 8: Score the extracted text sentence by sentence
mrs_dalloway_analysis = custom_sentiment_analysis(mrs_dalloway_text)

# Step 9: Load the per-sentence records into a DataFrame
mrs_dalloway_df = pd.DataFrame(mrs_dalloway_analysis)


# Step 10: Write the detailed results to a one-sheet Excel workbook
output_xlsx = "Sentiment_Analysis_MrsDalloway_WhiteTeeth.xlsx"
with pd.ExcelWriter(output_xlsx, engine="openpyxl") as writer:
    mrs_dalloway_df.to_excel(
        writer,
        sheet_name="Mrs. Dalloway Analysis",
        index=False,
    )

print(f"Analysis results have been saved to {output_xlsx}.")




Analysis results have been saved to Sentiment_Analysis_MrsDalloway_WhiteTeeth.xlsx.
