In [9]:
import PyPDF2
import os
import re
from googletrans import Translator
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer


In [10]:
# Function to extract text from a single PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Works with both the current PyPDF2 API (``PdfReader``, ``.pages``,
    ``extract_text``) and the legacy one (``PdfFileReader``, ``numPages``,
    ``getPage``, ``extractText``), whichever the installed release provides.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A single string with the text of all pages, in page order. Pages
        that yield no text contribute an empty string.
    """
    with open(pdf_path, 'rb') as file:
        if hasattr(PyPDF2, 'PdfReader'):
            # Current API: the legacy names below were removed in PyPDF2 3.0.
            reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() or "" for page in reader.pages]
        else:
            # Legacy API, kept so older installations keep working.
            reader = PyPDF2.PdfFileReader(file)
            page_texts = [
                reader.getPage(index).extractText() or ""
                for index in range(reader.numPages)
            ]
    # Join once instead of growing a string page by page.
    return "".join(page_texts)

# Helper: break long text into request-sized pieces
def _split_text_for_translation(text, limit):
    """Split ``text`` into pieces of at most ``limit`` characters.

    Text that already fits is returned untouched as a single piece. Longer
    text is broken between whitespace-separated words (so whitespace runs
    are collapsed to single spaces in that case); a single word longer than
    ``limit`` is hard-split.
    """
    if len(text) <= limit:
        return [text]

    chunks = []
    current = ""
    for word in text.split():
        # A token that can never fit on its own is cut into limit-sized slices.
        while len(word) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:limit])
            word = word[limit:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) > limit:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


# Function to translate text from Russian to English
def translate_text_to_english(text, source_language='ru', max_chars=5000):
    """Translate ``text`` into English using googletrans.

    Args:
        text: The text to translate. ``None``, empty, or whitespace-only
            input returns ``""`` without contacting the translation service.
        source_language: Language code of ``text`` (defaults to Russian).
        max_chars: Maximum number of characters sent per request. Longer
            text is translated piece by piece and the results are joined
            with single spaces. The default stays well inside the
            per-request cap documented by googletrans.

    Returns:
        The English translation as a string.
    """
    # Nothing to translate (e.g. a scanned PDF with no extractable text).
    if not text or not text.strip():
        return ""

    translator = Translator()
    translated_parts = [
        translator.translate(chunk, src=source_language, dest='en').text
        for chunk in _split_text_for_translation(text, max_chars)
    ]
    return " ".join(translated_parts)

# Function to normalise whitespace in the text
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is not stripped; each such run simply
    becomes one space like any other.

    Args:
        text: The string to clean up.

    Returns:
        The string with whitespace runs replaced by single spaces.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [11]:
# Sentiment analysis function
def analyze_sentiment(text):
    """Classify ``text`` as Positive, Negative or Neutral using VADER.

    Args:
        text: English text to score.

    Returns:
        A ``(label, compound_score)`` tuple, where ``label`` is
        ``"Positive"`` for a compound score >= 0.05, ``"Negative"`` for a
        score <= -0.05 and ``"Neutral"`` otherwise.
    """
    try:
        sia = SentimentIntensityAnalyzer()
    except LookupError:
        # The analyzer needs NLTK's 'vader_lexicon' resource; on a fresh
        # environment it is absent, so fetch it once and retry.
        nltk.download('vader_lexicon', quiet=True)
        sia = SentimentIntensityAnalyzer()

    compound_score = sia.polarity_scores(text)['compound']

    if compound_score >= 0.05:
        return "Positive", compound_score
    elif compound_score <= -0.05:
        return "Negative", compound_score
    else:
        return "Neutral", compound_score


# Stub reserved for further analysis steps
def perform_additional_analysis(texts):
    """Placeholder for additional analysis techniques; currently a no-op.

    Args:
        texts: Accepted for the future implementation but not used yet.

    Returns:
        None.
    """
    return None

# Function to read and process all PDFs in a directory
def read_pdfs_from_directory(directory):
    """Extract and translate the text of every PDF found in ``directory``.

    Files are processed in sorted filename order so the returned list (and
    anything printed from it) is the same on every run; ``os.listdir``
    alone gives no ordering guarantee. The ``.pdf`` extension is matched
    case-insensitively, so ``REPORT.PDF`` is picked up as well.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list with one translated English string per PDF, in sorted
        filename order.
    """
    pdf_names = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith('.pdf')
    )

    texts = []
    for filename in pdf_names:
        path = os.path.join(directory, filename)
        russian_text = extract_text_from_pdf(path)
        texts.append(translate_text_to_english(russian_text))
    return texts

In [None]:
# Run the pipeline: load and translate the PDFs, normalise whitespace, score sentiment
directory_path = "/home//dev/dev/research/GeospatialAnalysis/library"
texts = read_pdfs_from_directory(directory_path)
processed_texts = list(map(preprocess_text, texts))
sentiments = list(map(analyze_sentiment, processed_texts))

# Report the label and compound score computed for each document
for sentiment in sentiments:
    label, compound = sentiment
    print(f"Sentiment: {label}, Compound Score: {compound}")