In [6]:
# --- Import Libraries ---
import pdfplumber
import docx
import re

# --- Extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. A page that yields no text contributes only its newline.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may hand back None (e.g. image-only pages in some
            # pdfplumber releases); fall back to "" so concatenation cannot
            # raise TypeError.
            page_texts.append(page.extract_text() or "")
    # Single join instead of repeated += keeps the build-up linear.
    return "".join(page_text + "\n" for page_text in page_texts)

# --- Extract text from DOCX ---
def extract_text_from_docx(docx_path):
    """Return the text of all paragraphs of a .docx file, newline-separated.

    Args:
        docx_path: Path of the document, passed straight to ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``doc.paragraphs``, joined with "\\n".
    """
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# --- Clean text ---
def clean_text(text):
    """Collapse every run of whitespace into a single space and trim the ends.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with newlines, tabs and repeated spaces replaced by one space
        each, and without leading or trailing whitespace.
    """
    # r'\s+' already matches newlines, so a separate newline pass is not needed.
    return re.sub(r'\s+', ' ', text).strip()

# --- Example usage ---
# NOTE: absolute, machine-specific location; change PDF_PATH when running elsewhere.
PDF_PATH = r"C:\Users\Potato\anaconda_projects\sample.pdf"

# Keep the raw extraction under its own name instead of overwriting it,
# so both the raw and the cleaned text can be inspected afterwards.
pdf_text_raw = extract_text_from_pdf(PDF_PATH)
pdf_text = clean_text(pdf_text_raw)

print("PDF Extract (first 300 chars):\n", pdf_text[:300])
# now we can read and clean text from a pdf



PDF Extract (first 300 chars):
 Introduction to Data Science Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from data. Applications include: - Machine Learning - Natural Language Processing - Data Visualization Conclusion: Data science i


In [2]:
import os
# Show the kernel's current working directory.
print(os.getcwd())


C:\Users\Potato\anaconda_projects\5ee114ed-4213-46f1-822e-9627a2356a53


In [3]:
# Change the process working directory to this folder.
# NOTE(review): absolute, machine-specific path, and the change affects every
# cell executed afterwards — confirm it is still required.
os.chdir(r"C:\Users\Potato\anaconda_projects")



In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import string

# Fetch the NLTK data packages used below; NLTK reports "already up-to-date"
# when a package is present locally.
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of
# "punkt" when tokenizing — confirm against the installed version.
for nltk_resource in ("punkt", "stopwords"):
    nltk.download(nltk_resource)

def summarize_text(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is scored by summing the document-wide frequency of its
    content words (tokens that are not English stopwords and contain at least
    one letter or digit). The ``num_sentences`` best sentences are returned in
    the order they appear in ``text``.

    Args:
        text: The text to summarise.
        num_sentences: Maximum number of sentences to keep. Values of zero or
            less produce an empty summary.

    Returns:
        The selected sentences joined by single spaces, or "" when there is
        nothing to summarise.
    """
    if not text or num_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))

    def is_content_word(token):
        # Requiring an alphanumeric character also rejects multi-character
        # punctuation tokens ("``", "''", "...", "--") that a substring test
        # against string.punctuation lets through.
        return token not in stop_words and any(ch.isalnum() for ch in token)

    # Frequency of every content word across the whole text.
    freq_table = {}
    for token in word_tokenize(text.lower()):
        if is_content_word(token):
            freq_table[token] = freq_table.get(token, 0) + 1

    # Score by sentence position rather than sentence text, so a sentence that
    # occurs twice does not pool the scores of both occurrences.
    sentence_scores = {}
    for position, sent in enumerate(sentences):
        score = sum(freq_table.get(token, 0) for token in word_tokenize(sent.lower()))
        if score:
            sentence_scores[position] = score

    # Pick the best-scoring positions (ties keep the earlier sentence) ...
    top_positions = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]

    # ... then restore document order so the summary reads naturally.
    return " ".join(sentences[position] for position in sorted(top_positions))

# Example: Summarize the PDF text
summary = summarize_text(pdf_text, num_sentences=3)
print("Summary:\n", summary)



Summary:
 Introduction to Data Science Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from data. Applications include: - Machine Learning - Natural Language Processing - Data Visualization Conclusion: Data science is a growing field with high demand in the job market.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Potato\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Potato\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
