# Using Stop Words

In [1]:
import pandas as pd
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

len(STOP_WORDS)

326

In [2]:
nlp = spacy.load("en_core_web_sm")
with open('Ch1-LEC-1_Introduction.txt', "r") as file:
    text = file.read()

# Calculate and print the length of the original text
original_text_length = len(text)
print("Original Text Length:", original_text_length)

Original Text Length: 16015


In [3]:
def preprocess(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Remove stop words
    no_stop_words = [token.text for token in doc if not token.is_stop]

    # Join the tokens back into a string
    preprocessed_text = " ".join(no_stop_words)

    return preprocessed_text

# Preprocess the text
preprocessed_text = preprocess(text)

# Calculate and print the length of the preprocessed text
preprocessed_text_length = len(preprocessed_text)
print("Preprocessed Text Length:", preprocessed_text_length)

Preprocessed Text Length: 13926


# Using Regex

In [4]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.0.7-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: reportlab
Successfully installed reportlab-4.0.7


In [5]:
import re
from reportlab.pdfgen import canvas

def clean_and_minimize(text):
    # Define a regex pattern to remove non-alphanumeric characters and extra spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra spaces
    return cleaned_text

# Read the original text
with open('Ch1-LEC-1_Introduction.txt', "r") as file:
    original_text = file.read()

# Calculate and print the length of the original text
original_text_length = len(original_text)
print("Original Text Length:", original_text_length)

# Clean and minimize the text using regex
cleaned_text = clean_and_minimize(original_text)

# Calculate and print the length of the cleaned text
cleaned_text_length = len(cleaned_text)
print("Cleaned Text Length:", cleaned_text_length)

# Print the cleaned text
print("Cleaned Text:")
print(cleaned_text)

# Save cleaned text to a PDF using reportlab with multiple lines
pdf_filename = 'cleaned_txt.pdf'
with open(pdf_filename, 'wb') as pdf_file:
    pdf = canvas.Canvas(pdf_file)
    pdf.setFont("Helvetica", 12)

    # Split the cleaned text into lines
    lines = cleaned_text.split('\n')

    # Add each line to the PDF
    for i, line in enumerate(lines):
        y_position = 792 - 72 - (i * 12)  # Adjust the y_position for each line
        pdf.drawString(72, y_position, line)

    pdf.save()

print(f"PDF saved to {pdf_filename}")


Original Text Length: 16015
Cleaned Text Length: 12022
Cleaned Text:
1 SOFTWARE ENGINEERING by Ian Sommerville 10th Edition Pearson 2015 2 Chapter 1 Introduction Lecture 1 3 Topics covered Professional software development What is meant by software engineering Software engineering ethics A brief introduction to ethical issues that affect software engineering Case studies An introduction to three examples that are used in later chapters in the book Chapter 1 Introduction 3 4 Software engineering The economies of all developed nations are dependent on software More and more systems are software controlled Software engineering is concerned with theories methods and tools for professional software development Software engineering involves wider responsibilities than simply the application of technical skills 5 Software engineering Software engineering is an engineering discipline that is concerned with all aspects of software production from the early stages of system specification through

In [6]:
from transformers import pipeline

# Load the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")

# Read the text from the file
file_path = 'cleaned_txt.txt'
with open(file_path, 'r') as file:
    input_text = file.read()

# Split the text into chunks (you can adjust the chunk size)
chunk_size = 1000
chunks = [input_text[i:i+chunk_size] for i in range(0, len(input_text), chunk_size)]

# Generate summaries for each chunk
summaries = []
for chunk in chunks:
    summary = summarizer(chunk, max_length=512, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summaries.append(summary[0]['summary_text'])

# Combine the summaries into a final summary
final_summary = " ".join(summaries)

# Print the original text and final summary
print("\nGenerated Summary:")
print(final_summary)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Your max_length is set to 512, but your input_length is only 155. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=77)
Your max_length is set to 512, but your input_length is only 146. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
Your max_length is set to 512, but your input_length is only 145. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)
Your max_length is set to 512, but your input_length is only 159. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)



Generated Summary:
Software engineering is concerned with theories methods and tools for professional software development. Software engineering involves wider responsibilities than simply the application of technical skills. The economies of all developed nations are dependent on software. Software engineering is an engineering discipline that is concerned with all aspects of software production. Good software should deliver the required functionality and performance to the user and should be maintainable dependable and usable. Software engineering is concerned with the practicalities of developing and delivering useful software. Software engineering is part of the more general process of computerbased systems development. All software projects have to be professionally managed and developed different techniques are appropriate for different types of system. The web has led to the availability of software services and the possibility of developing highly distributed servicebased syst

In [7]:
txt = "Software engineering is concerned with theories methods and tools for professional software development. Software engineering involves wider responsibilities than simply the application of technical skills. The economies of all developed nations are dependent on software. Software engineering is an engineering discipline that is concerned with all aspects of software production. Good software should deliver the required functionality and performance to the user and should be maintainable dependable and usable. Software engineering is concerned with the practicalities of developing and delivering useful software. Software engineering is part of the more general process of computerbased systems development. All software projects have to be professionally managed and developed different techniques are appropriate for different types of system. The web has led to the availability of software services and the possibility of developing highly distributed servicebased systems. Software costs often dominate computer system costs. Software engineering is concerned with cost effective software development. Software costs more to maintain than it does to develop for systems with a long life maintenance costs may be several times development costs. Maintainability Software should be written in such a way so that it can evolve to meet the changing needs of customers. Dependable software should not cause physical or economic damage in the event of system failure. Software must be acceptable to the type of users for which it is designed. Malicious users should not be able to access or damage the system. Software should not make wasteful use of system resources. There are many different types of software system and there is no universal set of software techniques that is applicable to all of these. The software engineering methods and tools used depend on the type of application being developed and the requirements of the customer. PCs include all necessary functionality and do not need to be connected to a network. Interactive transactionbased applications execute on a remote computer and are accessed by users from their own PCs or terminals. Embedded control systems are software control systems that control hardware devices. Software engineering fundamentals apply to all types of software system irrespective of the development techniques used. Systems should be developed using a managed and understood development process. Dependability and performance are important for all type of system. Web services allow application functionality to be accessed over the web. Cloud computing is an approach to the provision of computer services where applications run remotely on the cloud. The fundamental ideas of software engineering discussed in the previous section apply to webbased software in the same way that they apply to other types of software system. Software engineering is an engineering discipline that is concerned with all aspects of software production. 25. apter 1 Introduction 25  apter 2 Introduction 25  apter 3 Introduction 25 apter 4 Introduction 25 25  Apter 2"
print(len(txt))

3105
