## Installs

In [None]:
pip install torch transformers matplotlib evaluate

In [None]:
pip install langchain-cohere

In [None]:
pip install pypdf2 pdfplumber pymupdf


In [None]:
pip install requests beautifulsoup4

In [None]:
pip install python-docx


## Data.py


This app takes a Word, PDF, URL, or TXT file and produces a new document written in simplified language.

In [None]:
from PyPDF2 import PdfReader
import pdfplumber
from docx import Document
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF

In [None]:
def open_doc(doc_path):
    """
    Dumps the contents of a Word document to stdout and returns its paragraph texts.

    Three passes are printed in order: the raw text of every paragraph, the text
    of every table cell, and finally every paragraph again labelled as either
    "Heading" or "Paragraph" according to its style name.

    Args:
        doc_path (str): Path to the Word document.

    Returns:
        list: The text of every paragraph, in document order (table cells are
        printed but not included).
    """
    document = Document(doc_path)

    # Pass 1: echo each paragraph and remember its text.
    collected = []
    for para in document.paragraphs:
        para_text = para.text
        print(para_text)
        collected.append(para_text)

    # Pass 2: echo every table cell, row by row.
    for tbl in document.tables:
        for cell in (c for row in tbl.rows for c in row.cells):
            print(cell.text)

    # Pass 3: echo the paragraphs again, labelled by style.
    for para in document.paragraphs:
        is_heading = para.style.name.startswith('Heading')
        print(f"Heading: {para.text}" if is_heading else f"Paragraph: {para.text}")

    return collected


In [None]:
def open_doc_dict(doc_path):
    """
    Opens a Word document, extracts headings and their corresponding paragraphs,
    and returns a dictionary of headings and paragraphs.

    A paragraph counts as a heading when its style name starts with "Heading".
    Text that appears before the first heading, or after a heading whose text
    is empty, is not collected. When the same heading text occurs more than
    once, the later paragraphs are appended to the existing entry instead of
    replacing it.

    Args:
        doc_path (str): Path to the Word document.

    Returns:
        dict: A dictionary where keys are headings and values are lists of paragraphs.
    """
    doc = Document(doc_path)

    heading_dict = {}
    current_heading = None

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()

        if paragraph.style.name.startswith('Heading'):
            current_heading = text
            if current_heading:  # Avoid empty headings
                # setdefault keeps paragraphs already gathered under a repeated
                # heading; plain assignment would silently discard them.
                heading_dict.setdefault(current_heading, [])
            continue

        # Body paragraph: attach it to the active heading, skipping blanks.
        if current_heading and text:
            heading_dict[current_heading].append(text)

    return heading_dict


In [None]:
def open_md_dict(md_path):
    """
    Opens a Markdown (.md) file, extracts headings and their corresponding paragraphs,
    and returns a dictionary of headings and paragraphs.

    Any line that starts with '#' (after stripping whitespace) is treated as a
    heading; the heading level is ignored. Lines before the first heading, or
    after a heading with no text, are not collected. When the same heading text
    occurs more than once, the later lines are appended to the existing entry
    instead of replacing it.

    Args:
        md_path (str): Path to the Markdown file.

    Returns:
        dict: A dictionary where keys are headings and values are lists of paragraphs.
    """
    heading_dict = {}
    current_heading = None

    with open(md_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('#'):
                # Drop the leading '#' markers; all levels share one flat dict.
                current_heading = line.lstrip('#').strip()
                if current_heading:  # Avoid empty headings
                    # setdefault keeps lines already gathered under a repeated
                    # heading; plain assignment would silently discard them.
                    heading_dict.setdefault(current_heading, [])
                continue

            # Non-heading line: keep it only if it has text and a heading is active.
            if current_heading and line:
                heading_dict[current_heading].append(line)

    return heading_dict


In [None]:
def open_pdf_dict(doc_path):
    """
    Opens a PDF document, extracts headings and their corresponding paragraphs,
    and returns a dictionary of headings and paragraphs.

    PDFs carry no heading markup, so a text line is treated as a heading when
    any of its spans is bold or when the line is entirely upper-case. This is a
    heuristic and may misclassify lines. Text before the first detected heading
    is not collected. When the same heading text occurs more than once, later
    lines are appended to the existing entry.

    Args:
        doc_path (str): Path to the PDF document.

    Returns:
        dict: A dictionary where keys are headings and values are lists of paragraphs.
    """
    # In PyMuPDF span dictionaries, "flags" bit 4 (value 16) marks bold text.
    # Bit 1 (value 2) is italic, which is what this check tested before and
    # caused italic body text to be picked up as headings.
    bold_flag = 1 << 4

    heading_dict = {}
    current_heading = None

    pdf_document = fitz.open(doc_path)
    try:
        for page in pdf_document:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without "lines" hold no text and are skipped.
                for line in block.get("lines", ()):
                    spans = line["spans"]
                    line_text = " ".join(span["text"] for span in spans).strip()
                    if not line_text:
                        # A blank line must never become a heading, otherwise
                        # everything after it would be dropped.
                        continue

                    is_heading = line_text.isupper() or any(
                        span.get("flags", 0) & bold_flag for span in spans
                    )
                    if is_heading:
                        current_heading = line_text
                        # Keep earlier content if this heading text repeats.
                        heading_dict.setdefault(current_heading, [])
                    elif current_heading is not None:
                        heading_dict[current_heading].append(line_text)
    finally:
        # Release the file handle even if text extraction raises.
        pdf_document.close()

    return heading_dict


## simple language

In [None]:
import json
import ast
from tqdm import tqdm
import re
from llama_cpp import Llama

In [None]:
import cohere

In [None]:
def chat_completion_cohere(text, API_KEY, model, system_prompt):
  """
  Sends text to a Cohere model and returns the simplified rewrite.

  Args:
      text (str | list): The text to simplify. A list of paragraphs is joined
          with newlines before it is placed in the prompt.
      API_KEY (str): Cohere API key used to create the client.
      model (str): Name of the Cohere model to call.
      system_prompt (str): System prompt placed in the <<SYS>> section.

  Returns:
      str: The text of the first generation in the response.
  """
  # Callers pass the list of paragraphs stored under a heading. Interpolating
  # a list directly would put its Python repr (brackets and quotes) in the prompt.
  if not isinstance(text, str):
    text = "\n".join(str(part) for part in text)

  co = cohere.Client(API_KEY)
  prompt = f"""<<SYS>>
  {system_prompt}
  <</SYS>>
  You will receive paragraphs of text. Rewrite them in a simple language. Use short sentences and do not use difficult words and grammare:{text}"""
  # NOTE(review): Cohere documents Generate as a legacy endpoint; confirm that
  # the chosen model still supports it.
  response = co.generate(
    model=model,
    prompt=prompt,
  )
  print(response)
  simplified_text = response.generations[0].text
  print(simplified_text)
  return simplified_text

## text processing

In [None]:
from docx import Document

In [None]:
from pathlib import Path

In [None]:
def make_output(doc, text="This is a new paragraph added to the document.",
                output_path="modified_file.docx"):
  """
  Appends one paragraph to a document and saves it.

  Args:
      doc: Document object providing add_paragraph() and save().
      text (str): Paragraph text to append. Defaults to the original
          placeholder sentence.
      output_path (str): Where to save the document. Defaults to
          "modified_file.docx".

  Returns:
      None
  """
  doc.add_paragraph(text)
  doc.save(output_path)

In [None]:
def divide_into_paragraphs(text):
    """
    Splits text into paragraphs, one per non-blank line.

    Args:
        text (str): The input text.

    Returns:
        list: The whitespace-trimmed lines of the text, with blank lines dropped.
    """
    paragraphs = []
    for raw_line in text.split('\n'):
        trimmed = raw_line.strip()
        if not trimmed:
            # Whitespace-only lines carry no paragraph content.
            continue
        paragraphs.append(trimmed)
    return paragraphs




In [None]:
def simplify_text_cohere(dict_headings, API_KEY, model, system_prompt):
    """
    Simplifies a document into a dictionary with headings as keys and simplified content as values.

    Each heading's paragraphs are joined with newlines and sent to
    chat_completion_cohere in a single request. Headings with no text are kept
    in the result with an empty list and do not trigger a request.

    Args:
        dict_headings (dict): A dictionary of headings and their associated content
            (a list of paragraphs, or a single string).
        API_KEY (str): Cohere API key.
        model (str): Name of the Cohere model to use.
        system_prompt (str): System prompt for the language model.

    Returns:
        dict: A dictionary mapping each heading to a list holding its simplified
        text (empty when the section had no content).
    """
    simplified_dict = {}

    for heading, para in dict_headings.items():
        # Send readable text rather than the repr of a Python list.
        text = para if isinstance(para, str) else "\n".join(str(p) for p in para)

        if not text.strip():
            # Nothing to simplify: skip the API call but keep the heading.
            simplified_dict[heading] = []
            continue

        simplified_para = chat_completion_cohere(text, API_KEY, model, system_prompt)
        print(simplified_para)
        simplified_dict[heading] = [simplified_para]

    return simplified_dict

In [None]:
def rebuild_text_from_dict(simplified_dict):
    """
    Flattens a heading -> paragraphs dictionary back into one block of text.

    Args:
        simplified_dict (dict): Dictionary of headings and their simplified paragraphs.

    Returns:
        str: Each heading on its own line, followed by its paragraphs (one per
        line) and an empty line that closes the section.
    """
    lines = []
    for title in simplified_dict:
        # One section: the heading, its paragraphs, then a separator line.
        lines += [title, *simplified_dict[title], ""]
    return "\n".join(lines)


In [None]:
def write_dict_to_docx(simplified_dict, output_path):
    """
    Saves a heading -> paragraphs dictionary as a new Word document.

    Every key becomes a level-1 heading, followed by one paragraph per entry in
    its value and an empty paragraph that separates it from the next section.

    Args:
        simplified_dict (dict): Dictionary with headings as keys and paragraphs as values.
        output_path (str): Path to save the output Word document.

    Returns:
        None
    """
    document = Document()

    for title in simplified_dict:
        document.add_heading(title, level=1)
        for body_text in simplified_dict[title]:
            document.add_paragraph(body_text)
        # Empty paragraph acts as a visual gap between sections.
        document.add_paragraph("")

    document.save(output_path)
    print(f"Document saved to: {output_path}")

In [None]:
def write_dict_to_md(simplified_dict, output_path):
    """
    Saves a heading -> paragraphs dictionary as a Markdown (.md) file.

    Every key is written as a level-1 heading ("# ..."), followed by its
    paragraphs; each heading and paragraph is followed by a blank line.

    Args:
        simplified_dict (dict): Dictionary with headings as keys and paragraphs as values.
        output_path (str): Path to save the output Markdown file.

    Returns:
        None
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        for title in simplified_dict:
            out.write(f"# {title}\n\n")
            # Each paragraph is terminated by a blank line.
            out.writelines(f"{body_text}\n\n" for body_text in simplified_dict[title])

        print(f"Markdown file saved to: {output_path}")


In [None]:
def get_name_file(file_path):
  """Returns the final component of file_path with its last suffix removed."""
  return Path(file_path).stem



## main

In [None]:
pdf_path = '/content/60489-Article Text-189083-1-10-20191001.pdf'

In [None]:
doc_path = '/content/Climate Change_ Understanding the Challenge of Our Era.docx'

In [None]:
dict_file = open_doc_dict(doc_path)

In [None]:
dict_file_pdf = open_pdf_dict(pdf_path)
dict_file_pdf # something is wrong here

In [None]:
dict_heading_md = open_md_dict("/content/climate.md")

In [None]:
system_prompt = "You are a text simplification expert. Your task is to rewrite the text in a simple language."

In [None]:
# Sampling parameters.
# NOTE(review): none of the visible cells pass these on — chat_completion_cohere
# and simplify_text_cohere take no sampling arguments; confirm whether they are still needed.
top_p, temperature, top_k = [0.2, 0.6, 50]

In [None]:
import os

# Take the Cohere key from the CO_API_KEY environment variable so the secret is
# never saved in the notebook; falls back to an empty string when it is unset.
API_KEY = os.environ.get("CO_API_KEY", "")

In [None]:
cohere_model =  "command-r-plus-08-2024"

In [None]:
dict_headings = open_doc_dict(doc_path)

In [None]:
dict_headings

In [None]:
dict_headings_pdf = open_pdf_dict(pdf_path)

In [None]:
dict_headings_pdf

In [None]:
simplified_dict_cohere = simplify_text_cohere(dict_headings, API_KEY, cohere_model, system_prompt)

In [None]:
write_dict_to_docx(simplified_dict_cohere, "/content/climate.docx")

Document saved to: /content/climate.docx


In [None]:
write_dict_to_md(simplified_dict_cohere, "/content/climate.md")

Markdown file saved to: /content/climate.md


In [None]:
name_file = get_name_file(doc_path)
output_path = f"{name_file}_simplified.docx"

In [None]:
name_file_md = get_name_file("/content/climate.md")
output_path_md = f"{name_file_md}_simplified.md"

In [None]:
# NOTE(review): simplified_dict_md is not assigned in any visible cell — presumably
# the result of simplify_text_cohere(dict_heading_md, ...); confirm before running top to bottom.
write_dict_to_md(simplified_dict_md, output_path_md)

Markdown file saved to: climate_simplified.md


In [None]:
# simplified_dict is never assigned in the visible cells; simplified_dict_cohere is
# the simplified version of the document at doc_path, which output_path is derived from.
write_dict_to_docx(simplified_dict_cohere, output_path)

Document saved to: Climate Change_ Understanding the Challenge of Our Era_simplified.docx
