<a href="https://colab.research.google.com/github/Diya2004f/2223085/blob/main/ContractLensAI-Backend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Create app.py file
app_code = """# ----------------------------
# ContractLens AI - Legal Document Summarizer Backend
# ----------------------------

from flask import Flask, request, jsonify
from transformers import pipeline
from werkzeug.utils import secure_filename
from flask_cors import CORS
import docx
from pdf2image import convert_from_bytes
import pytesseract
import io

# ----------------------------
# Flask app setup
# ----------------------------
app = Flask(__name__)
CORS(app)  # Enable CORS for all origins

# Load summarization model once at import time so every request reuses it
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# ----------------------------
# Config
# ----------------------------
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
MAX_WORDS = 1500  # truncate long documents

# Enforce the upload limit: Flask answers request bodies larger than this
# with 413 Request Entity Too Large instead of parsing them.
app.config["MAX_CONTENT_LENGTH"] = MAX_FILE_SIZE

# ----------------------------
# Helper functions
# ----------------------------
def chunk_text(text, chunk_size=700):
    '''Yield consecutive pieces of text holding at most chunk_size words each.

    The text is split on whitespace, so every piece comes back with its
    words separated by single spaces. Empty text yields nothing.
    '''
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    yield from (" ".join(tokens[start:start + chunk_size]) for start in starts)

def extract_text_from_docx(file):
    '''Return the text of every paragraph in a .docx file, joined by single spaces.

    file is handed straight to docx.Document.
    '''
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)

def extract_text_from_pdf(file):
    '''Run OCR over a PDF upload and return the recognised text.

    The upload is read fully and converted to images with
    convert_from_bytes; the text of each image is followed by a single
    space. Any failure is printed and an empty string is returned
    instead of raising.
    '''
    try:
        page_images = convert_from_bytes(file.read())
        page_texts = [pytesseract.image_to_string(page) + " " for page in page_images]
    except Exception as e:
        print("PDF extraction error:", e)
        return ""
    return "".join(page_texts)

# ----------------------------
# API Route: Summarize document
# ----------------------------
def _as_bool(value):
    '''Interpret a request flag; a string is true only when it spells "true" (any case).'''
    if isinstance(value, str):
        return value.strip().lower() == "true"
    return bool(value)


@app.route("/summarize", methods=["POST"])
def summarize():
    '''Summarize an uploaded document or raw text.

    Accepts either
      * multipart/form-data with a "file" part (.pdf, .docx or .txt) and an
        optional "plain_english" form field, or
      * JSON with "document_text" and an optional "plain_english" flag.

    Returns JSON with "summary", "key_clauses", "mode" and "truncated"
    (true when the input was cut to MAX_WORDS words).
    Responds 400 when the request format is invalid, the file type is
    unsupported, the file cannot be read or no text is found, and 500
    when no chunk could be summarized.
    '''
    text = ""
    plain_english = False

    # Handle uploaded files
    if 'file' in request.files:
        file = request.files['file']
        # Pick the parser from the raw, lower-cased name. secure_filename()
        # strips the leading dot left over from a non-ASCII name (such a
        # "name.pdf" becomes just "pdf"), and the name is never used as a
        # path here. Lower-casing also accepts ".PDF" or ".Docx".
        filename = (file.filename or "").lower()

        try:
            if filename.endswith(".pdf"):
                text = extract_text_from_pdf(file)
            elif filename.endswith(".docx"):
                text = extract_text_from_docx(file)
            elif filename.endswith(".txt"):
                # utf-8-sig drops a leading BOM; undecodable bytes are
                # replaced rather than failing the whole request.
                text = file.read().decode("utf-8-sig", errors="replace")
            else:
                return jsonify({"error": "Unsupported file type"}), 400
        except Exception as e:
            # e.g. a corrupt .docx makes docx.Document raise
            print("File extraction error:", e)
            return jsonify({"error": "Could not read file"}), 400

        plain_english = _as_bool(request.form.get("plain_english", "false"))

    # Handle raw JSON requests
    elif request.is_json:
        data = request.get_json(silent=True)
        # The body must be a JSON object carrying a string document
        if not isinstance(data, dict):
            return jsonify({"error": "Invalid request format"}), 400
        text = data.get("document_text", "")
        if not isinstance(text, str):
            return jsonify({"error": "Invalid request format"}), 400
        # A JSON string such as "false" must not count as enabled
        plain_english = _as_bool(data.get("plain_english", False))

    else:
        return jsonify({"error": "Invalid request format"}), 400

    words = text.split()
    if not words:
        return jsonify({"error": "No text found"}), 400

    # Apply the configured word limit so one request cannot queue an
    # unbounded number of model calls; shorter inputs are left untouched.
    truncated = len(words) > MAX_WORDS
    if truncated:
        text = " ".join(words[:MAX_WORDS])

    # Prefix mode for clarity
    # NOTE(review): the prefix is fed to the model as ordinary input text and
    # only reaches the first chunk - confirm it really changes the summary.
    prefix = "Summarize this legal document clearly and professionally: "
    if plain_english:
        prefix = "Summarize this legal document in simple, plain English: "
    text = prefix + text

    # Summarize in chunks
    words = text.split()
    chunks = list(chunk_text(text, chunk_size=700)) if len(words) > 700 else [text]
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=200, min_length=50, truncation=True)
            summaries.append(summary[0]["summary_text"])
        except Exception as e:
            # Best effort: a failed chunk is skipped, the others still count
            print("Summarization error:", e)
            continue

    if not summaries:
        return jsonify({"error": "Failed to summarize"}), 500

    final_summary = " ".join(summaries)

    return jsonify({
        "summary": final_summary,
        # NOTE(review): static placeholder list, not derived from the document
        "key_clauses": ["Confidentiality", "Termination", "Payment Terms"],
        "mode": "Plain English" if plain_english else "Professional",
        "truncated": truncated
    })

# ----------------------------
# Root Route
# ----------------------------
@app.route("/", methods=["GET"])
def home():
    '''Root route: return a JSON message saying the backend is running.'''
    status = {"message": "ContractLens AI Backend is running!"}
    return jsonify(status)

# ----------------------------
# Run App
# ----------------------------
if __name__ == "__main__":
    # Only start the server when the file is executed as a script;
    # 0.0.0.0 makes it listen on all addresses.
    server_options = {"host": "0.0.0.0", "port": 10000}
    app.run(**server_options)
"""
# Save the backend source held in app_code as app.py in the working directory
with open("app.py", mode="w") as app_file:
    app_file.write(app_code)

# Create requirements.txt file
# One package per line, no trailing newline
reqs = "\n".join([
    "flask",
    "flask-cors",
    "transformers",
    "torch",
    "python-docx",
    "pdf2image",
    "pytesseract",
    "werkzeug",
])
with open("requirements.txt", mode="w") as requirements_file:
    requirements_file.write(reqs)

In [8]:
# List the working directory to confirm app.py and requirements.txt were written
!ls


app.py	requirements.txt  sample_data


In [5]:
%pip install -r requirements.txt

Collecting flask-cors (from -r requirements.txt (line 2))
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting python-docx (from -r requirements.txt (line 5))
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdf2image (from -r requirements.txt (line 6))
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract (from -r requirements.txt (line 7))
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: python-docx, pytesseract, pdf2image, flask-cors
Successfully installed flask-cors-6.0.1 pdf2image-1.17.0 pytesseract-0.3.13 

In [7]:
# Run app.py as a script so its __main__ guard starts the Flask server on port 10000.
# The server runs in the foreground: this cell stays busy until it is interrupted.
%run app.py

Device set to use cpu


 * Serving Flask app 'app'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:10000
 * Running on http://172.28.0.12:10000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
