In [1]:
# ---------------------- Imports ----------------------
import io, os, re, sys, tempfile, urllib.request, json
from datetime import datetime
import spacy
# Uninstall the old 'docx' package which causes the error and install 'python-docx' instead.
!pip uninstall -y docx || true
!pip install dateparser PyPDF2 python-docx pydub moviepy speechrecognition
import dateparser
import PyPDF2
from docx import Document # Import Document directly from the 'python-docx' module
from pydub import AudioSegment
from moviepy.editor import VideoFileClip
import speech_recognition as sr
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

[0mCollecting dateparser
  Downloading dateparser-1.2.2-py3-none-any.whl.metadata (29 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting speechrecognition
  Downloading speechrecognition-3.14.4-py3-none-any.whl.metadata (30 kB)
Downloading dateparser-1.2.2-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading speechrecognition-3.14.4-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):
  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  rotation_lines = [l for l in lines if 'rotate          :' in l and re.search('\d+$', l)]
  match = re.search('\d+$', rotation_line)
  if event.key is 'enter':



In [2]:
# Optional: Gemini client
# USE_GEMINI is True only when the google-genai package imports cleanly AND
# GOOGLE_API_KEY is set; otherwise the notebook uses the local summarizer.
USE_GEMINI = False
client = None  # always defined, so later code can reference it safely
try:
    from google import genai
    api_key = os.getenv("GOOGLE_API_KEY", "")
    if api_key:
        client = genai.Client(api_key=api_key)
        USE_GEMINI = True
except Exception:
    # Missing package or failed client construction: stay on the local path.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    client = None
    USE_GEMINI = False

MODEL_NAME = "gemini-2.5-flash-lite"

In [3]:
# ---------------------- NLP setup ----------------------
# Load the small English spaCy pipeline once; the summarizer, deadline, task and
# important-point functions in this notebook all reuse this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")
# Raise spaCy's maximum accepted text length so long extracted documents can be processed.
nlp.max_length = 5_000_000

def clean_text(t):
    """Normalise whitespace in ``t``.

    Carriage returns and newlines become spaces, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    """
    flattened = t.replace("\r", " ").replace("\n", " ")
    collapsed = re.sub(r"\s+", " ", flattened)
    return collapsed.strip()

In [4]:
# ---------------------- Text extraction ----------------------
def text_from_pdf(data):
    """Extract the text of a PDF supplied as raw bytes.

    Args:
        data: PDF file content as bytes.

    Returns:
        Text of all pages joined with spaces and whitespace-normalised by
        ``clean_text``; "" if the PDF cannot be read (best-effort).
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(data))
        # extract_text() may return None for a page, hence the `or ""`.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception:
        # Best-effort extraction: an unreadable PDF yields an empty string.
        # `Exception` instead of a bare except so Ctrl-C still interrupts.
        return ""
    return clean_text(" ".join(page_texts))

In [5]:
def text_from_docx(data):
    """Extract the paragraph text of a .docx file supplied as raw bytes.

    Args:
        data: DOCX file content as bytes.

    Returns:
        All paragraph texts joined with spaces and whitespace-normalised by
        ``clean_text``; "" if the document cannot be opened (best-effort).
    """
    try:
        # The import cell binds `Document`, not the `docx` module, so the old
        # `docx.Document(...)` call raised NameError and always returned "".
        # Importing here keeps this helper independent of that binding.
        from docx import Document
        document = Document(io.BytesIO(data))
        paragraphs = [para.text for para in document.paragraphs]
    except Exception:
        # Best-effort: corrupt or non-DOCX input yields an empty string.
        return ""
    return clean_text(" ".join(paragraphs))

In [6]:
def text_from_txt(data):
    """Decode raw bytes as text and normalise whitespace.

    UTF-8 is tried first, with an optional byte-order mark stripped, then
    Latin-1 as a fallback.

    Args:
        data: File content as bytes.

    Returns:
        The decoded text passed through ``clean_text``; "" if ``data``
        cannot be decoded (e.g. it is not a bytes object).
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which plain "utf-8" would leave in the text as U+FEFF.
    for enc in ("utf-8-sig", "latin-1"):
        try:
            decoded = data.decode(enc)
        except (UnicodeDecodeError, AttributeError):
            # Wrong encoding, or `data` has no .decode(): try the next option.
            continue
        # Kept outside the try so a failure in clean_text is not masked.
        return clean_text(decoded)
    return ""


In [7]:
def text_from_audio(data, ext):
    """Transcribe audio bytes to text.

    The bytes are written to a temporary file, converted to WAV with pydub,
    and transcribed with SpeechRecognition's ``recognize_google``.

    Args:
        data: Audio file content as bytes.
        ext: File extension without the dot (e.g. "mp3"), used as the
            temporary file's suffix.

    Returns:
        The whitespace-normalised transcript, or "" if any step fails
        (best-effort). Temporary files are always removed.
    """
    src_path = None
    wav_path = None
    try:
        # The write sits inside the try so a failure there no longer leaves
        # an open, undeleted temporary file behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
            src_path = tmp.name
            tmp.write(data)
        wav_path = src_path + ".wav"

        exported = AudioSegment.from_file(src_path).export(wav_path, format="wav")
        # pydub's export returns the output file object; close it so the
        # handle is not leaked and the file can be deleted afterwards.
        if hasattr(exported, "close"):
            exported.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
        return clean_text(recognizer.recognize_google(audio_data))
    except Exception:
        # Decode, conversion or recognition failure: report "no text".
        return ""
    finally:
        for path in (src_path, wav_path):
            if path and os.path.exists(path):
                try:
                    os.unlink(path)
                except OSError:
                    pass

In [8]:
def text_from_video(data, ext):
    """Transcribe the audio track of a video supplied as raw bytes.

    The bytes are written to a temporary file, the audio track is exported to
    WAV with MoviePy, and the WAV bytes are handed to ``text_from_audio``.

    Args:
        data: Video file content as bytes.
        ext: File extension without the dot (e.g. "mp4"), used as the
            temporary file's suffix.

    Returns:
        The transcript, or "" if the video has no audio track or any step
        fails (best-effort). Temporary files are always removed.
    """
    src_path = None
    wav_path = None
    clip = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
            src_path = tmp.name
            tmp.write(data)
        wav_path = src_path + ".wav"

        clip = VideoFileClip(src_path)
        if clip.audio is None:
            return ""
        clip.audio.write_audiofile(wav_path, verbose=False, logger=None)
        with open(wav_path, "rb") as f:
            audio_bytes = f.read()
    except Exception:
        return ""
    finally:
        # Close the clip before unlinking: an open clip keeps the source file
        # in use, and it was previously never released.
        if clip is not None:
            try:
                clip.close()
            except Exception:
                pass
        for path in (src_path, wav_path):
            if path and os.path.exists(path):
                try:
                    os.unlink(path)
                except OSError:
                    pass
    # Temp files are gone by now; the audio helper manages its own.
    return text_from_audio(audio_bytes, "wav")

def extract_text(fname, data):
    """Route ``data`` to the extractor that matches ``fname``'s extension.

    Matching is case-insensitive. Names with an unrecognised extension are
    treated as plain text.
    """
    name = fname.lower()
    ext = name.split(".")[-1]

    # (suffixes, extractor) pairs, checked in order; the lambdas defer the call.
    routes = (
        ((".pdf",), lambda: text_from_pdf(data)),
        ((".docx", ".doc"), lambda: text_from_docx(data)),
        ((".txt",), lambda: text_from_txt(data)),
        ((".mp3", ".wav", ".m4a"), lambda: text_from_audio(data, ext)),
        ((".mp4", ".avi", ".mov"), lambda: text_from_video(data, ext)),
    )
    for suffixes, extractor in routes:
        if name.endswith(suffixes):
            return extractor()
    return text_from_txt(data)


In [9]:
# ---------------------- Local summarizer ----------------------
def local_summarize(text, max_sentences=7):
    """Build an extractive bullet-point summary using lemma frequencies.

    Each sentence of at least three words is scored by summing the
    document-wide frequency of its non-stop-word, non-punctuation lemmas.
    The ``max_sentences`` best-scoring sentences are returned in document
    order.

    Args:
        text: The text to summarise.
        max_sentences: Maximum number of sentences to keep.

    Returns:
        A string with one "- "-prefixed sentence per line. If no sentence
        qualifies, the first 1000 characters of ``text`` split into bullets
        at ". ".
    """
    doc = nlp(text)

    # Document-wide lemma frequencies (stop words, punctuation, blanks skipped).
    freq = {}
    for token in doc:
        if token.is_stop or token.is_punct or not token.text.strip():
            continue
        lemma = token.lemma_.lower()
        freq[lemma] = freq.get(lemma, 0) + 1

    # Score from the tokens spaCy already produced; the previous version
    # re-ran the full pipeline on every sentence.
    candidates = []  # (score, sentence text) in document order
    for sent in doc.sents:
        sentence = sent.text.strip()
        if len(sentence.split()) < 3:
            continue
        score = sum(
            freq.get(token.lemma_.lower(), 0)
            for token in sent
            if not (token.is_stop or token.is_punct)
        )
        candidates.append((score, sentence))

    # sorted() is stable, so equally scored sentences keep document order.
    ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
    chosen = {sentence for _, sentence in ranked[:max_sentences]}

    # Back to document order; dict.fromkeys drops repeated sentences.
    ordered = list(dict.fromkeys(
        sentence for _, sentence in candidates if sentence in chosen
    ))

    if ordered:
        return "\n".join(["- " + sentence for sentence in ordered])

    return "- " + text.strip()[:1000].replace(". ", ".\n- ")

In [10]:
# ---------------------- Gemini summarizer ----------------------
def summarize_with_optional_gemini(text, mode="professional"):
    """Summarise ``text`` with Gemini when enabled, otherwise locally.

    Args:
        text: The text to summarise.
        mode: "professional" or "student"; any other value selects the
            technical style.

    Returns:
        Gemini's response text when ``USE_GEMINI`` is true and the call
        returns non-empty text; otherwise ``local_summarize(text, 8)``.
    """
    rules_by_mode = {
        "professional": "Write a clean, professional, presentation-ready summary in bullet points.",
        "student": "Write a simple, easy summary in small bullet points.",
    }
    rules = rules_by_mode.get(mode, "Write a technical detailed summary.")

    prompt = f"""
{rules}

INPUT TEXT:
{text}

=== SUMMARY ===
"""

    if USE_GEMINI:
        try:
            response = client.models.generate_content(model=MODEL_NAME, contents=prompt)
            # response.text may be None; treat that as an empty answer.
            out = (response.text or "").strip()
            if out:
                return out
        except Exception as exc:
            # Say why we fall back instead of failing silently.
            print("Gemini request failed, using local summarizer:", exc)

    return local_summarize(text, max_sentences=8)

In [11]:
# ---------------------- Deadline detection ----------------------
def detect_deadline(text):
    """Return the earliest date mentioned in ``text``.

    Candidates are every sentence (the original behaviour) plus every span
    spaCy labels as a DATE entity. Each is parsed with ``dateparser``,
    preferring future dates for ambiguous input.

    Args:
        text: The text to scan.

    Returns:
        The earliest parsed date as "DD Month YYYY", or "No deadline found".
    """
    doc = nlp(text)
    settings = {"PREFER_DATES_FROM": "future"}

    # dateparser.parse targets date strings, so a whole sentence with
    # surrounding words typically parses to None; DATE entities give it
    # just the date phrase.
    # NOTE(review): durations tagged DATE (e.g. "two weeks") may also parse
    # to a date -- confirm whether those should count as deadlines.
    candidates = [sent.text for sent in doc.sents]
    candidates.extend(ent.text for ent in doc.ents if ent.label_ == "DATE")

    dates = []
    for candidate in candidates:
        try:
            parsed = dateparser.parse(candidate, settings=settings)
        except Exception:
            # A parser failure on one candidate must not abort the scan.
            continue
        if parsed:
            dates.append(parsed.date())

    if dates:
        return min(dates).strftime("%d %B %Y")
    return "No deadline found"

In [12]:
# ---------------------- Task extraction ----------------------
# Task keywords anchored at a word start, so "send" no longer matches inside
# "ascend" while "needs" and "submitted" still match. "do" and "due" must be
# whole words because they occur inside many unrelated words ("document",
# "window", "residue").
_TASK_KEYWORD_RE = re.compile(
    r"\b(?:must|should|need|have to|required|submit|upload|prepare|complete"
    r"|send|deadline|do\b|due\b)",
    re.IGNORECASE,
)


def extract_tasks(text):
    """Return the sentences of ``text`` that look like tasks or obligations.

    A sentence qualifies when it contains a task keyword (must, should, need,
    have to, required, submit, upload, prepare, complete, send, do, due,
    deadline) starting at a word boundary.

    Args:
        text: The text to scan.

    Returns:
        A list of "- "-prefixed sentences, or ["- No explicit tasks found"].
    """
    doc = nlp(text)
    tasks = []
    for sent in doc.sents:
        sentence = sent.text.strip()
        if _TASK_KEYWORD_RE.search(sentence):
            tasks.append("- " + sentence)
    return tasks if tasks else ["- No explicit tasks found"]

In [13]:
# ---------------------- Important points ----------------------
def important_points(text, top_k=6):
    """Pick the ``top_k`` highest-scoring sentences as short bullet points.

    A sentence's score is the sum of the document-wide frequencies of its
    alphabetic, non-stop-word lemmas, multiplied by 1.3 when it contains an
    ORG, PRODUCT, PERSON, DATE or MONEY entity. Sentences shorter than 20
    characters are ignored.

    Args:
        text: The text to analyse.
        top_k: Maximum number of points to return.

    Returns:
        A list of "- "-prefixed points, each at most 220 characters, in
        descending score order; ["- No clear important points detected"] if
        nothing qualifies.
    """
    doc = nlp(text)

    def is_content(token):
        # Alphabetic tokens that are neither stop words nor punctuation.
        return token.is_alpha and not (token.is_stop or token.is_punct)

    freq = {}
    for token in doc:
        if is_content(token):
            lemma = token.lemma_.lower()
            freq[lemma] = freq.get(lemma, 0) + 1

    notable_labels = {"ORG", "PRODUCT", "PERSON", "DATE", "MONEY"}
    scored = []
    for sent in doc.sents:
        sentence = sent.text.strip()
        if len(sentence) < 20:
            continue
        # Reuse the parsed tokens rather than re-running the pipeline
        # on each sentence.
        score = sum(
            freq.get(token.lemma_.lower(), 0) for token in sent if is_content(token)
        )
        if any(ent.label_ in notable_labels for ent in sent.ents):
            score *= 1.3
        scored.append((score, sentence))

    scored.sort(key=lambda pair: pair[0], reverse=True)

    points = []
    for _, sentence in scored[:top_k]:
        # Drop only trailing periods. Splitting at the first "." cut
        # sentences at abbreviations and decimals ("Dr. Smith" -> "Dr").
        short = sentence.rstrip(". ").strip()
        if not short:
            continue
        if len(short) > 220:
            # Truncate at a word boundary and mark the cut.
            short = short[:217].rsplit(" ", 1)[0] + "..."
        points.append("- " + short)

    return points if points else ["- No clear important points detected"]

In [14]:
# ---------------------- UI ----------------------
# File picker: a single file, any type accepted.
upload = widgets.FileUpload(accept="", multiple=False)

# Summary style passed to summarize_with_optional_gemini as `mode`.
mode_selector = widgets.Dropdown(
    description="Summary Mode:",
    value="professional",
    options=[
        ("Professional (default)", "professional"),
        ("Student-friendly (simple)", "student"),
        ("Technical (detailed)", "technical"),
    ],
)

run_button = widgets.Button(
    button_style="primary",
    description="Process & Summarize",
)

# All processing output is rendered inside this widget.
output = widgets.Output()

# Stack the controls vertically, top to bottom.
display(
    widgets.VBox(
        [
            widgets.Label("Upload file (pdf, docx, txt, mp3, wav, mp4):"),
            upload,
            mode_selector,
            run_button,
            output,
        ]
    )
)


def _first_uploaded_file(value):
    """Return ``(filename, content_bytes)`` for the first uploaded file.

    Handles both shapes of ``FileUpload.value``: the ipywidgets 7 dict
    ``{filename: {"content": ...}}`` and the ipywidgets 8 tuple of dicts
    with "name" and "content" keys.
    """
    if isinstance(value, dict):
        name, entry = next(iter(value.items()))
    else:
        entry = value[0]
        name = entry["name"]
    # bytes() leaves bytes unchanged and materialises a memoryview.
    return name, bytes(entry["content"])


def _presentation_lines(summary, limit=6):
    """Return up to ``limit`` non-empty summary lines as "• " bullets."""
    lines = []
    for raw_line in summary.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        if len(lines) >= limit:
            break
        # Drop any leading bullet characters so we don't double-bullet.
        stripped = re.sub(r"^[\-\•\s]+", "", stripped)
        lines.append("• " + stripped)
    return lines


def process_and_display(b):
    """Button callback: extract text from the upload and print the analysis.

    Prints a presentation summary, the detailed summary, the detected
    deadline, the task sentences and the important points into ``output``.

    Args:
        b: The button that triggered the callback (unused).
    """
    with output:
        clear_output()

        if not upload.value:
            print("Please upload a file first.")
            return

        fname, data = _first_uploaded_file(upload.value)

        print("Extracting text from:", fname)
        text = extract_text(fname, data)

        if not text:
            print("No text extracted.")
            return

        print("Detecting deadlines & tasks...")
        deadline = detect_deadline(text)
        tasks = extract_tasks(text)

        print("Detecting important points...")
        important = important_points(text, top_k=7)

        selected_mode = mode_selector.value
        print("Generating summary (", selected_mode, ") ...")
        summary = summarize_with_optional_gemini(text, mode=selected_mode)

        # Fall back to the important points if the summary has no usable lines.
        pres_summary_lines = _presentation_lines(summary, limit=6) or important[:6]

        print("\n=== Presentation Summary ===\n")
        for line in pres_summary_lines:
            print(line)

        print("\n=== Detailed Summary ===\n")
        print(summary)

        print("\n=== Deadline ===\n", deadline)

        print("\n=== Tasks ===")
        for task in tasks:
            print(task)

        print("\n=== Important Points ===")
        for point in important:
            print(point)


run_button.on_click(process_and_display)

VBox(children=(Label(value='Upload file (pdf, docx, txt, mp3, wav, mp4):'), FileUpload(value={}, description='…