<a href="https://colab.research.google.com/github/Biswajit0408/FlashCard/blob/main/backend_v_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q PyMuPDF transformers sentence-transformers nltk pandas spacy rapidfuzz
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Imports and small downloads
import os
import fitz            # PyMuPDF
import nltk
import re
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer
from rapidfuzz import fuzz

# sent_tokenize needs NLTK's Punkt sentence models. Recent NLTK releases load
# them from the "punkt_tab" resource rather than the legacy pickled "punkt"
# package, so fetch both to keep a fresh runtime working on old and new NLTK.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
import spacy
# Small English spaCy pipeline; used later for entities, noun chunks and POS tags.
nlp = spacy.load("en_core_web_sm")

# Device index in the convention used by transformers pipelines:
# 0 selects the first CUDA GPU, -1 selects the CPU.
_cuda_ok = torch.cuda.is_available()
DEVICE = 0 if _cuda_ok else -1
print("Using torch.cuda:", _cuda_ok, "DEVICE for pipelines:", DEVICE)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using torch.cuda: False DEVICE for pipelines: -1


In [None]:
# Upload file dialog (Colab)
from google.colab import files
uploaded = files.upload()
# If nothing was selected the mapping is empty; fail with a clear message
# instead of an opaque IndexError when picking the first uploaded name.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF or text file.")
# Only the first uploaded file is processed by the rest of the notebook.
file_name = next(iter(uploaded))
print("Uploaded:", file_name)


Saving Digital Payments.pdf to Digital Payments.pdf
Uploaded: Digital Payments.pdf


In [None]:
# Extract text (PDF or plain text)
def extract_text(path):
    """Return the text content of a PDF or plain-text file.

    Args:
        path: File path (anything ``str()`` can convert). A name ending in
            ".pdf" (case-insensitive) is read with PyMuPDF; anything else is
            read as UTF-8 text with undecodable bytes ignored.

    Returns:
        For PDFs, the stripped text of every page joined by blank lines;
        otherwise the raw file contents.
    """
    path = str(path)
    if path.lower().endswith(".pdf"):
        # The context manager closes the document even if a page fails to
        # extract; the original left the file handle open.
        with fitz.open(path) as doc:
            pages = [page.get_text().strip() for page in doc]
        return "\n\n".join(pages)
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

# Run the extraction on the uploaded file and print a single-line preview
# (first 800 characters, newlines flattened to spaces).
raw_text = extract_text(file_name)
print("Extracted characters:", len(raw_text))
preview = raw_text[:800].replace("\n", " ")
print(preview + "...\n")


Extracted characters: 2643
Digital payments in India have reshaped the country’s financial landscape, bringing a major shift  from traditional cash-based transactions to fast, secure, and convenient digital methods. This  transformation began gaining momentum with the government’s Digital India initiative and  accelerated significantly after the 2016 demonetization, when people were encouraged to adopt  electronic modes of payment. Today, India stands as one of the world’s largest digital payment  ecosystems, with platforms like UPI (Unified Payments Interface), mobile wallets, internet  banking, and QR-code payments becoming an integral part of daily life.  UPI, in particular, has revolutionized how Indians exchange money. Its real-time, bank-to-bank  transfer system is simple, free, and accessible to everyone with...



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def top_k_sentences(text, k=15):
    """Pick the ``k`` sentences of ``text`` with the largest total TF-IDF weight.

    The text is split with ``sent_tokenize``. If it has at most ``k``
    sentences they are all returned in document order. Otherwise each
    sentence is scored by the sum of its TF-IDF term weights and the ``k``
    best are returned in descending score order.
    """
    candidates = sent_tokenize(text)
    if len(candidates) > k:
        tfidf = TfidfVectorizer().fit_transform(candidates)
        weights = tfidf.sum(axis=1).A1  # one summed weight per sentence
        ranked = sorted(enumerate(weights), key=lambda pair: pair[1], reverse=True)
        return [candidates[pos] for pos, _ in ranked[:k]]
    return candidates

# Full list of sentences in document order (used later to build context windows).
sentences = sent_tokenize(raw_text)
print("Total sentences:", len(sentences))

# Keep only the K highest-scoring sentences as flashcard sources (tune k as needed)
important_sentences = top_k_sentences(raw_text, k=20)
(len(important_sentences), important_sentences[:3])


Total sentences: 19


(19,
 ['Digital payments in India have reshaped the country’s financial landscape, bringing a major shift \nfrom traditional cash-based transactions to fast, secure, and convenient digital methods.',
  'This \ntransformation began gaining momentum with the government’s Digital India initiative and \naccelerated significantly after the 2016 demonetization, when people were encouraged to adopt \nelectronic modes of payment.',
  'Today, India stands as one of the world’s largest digital payment \necosystems, with platforms like UPI (Unified Payments Interface), mobile wallets, internet \nbanking, and QR-code payments becoming an integral part of daily life.'])

In [None]:
# Seq2seq model used below to generate a question from a sentence whose
# answer span is wrapped in <hl> markers.
QG_MODEL = "valhalla/t5-base-qg-hl"
qg_tok = AutoTokenizer.from_pretrained(QG_MODEL)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL)

# Extractive QA pipeline used to verify generated questions. DEVICE was
# computed in the setup cell "for pipelines" but never passed on; pass it
# explicitly so the pipeline runs on the GPU when one is available.
QA_MODEL = "distilbert-base-cased-distilled-squad"
qa_pipeline = pipeline("question-answering", model=QA_MODEL, tokenizer=QA_MODEL, device=DEVICE)

# Sentence embedding model (not referenced by the other visible cells).
embedder = SentenceTransformer("all-MiniLM-L6-v2")


Device set to use cpu


In [None]:
def extract_answer_candidates(sentence, max_candidates=6):
    """Collect up to ``max_candidates`` distinct answer strings from ``sentence``.

    Candidates are gathered in priority order from the spaCy parse:
      1. named entities, in document order;
      2. noun chunks, longest first;
      3. all proper-noun tokens joined with spaces (may be non-contiguous);
      4. token n-grams from 6 tokens down to 1, left to right, used only to
         fill the remaining slots.

    Returns:
        A list of unique strings, at most ``max_candidates`` long.
    """
    doc = nlp(sentence)
    candidates = []

    def _add(text):
        # Append non-empty text once, preserving first-seen priority.
        if text and text not in candidates:
            candidates.append(text)

    for ent in doc.ents:
        _add(ent.text.strip())

    # De-duplicate with dict.fromkeys (keeps first-seen order) instead of a
    # set: the stable sort then leaves equal-length chunks in document order,
    # whereas set iteration order varies with string hash randomization.
    chunk_texts = dict.fromkeys(chunk.text.strip() for chunk in doc.noun_chunks)
    for chunk_text in sorted(chunk_texts, key=len, reverse=True):
        _add(chunk_text)

    _add(" ".join(tok.text for tok in doc if tok.pos_ == "PROPN"))

    # Only fall back to raw n-grams when the linguistic sources left room.
    if len(candidates) < max_candidates:
        tokens = [t.text for t in doc if not t.is_space]
        for size in range(min(6, len(tokens)), 0, -1):
            for start in range(len(tokens) - size + 1):
                _add(" ".join(tokens[start:start + size]).strip())
                if len(candidates) >= max_candidates:
                    break
            if len(candidates) >= max_candidates:
                break
    return candidates[:max_candidates]


In [None]:
def format_with_hl(context_sentence, answer_text):
    """Mark ``answer_text`` inside ``context_sentence`` with ``<hl>`` tags.

    The first exact occurrence of the answer is wrapped as
    ``<hl> answer <hl>``. If the answer does not occur verbatim, the
    highlighted answer is prepended to the unchanged sentence instead.
    """
    start = context_sentence.find(answer_text)
    if start == -1:
        # No verbatim match: put the highlighted answer in front of the sentence.
        return "<hl> " + answer_text + " <hl> " + context_sentence
    end = start + len(answer_text)
    return context_sentence[:start] + "<hl> " + answer_text + " <hl>" + context_sentence[end:]


In [None]:
def normalize_text(s):
    """Lowercase ``s``, drop every character that is not a-z, 0-9 or
    whitespace, and collapse whitespace runs to single spaces."""
    return " ".join(re.sub(r"[^a-z0-9\s]", "", s.lower()).split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between two answer strings after ``normalize_text``.

    Overlap is counted as a multiset (each token matches at most as many
    times as it appears on both sides). The original counted overlap with
    sets but divided by list lengths, so identical answers containing a
    repeated token scored below 1.0.

    Returns:
        A float in [0, 1]; 0.0 when either side has no tokens or nothing overlaps.
    """
    from collections import Counter  # local import keeps the cell self-contained

    pred_tokens = normalize_text(prediction).split()
    gt_tokens = normalize_text(ground_truth).split()
    if not pred_tokens or not gt_tokens:
        return 0.0
    common = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if common == 0:
        return 0.0
    precision = common / len(pred_tokens)
    recall = common / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


In [None]:
def generate_questions_for_answer(sentence, answer, num_return=3, max_len=64):
    """Generate candidate questions whose answer is ``answer`` within ``sentence``.

    Args:
        sentence: Source sentence given to the question-generation model.
        answer: Answer span to highlight (see ``format_with_hl``).
        num_return: Number of beam-search sequences to request.
        max_len: Maximum length passed to ``generate`` for each question.

    Returns:
        Distinct, non-empty question strings in the order the model returned them.
    """
    highlighted = format_with_hl(sentence, answer)
    input_text = "generate question: " + highlighted
    inputs = qg_tok.encode(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = qg_model.generate(
        inputs,
        max_length=max_len,
        # Beam search cannot return more sequences than it has beams, so
        # widen the beam when the caller asks for more than the default 6.
        num_beams=max(6, num_return),
        num_return_sequences=num_return,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    decoded = (qg_tok.decode(o, skip_special_tokens=True).strip() for o in outputs)
    # dict.fromkeys de-duplicates while keeping first-seen order; blank
    # generations are dropped because they cannot be verified downstream.
    return list(dict.fromkeys(q for q in decoded if q))


In [None]:
def find_sentence_with_span(span, context):
    """Return the sentence of ``context`` that best matches ``span``.

    The first sentence containing ``span`` (case-insensitive) wins. If none
    contains it, the sentence with the highest ``fuzz.partial_ratio``
    against the span is used, earliest sentence on ties. The result is
    stripped; an empty string is returned when either argument is empty or
    no sentence is available.
    """
    if not (span and context):
        return ""
    needle = span.lower()
    pool = sent_tokenize(context)

    exact = next((sent for sent in pool if needle in sent.lower()), None)
    if exact is not None:
        return exact.strip()

    # max() keeps the first of equally scored sentences.
    closest = max(
        pool,
        key=lambda sent: fuzz.partial_ratio(needle, sent.lower()),
        default=None,
    )
    return closest.strip() if closest else ""

def verify_question_with_qa_expanded(question, context, expected_answer):
    """Answer ``question`` with the QA pipeline and score it against ``expected_answer``.

    Returns:
        A dict with:
          - "predicted_span": the stripped answer text from the QA pipeline;
          - "predicted_full": the context sentence matched to that span (the
            first context sentence if none is found), ending in punctuation;
          - "f1": token F1 between the span and the expected answer;
          - "fuzz": ``fuzz.partial_ratio`` of the two, scaled to 0-1.
        If any step raises an ordinary exception, the same keys are returned
        with empty strings and zero scores.
    """
    try:
        res = qa_pipeline(question=question, context=context)
        pred_span = res.get("answer", "").strip()
        f1 = f1_score(pred_span, expected_answer)
        fuzz_ratio = fuzz.partial_ratio(pred_span.lower(), expected_answer.lower())/100.0
        pred_full = find_sentence_with_span(pred_span, context)
        if not pred_full:
            pred_full = sent_tokenize(context)[0]
        if pred_full and pred_full[-1] not in ".!?":
            pred_full += "."
        return {
            "predicted_span": pred_span,
            "predicted_full": pred_full,
            "f1": f1,
            "fuzz": fuzz_ratio
        }
    except Exception:
        # Best-effort: a failed verification counts as a zero-score result.
        # Catch Exception, not a bare except, so KeyboardInterrupt/SystemExit
        # still stop a long-running generation loop.
        return {"predicted_span":"", "predicted_full":"", "f1":0.0, "fuzz":0.0}


In [None]:
def generate_verified_question_with_context(sentence, context_window=None):
    """Build one flashcard for ``sentence``, verified by QA where possible.

    For each answer candidate extracted from ``sentence``, questions are
    generated and answered by the QA pipeline over ``context_window`` (or the
    sentence itself when no window is given). Each question is scored as
    ``0.8 * f1 + 0.2 * fuzz`` against its candidate; the search stops early
    once a candidate reaches 0.85.

    Returns:
        If the best score is at least 0.45, a dict with "question", "answer",
        "short_context", "method" ("qg_verified_expanded"), "verify_metrics"
        and "score". Otherwise a cloze card with "question", "answer",
        "short_context", "method" ("cloze_fallback") and "score".
    """
    context_for_qa = context_window if context_window else sentence
    candidates = extract_answer_candidates(sentence, max_candidates=6)

    best = None
    best_score = -1
    best_meta = None

    for cand in candidates:
        for q in generate_questions_for_answer(sentence, cand):
            v = verify_question_with_qa_expanded(q, context_for_qa, cand)
            combined = v["f1"]*0.8 + v["fuzz"]*0.2
            if combined > best_score:
                best_score = combined
                best = q
                best_meta = {"candidate": cand, "verify": v}
        if best_score >= 0.85:
            break

    if best_score >= 0.45 and best_meta:
        verify = best_meta["verify"]
        full_answer = verify["predicted_full"]
        span = verify["predicted_span"]
        # The question was generated from `sentence`. When the predicted span
        # occurs there, answer with that sentence rather than whichever
        # sentence of the wider window happens to mention the span first.
        if span and span.lower() in sentence.lower():
            full_answer = sentence.strip()
            if full_answer[-1] not in ".!?":
                full_answer += "."
        short_context = context_for_qa[:250] + "..." if len(context_for_qa) > 250 else context_for_qa
        return {
            "question": best,
            "answer": full_answer,
            "short_context": short_context,
            "method": "qg_verified_expanded",
            "verify_metrics": verify,
            "score": best_score
        }

    # Cloze fallback: blank out a term that really occurs in the sentence so
    # the card always has a gap. Prefer the first matching candidate, then
    # the first word; an empty sentence yields an empty card, not IndexError.
    fallback = next((c for c in candidates if c and c in sentence), "")
    if not fallback:
        words = sentence.split()
        fallback = words[0] if words else ""
    # str.replace with an empty pattern would insert blanks between every
    # character, so leave the sentence untouched when nothing can be blanked.
    cloze = sentence.replace(fallback, "_____") if fallback else sentence
    # The blanked term comes from `sentence`, so that sentence is the answer.
    full_sentence = sentence.strip() or sentence
    short_context = sentence if len(sentence)<250 else sentence[:240]+"..."
    return {
        "question": cloze,
        "answer": full_sentence,
        "short_context": short_context,
        "method": "cloze_fallback",
        "score": best_score
    }


In [None]:
# important_sentences can be a ranked subset of `sentences`, so its loop index
# is not a position in the document. Map each sentence to its first position
# in `sentences` so the context window really surrounds the source sentence.
sentence_positions = {}
for pos, s in enumerate(sentences):
    sentence_positions.setdefault(s, pos)

flashcards = []
for i, sent in enumerate(important_sentences):
    # Fall back to the loop index if the sentence is not found in `sentences`.
    pos = sentence_positions.get(sent, i)
    # Context = previous sentence, the sentence itself, and the next one.
    ctx_start = max(0, pos-1)
    ctx_end = min(len(sentences), pos+2)
    context_window = " ".join(sentences[ctx_start:ctx_end])
    card = generate_verified_question_with_context(sent, context_window=context_window)
    card["context"] = sent
    flashcards.append(card)

df = pd.DataFrame(flashcards)
df.head(20)


Unnamed: 0,question,answer,short_context,method,verify_metrics,score,context
0,What type of payments have reshaped India's fi...,Digital payments in India have reshaped the co...,Digital payments in India have reshaped the co...,qg_verified_expanded,"{'predicted_span': 'Digital', 'predicted_full'...",1.0,Digital payments in India have reshaped the co...
1,What was the name of the government's initiati...,This \ntransformation began gaining momentum w...,Digital payments in India have reshaped the co...,qg_verified_expanded,"{'predicted_span': 'Digital India', 'predicted...",1.0,This \ntransformation began gaining momentum w...
2,What country has one of the world's largest di...,This \ntransformation began gaining momentum w...,This \ntransformation began gaining momentum w...,qg_verified_expanded,"{'predicted_span': 'India', 'predicted_full': ...",1.0,"Today, India stands as one of the world’s larg..."
3,What has revolutionized how Indians exchange m...,"Today, India stands as one of the world’s larg...","Today, India stands as one of the world’s larg...",qg_verified_expanded,"{'predicted_span': 'UPI', 'predicted_full': 'T...",1.0,"UPI, in particular, has revolutionized how Ind..."
4,"Besides income levels, what other groups use t...","Its real-time, bank-to-bank \ntransfer system ...","UPI, in particular, has revolutionized how Ind...",qg_verified_expanded,"{'predicted_span': 'all age groups', 'predicte...",1.0,"Its real-time, bank-to-bank \ntransfer system ..."
5,"What do street vendors, small shopkeepers, cab...","Street vendors, small shopkeepers, cab drivers...","Its real-time, bank-to-bank \ntransfer system ...",qg_verified_expanded,"{'predicted_span': 'UPI', 'predicted_full': 'S...",1.0,"Street vendors, small shopkeepers, cab drivers..."
6,What has this widespread adoption contributed to?,This widespread adoption has helped \nbridge t...,"Street vendors, small shopkeepers, cab drivers...",qg_verified_expanded,{'predicted_span': 'greater financial inclusi...,1.0,This widespread adoption has helped \nbridge t...
7,"Along with payment apps, what has made it easi...",Mobile wallets and payment apps have also made...,This widespread adoption has helped \nbridge t...,qg_verified_expanded,"{'predicted_span': 'Mobile wallets', 'predicte...",1.0,Mobile wallets and payment apps have also made...
8,The rise of digital payments has brought what ...,The rise of digital payments has brought sever...,Mobile wallets and payment apps have also made...,qg_verified_expanded,"{'predicted_span': 'several benefits', 'predic...",1.0,The rise of digital payments has brought sever...
9,What did the IRS create to improve tax complia...,"It has \nincreased transparency, reduced corru...",The rise of digital payments has brought sever...,qg_verified_expanded,"{'predicted_span': 'a clean digital trail', '...",1.0,"It has \nincreased transparency, reduced corru..."


In [None]:
# Write every column of the flashcard table to a CSV file (no index column)
# and trigger a browser download of it from Colab.
from google.colab import files

out_fn = "flashcards_export.csv"
df.to_csv(out_fn, index=False)
files.download(out_fn)
print("Saved:", out_fn)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved: flashcards_export.csv
