In [None]:
!pip install PyMuPDF transformers sentence-transformers nltk pandas




In [None]:
import fitz
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

# Fetch NLTK's 'punkt' data package (presumably for the sent_tokenize import above —
# confirm against the installed NLTK version).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import files

# Open Colab's upload widget; the result is keyed by the saved file name.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on an empty result.
    raise ValueError("No file was uploaded; re-run this cell and choose a file.")

# Only the first uploaded file is used by the rest of the notebook.
file_name = next(iter(uploaded))
file_name


Saving Biswajit__Resume.pdf to Biswajit__Resume (5).pdf


'Biswajit__Resume (5).pdf'

In [None]:
def extract_text(file):
    """Return the text content of ``file`` as a single string.

    Args:
        file: Path of the document to read. A name ending in ``.pdf`` (any
            letter case) is opened with PyMuPDF and the text of every page is
            concatenated in page order. Anything else is read as UTF-8 text,
            silently dropping bytes that cannot be decoded.

    Returns:
        The extracted text.
    """
    path = str(file)

    if path.lower().endswith(".pdf"):
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(path) as doc:
            return "".join(page.get_text() for page in doc)

    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

# Extract the uploaded document's text and preview its first 1000 characters.
raw_text = extract_text(file_name)
raw_text[:1000]


'Passionate Computer Science student with skills in Python, cloud computing, and software testing. Creative\ndesigner as an extracurricular, with a proactive and adaptable approach to problem-solving. Eager to\ncontribute to projects that merge programming, testing, and design.\nPneumonia Detection using Explainable AI\nDeveloped a CNN-based model for pneumonia detection from CT scans with high accuracy. Applied SHAP\nand LIME for interpretability, showcasing skills in Python, TensorFlow, and healthcare-focused AI.\nSocial Media Sentiment Tracker App\nAn interactive Python dashboard that scrapes Reddit posts, analyzes their sentiment using VADER, and\nvisualizes results in real time. Built with Streamlit, it features customizable search, detailed sentiment\nmetrics, and dynamic charts powered by Plotly.\nMEDIAMINT\nBachelor of Technology\nHigher Secondary Education\nSecondary Education\nTeam Pinterest- Associate\nComputer Science Engineer\nJava\nKIIT International School\nODM Public Sc

In [None]:
# NOTE(review): `nltk` is already imported and 'punkt' already downloaded in the
# import cell at the top of the notebook; only the 'punkt_tab' download is new here.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Split the extracted text into sentences and show how many were found.
# NOTE(review): `sentences` is not referenced again in the visible cells;
# top_k_sentences() tokenizes `raw_text` itself.
sentences = sent_tokenize(raw_text)
len(sentences)


14

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def top_k_sentences(text, k=15):
    """Select up to ``k`` sentences of ``text``, ranked by summed TF-IDF weight.

    Args:
        text: Document text; it is split into sentences with ``sent_tokenize``.
        k: Maximum number of sentences to return. Values <= 0 yield an empty list.

    Returns:
        A list of sentences. When the text has ``k`` or fewer sentences they are
        all returned in their original order. Otherwise the ``k`` sentences with
        the highest score are returned, highest first, where a sentence's score
        is the sum of its TF-IDF term weights (vectorizer fitted on the
        sentences themselves).
    """
    # A negative k would otherwise reach the `[:k]` slice and return
    # "all but the last |k|" sentences.
    if k <= 0:
        return []

    sents = sent_tokenize(text)
    if len(sents) <= k:
        return sents

    tfidf = TfidfVectorizer().fit_transform(sents)
    scores = tfidf.sum(axis=1).A1  # one summed weight per sentence
    # sorted() is stable, so equally scored sentences keep document order.
    ranked = sorted(range(len(sents)), key=lambda i: scores[i], reverse=True)
    return [sents[i] for i in ranked[:k]]

# Keep at most 15 sentences of the document as flashcard source material and display them.
important_sentences = top_k_sentences(raw_text, 15)
important_sentences


['Passionate Computer Science student with skills in Python, cloud computing, and software testing.',
 'Creative\ndesigner as an extracurricular, with a proactive and adaptable approach to problem-solving.',
 'Eager to\ncontribute to projects that merge programming, testing, and design.',
 'Pneumonia Detection using Explainable AI\nDeveloped a CNN-based model for pneumonia detection from CT scans with high accuracy.',
 'Applied SHAP\nand LIME for interpretability, showcasing skills in Python, TensorFlow, and healthcare-focused AI.',
 'Social Media Sentiment Tracker App\nAn interactive Python dashboard that scrapes Reddit posts, analyzes their sentiment using VADER, and\nvisualizes results in real time.',
 'Built with Streamlit, it features customizable search, detailed sentiment\nmetrics, and dynamic charts powered by Plotly.',
 'MEDIAMINT\nBachelor of Technology\nHigher Secondary Education\nSecondary Education\nTeam Pinterest- Associate\nComputer Science Engineer\nJava\nKIIT Internati

In [None]:
# Checkpoint name shared by the tokenizer and the seq2seq model loaded below.
model_name = "valhalla/t5-base-qg-hl"

# Module-level `tokenizer` and `model` are used by generate_qa().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
def generate_qa(context, max_length=128):
    """Generate one question whose answer is the given sentence.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        context: Source sentence. Runs of whitespace (including the hard line
            breaks left by PDF extraction) are collapsed to single spaces.
        max_length: Upper bound on the generated sequence length, passed to
            ``model.generate``.

    Returns:
        ``(question, answer)`` where ``answer`` is the whitespace-normalized
        sentence and ``question`` is the decoded model output.

    Raises:
        ValueError: If ``context`` contains no non-whitespace text.
    """
    # The answer is the sentence itself, cleaned of embedded newlines and
    # repeated spaces.
    answer = " ".join(context.split())
    if not answer:
        raise ValueError("context must contain non-whitespace text")

    # The "qg-hl" checkpoint is answer-aware: per its model card the prompt is
    # "generate question: ..." with the answer span wrapped in <hl> markers.
    # ("generate questions:" is the prefix of the separate end-to-end model.)
    # Here the whole sentence is the answer, so all of it is highlighted.
    input_text = f"generate question: <hl> {answer} <hl>"

    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**encoded, max_length=max_length, num_return_sequences=1)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    return question, answer

# Build one flashcard per selected sentence. Generation is best-effort: a
# sentence that fails is reported and skipped rather than aborting the run.
flashcards = []

for sent in important_sentences:
    try:
        question, answer = generate_qa(sent)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still work.
        print(f"Skipping sentence ({exc!r}): {sent[:60]!r}")
        continue
    flashcards.append({"question": question, "answer": answer})

flashcards[:5]


[{'question': 'What is a passion for computer science?',
  'answer': 'Passionate Computer Science student with skills in Python, cloud computing, and software testing.'},
 {'question': "What is a creative designer's job?",
  'answer': 'Creative\ndesigner as an extracurricular, with a proactive and adaptable approach to problem-solving.'},
 {'question': "What is one of the things that I'm interested in doing?",
  'answer': 'Eager to\ncontribute to projects that merge programming, testing, and design.'},
 {'question': 'What is the name of the model that was developed?',
  'answer': 'Pneumonia Detection using Explainable AI\nDeveloped a CNN-based model for pneumonia detection from CT scans with high accuracy.'},
 {'question': "What is the name of the program that demonstrates Python's skills?",
  'answer': 'Applied SHAP\nand LIME for interpretability, showcasing skills in Python, TensorFlow, and healthcare-focused AI.'}]

In [None]:
# Sentence-embedding model; dedupe() uses it to encode flashcard questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def dedupe(cards, threshold=0.78):
    """Drop flashcards whose question is a near-duplicate of an earlier kept one.

    Questions are embedded with the module-level ``embedder``. Cards are
    visited in order; each kept card suppresses every later card whose
    question has cosine similarity above ``threshold`` with it.

    Args:
        cards: List of dicts, each with a ``"question"`` key.
        threshold: Cosine similarity above which two questions count as
            duplicates.

    Returns:
        A new list with the surviving cards in their original order.
    """
    if not cards:
        return []

    questions = [card["question"] for card in cards]
    embeddings = embedder.encode(questions, convert_to_tensor=True)

    keep = []
    suppressed = set()
    for i, emb in enumerate(embeddings):
        if i in suppressed:
            continue
        keep.append(cards[i])
        # cos_sim returns a 1 x n result; take its single row as plain floats.
        sims = util.cos_sim(emb, embeddings)[0].tolist()
        # Earlier indices were already decided, so only later ones matter.
        suppressed.update(j for j, sim in enumerate(sims) if j > i and sim > threshold)
    return keep

# Remove near-duplicate questions and show how many cards remain.
# NOTE(review): this rebinds `flashcards`, so re-running the cell dedupes the
# already-deduplicated list rather than the original one.
flashcards = dedupe(flashcards)
len(flashcards)


14

In [None]:
# Tabulate the flashcards. Naming the columns explicitly keeps the
# question/answer schema (and therefore the CSV header) even when no cards
# were generated.
df = pd.DataFrame(flashcards, columns=["question", "answer"])
df


Unnamed: 0,question,answer
0,What is a passion for computer science?,Passionate Computer Science student with skill...
1,What is a creative designer's job?,"Creative\ndesigner as an extracurricular, with..."
2,What is one of the things that I'm interested ...,Eager to\ncontribute to projects that merge pr...
3,What is the name of the model that was developed?,Pneumonia Detection using Explainable AI\nDeve...
4,What is the name of the program that demonstra...,"Applied SHAP\nand LIME for interpretability, s..."
5,What is the name of the app that tracks sentim...,Social Media Sentiment Tracker App\nAn interac...
6,What is the name of the feature that allows yo...,"Built with Streamlit, it features customizable..."
7,What is the name of the school that I worked f...,MEDIAMINT\nBachelor of Technology\nHigher Seco...
8,What did I do to refine ad creatives and targe...,"I analyzed ad metrics, identified trends, and ..."
9,What did I manage for clients?,"I managed multiple client accounts, ensuring s..."


In [None]:
# Write the flashcards to a CSV (without the index column) and hand that same
# file to the browser for download.
csv_path = "flashcards.csv"
df.to_csv(csv_path, index=False)
files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>