In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from PyPDF2 import PdfReader
import numpy as np
import networkx as nx
import spacy
import torch
import random

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text




In [4]:
def preprocess_text(text):
    # Tokenize into sentences and words
    sentences = sent_tokenize(text)
    words = [word.lower() for sentence in sentences for word in word_tokenize(sentence)]

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word.isalpha() and word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return words



In [5]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = set()
    words1 = [w.lower() for w in sent1 if w.lower() not in stopwords]
    words2 = [w.lower() for w in sent2 if w.lower() not in stopwords]
    all_words = list(set(words1 + words2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in words1:
        vector1[all_words.index(w)] = 1
    for w in words2:
        vector2[all_words.index(w)] = 1
    return 1 - cosine_distance(vector1, vector2)



In [6]:
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j], stop_words)
    return similarity_matrix



In [7]:
def generate_summary(text, top_n=3):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary



In [8]:
def perform_ner_spacy(text):
    doc = nlp(text)
    named_entities = [ent.text for ent in doc.ents]
    return named_entities

from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering



In [9]:
def generate_mcqs_using_distilbert(text, num_options=4):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')

    max_seq_length = tokenizer.model_max_length
    chunks = [text[i:i + max_seq_length - 2] for i in range(0, len(text), max_seq_length - 2)]

    generated_mcqs = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        outputs = model(input_ids, attention_mask=attention_mask)
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1
        answer = tokenizer.decode(input_ids[0, answer_start:answer_end])

        # Generate options for MCQs
        options = [answer]
        while len(options) < num_options:
            random_option = generate_random_option(text, answer)
            if random_option not in options:
                options.append(random_option)

        # Shuffle the options to avoid any pattern
        random.shuffle(options)

        mcq = {
            "question": "What is " + answer + "?",
            "options": options,
            "answer": options.index(answer)
        }

        generated_mcqs.append(mcq)
    return generated_mcqs



In [10]:
def generate_random_option(text, correct_answer):
    words = set(word_tokenize(text.lower()))
    words.discard(correct_answer.lower())
    return random.choice(list(words))


num_options_per_question = 4  # Set the number of options per MCQ question
file_path = "C:/Users/Protectt067/OneDrive - PROTECTT AI LABS PVT LTD/python Dhanasekar/internship-assignment-nlp-main/Dataset/chapter_2.pdf"  # Replace with the actual file path
text = extract_text_from_pdf(file_path)
preprocessed_text = preprocess_text(text)
preprocessed_text_str = " ".join(preprocessed_text)  # Convert list of words to a single string
summarized_text = generate_summary(preprocessed_text_str)
generated_mcqs = generate_mcqs_using_distilbert(summarized_text, num_options=num_options_per_question)



Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
print("Generated MCQs:")
for i, mcq in enumerate(generated_mcqs):
    question = mcq['question']
    options = mcq['options']
    answer_index = mcq['answer']
    print(f"{i+1}. {question}")
    for j, option in enumerate(options):
        print(f"   {'ABCD'[j]}. {option}")
    print(f"   Correct Answer: {'ABCD'[answer_index]}")

Generated MCQs:
1. What is mughal ruler established control large part territory known india death many mughal governor subadars big zamindars began asserting authority establishing regional kingdom powerful regional kingdom emerged various part india delhi could longer function?
   A. maulvis
   B. eloquent
   C. arm
   D. mughal ruler established control large part territory known india death many mughal governor subadars big zamindars began asserting authority establishing regional kingdom powerful regional kingdom emerged various part india delhi could longer function
   Correct Answer: D
2. What is ?
   A. revenue
   B. 
   C. continuously
   D. away
   Correct Answer: B
3. What is ##land could compete east india company charter company could venture across ocean looking new land could buy good cheap price carry back europe sell higher price company fear competition english trading company mercantile trading company day made profit primarily excluding competition could buy cheap s