In [1]:
import os
import torch
import spacy
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from pathlib import Path
import fitz  # PyMuPDF
import docx  # python-docx
from datasets import load_dataset

# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set pad_token to eos_token as GPT-2 lacks a padding token
tokenizer.pad_token = tokenizer.eos_token

# Load the CNN/Daily Mail dataset for fallback summaries
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Load spaCy model for NER (Named Entity Recognition)
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(file_path):
    try:
        doc = fitz.open(file_path)
        text = "".join([page.get_text() for page in doc])
        return text
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

def extract_text_from_docx(file_path):
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {str(e)}"

def summarize_with_gpt2(text, prompt=None):
    """
    Summarizes the input text using GPT-2. If no text is provided, it uses a default article from the dataset.
    
    Parameters:
    - text (str): The input text to summarize.
    - prompt (str): An optional prompt to guide the summarization.

    Returns:
    - summary (str): The generated summary.
    """
    if not text:
        default_article = dataset["train"][0]["article"]
        text = default_article if not prompt else prompt + " " + default_article
    else:
        if prompt:
            text = prompt + " " + text

    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)

    # Generate the summary
    max_new_tokens = 1024
    input_length = inputs['input_ids'].shape[-1]
    max_tokens = min(input_length + max_new_tokens, 1024)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=1,
        temperature=0.8,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

def extract_key_insights(text):
    """
    Extract key insights from the text using spaCy (Named Entity Recognition).
    
    Parameters:
    - text (str): The input text to analyze.

    Returns:
    - insights (list): A list of key insights (named entities).
    """
    doc = nlp(text)
    insights = []
    for ent in doc.ents:
        insights.append({'text': ent.text, 'label': ent.label_})
    
    return insights

# Example file processing

def process_file(file_path, prompt=""):
    """
    Main function to process a file, extract text, summarize, and extract key insights.
    """
    # Extract text based on file extension
    file_extension = file_path.split('.')[-1].lower()
    
    if file_extension == 'pdf':
        file_content = extract_text_from_pdf(file_path)
    elif file_extension == 'docx':
        file_content = extract_text_from_docx(file_path)
    elif file_extension == 'txt':
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
        except Exception as e:
            file_content = f"Error reading TXT file: {str(e)}"
    else:
        file_content = "Unsupported file type. Please upload a .txt, .pdf, or .docx file."
    
    # Generate summary and key insights if file content is valid
    if "Error" not in file_content:
        summary = summarize_with_gpt2(file_content, prompt)
        insights = extract_key_insights(file_content)
        
        return summary, insights
    else:
        return None, None

# Example usage
file_path = r'C:\Users\oumki\OneDrive\Documents\Test.txt'
summary, insights = process_file(file_path, prompt="Summarize this document")

# Print the results
if summary:
    print("Summary:")
    print(summary)
    print("\nKey Insights:")
    for insight in insights:
        print(f"{insight['text']} ({insight['label']})")
else:
    print("Error processing file.")


Summary:
Summarize this document 1. NUMBER ONE

Globalism can be broadly defined as the interconnectedness of various nations across continents, bringing them closer economically, socially, culturally, and informationally. It promotes deeper integration and cooperation among countries.”


2. NUMBER TWO

“Economic globalism refers to the integration of national economies into a global one. Over the last few decades, major events like China’s rise and the Soviet Union’s collapse have accelerated globalization. A key driver has been Information and Communication Technology, which has increased both the speed and reach of economic activities.”

3. NUMBER THREE

The Information and Communication Technology (ICT) revolution has been crucial to globalization. It functions as a general-purpose technology, enhancing productivity and fostering economic growth. ICT has also contributed to the emergence of the ‘Information Economy,’ where innovation drives the transformation of products and servic