## PDF Analyzer V1

In [None]:
!sudo apt-get install poppler-utils
!sudo apt-get install tesseract-ocr
!sudo apt-get install tesseract-ocr-fra

In [None]:
!pip install pdf2image pytesseract spacy gensim rake-nltk transformers nltk numpy pandas pyLDAvis
!pip install "tensorflow[and-cuda]"
!python -m spacy download fr_core_news_lg

In [2]:
import pytesseract
import nltk
import re
from pdf2image import convert_from_path
import spacy
from gensim import corpora, models
from rake_nltk import Rake
from transformers import pipeline
from transformers import BartForConditionalGeneration, BartTokenizer

nltk.download('stopwords')
nltk.download('punkt')
french_stopwords = set(nltk.corpus.stopwords.words('french'))

# Explicitly specify the model and tokenizer
model_name = "sshleifer/distilbart-cnn-12-6"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Load spaCy model
nlp = spacy.load("fr_core_news_lg")

# Use the specified model and tokenizer in the pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zà-ÿ\s]', '', text)
    
    # Remove extra whitespaces
    text = ' '.join(text.split())
    
    # Remove standalone numbers (optional)
    text = re.sub(r'\b\d+\b', '', text)
    
    # Remove common OCR artifacts (you can expand this list based on your observations)
    artifacts = ["\n", "\x0c"]
    for artifact in artifacts:
        text = text.replace(artifact, ' ')
    
    return text


def convert_pdf_to_text(pdf_file):
    pages_images = convert_from_path(pdf_file)
    texts = [pytesseract.image_to_string(image, config='--psm 6', lang='fra') for image in pages_images]
    return " ".join(texts)

def summarize_text(text):
    summarized_text = summarizer(text, max_length=150, min_length=50, do_sample=False)
    return summarized_text[0]['summary_text']


def extract_all_keywords_from_topics(topics):
    all_keywords = [re.findall(r'\"(.*?)\"', topic) for topic in topics]
    # Flatten the list of keywords
    flat_keywords = [keyword for sublist in all_keywords for keyword in sublist]
    return list(set(flat_keywords))

def extract_topics(text):
    # Tokenization and preprocessing
    texts = [[word for word in doc.lower().split() if word not in french_stopwords and word not in set('.,!?()-')] for doc in text.split('\n') if doc]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # LDA Model
    lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)
    topics = lda_model.print_topics(num_words=4)
    topics_list = [topic[1] for topic in topics]
    topics_keywords = extract_all_keywords_from_topics(topics_list)
    return topics_keywords

def extract_entities(text):
    doc = nlp(text)
    entities = {entity.label_: [] for entity in doc.ents}
    for entity in doc.ents:
        entities[entity.label_].append(entity.text)
    return entities

def extract_keywords(text):
    rake = Rake(stopwords=french_stopwords)
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

def process_pdf(pdf_file):
    """une analyse détaillé d'un document pdf

    Summary: Un résumé du contenu du document.
    Topics: Les sujets principaux discutés dans le document (basés sur le modèle LDA).
    Entities: Des entités nommées (comme les noms propres, les organisations, etc.) extraites du document.
    Keywords: Les mots-clés qui semblent être les plus pertinents ou fréquents dans le document.

    Args:
        pdf_file (str): le chemin du fichier pdf

    Returns:
        dict: un dictionnaire contenant les résultats de l'analyse
        
        {
            "Summary": str,
            "Topics": list[str],
            "Entities": dict[str, list[str]],
            "Keywords": list[str]
        }
    """
    text = convert_pdf_to_text(pdf_file)
    text = clean_text(text)
    
    summary = summarize_text(text)
    topics = extract_topics(text)
    entities = extract_entities(text)
    keywords = extract_keywords(text)
    
    result = {
        "Summary": summary,
        "Topics": topics,
        "Entities": entities,
        "Keywords": list(set(keywords)) # remove duplicate
    }
    
    return result

  from .autonotebook import tqdm as notebook_tqdm
2023-10-05 17:26:01.853945: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-05 17:26:01.853983: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-05 17:26:01.854000: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-05 17:26:01.863245: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Example Usage
pdf_file = "setup_dev_env_back.pdf"
result = process_pdf(pdf_file)

In [3]:
result

{'Summary': ' setup dev environment back date de création septembre type free prérequis docker desktop wsl ubuntu on window microsoft store ubuntu windows terminal les procédures à suivre wsl exécutez ces commandes suivantes dans votre powershell .',
 'Topics': ['ubuntu', 'wsl', 'conda', 'installer'],
 'Entities': {'MISC': ['dev environment',
   'free prérequis',
   'wsl',
   'powershell',
   'wsl',
   'wsl setversion distro name wsl setversion',
   'ubuntu',
   'ubuntu',
   'windows terminal',
   'ubuntu',
   'homebrew',
   'linux',
   'installers link',
   'dev environment back miniconda',
   'link',
   'atctivate docker',
   'ubuntu settings resources',
   'ubuntu setup',
   'dev environment back'],
  'ORG': ['microsoft'],
  'LOC': ['wsl'],
  'PER': ['wget wget httpsrepoanacondacomminicondaminicondalatestlinux xsh']},
 'Keywords': ['windows terminal',
  'wsl installer conda anaconda installers link setup dev environment back miniconda installer link recommandé brew install wget wget

### Visualize the topics

In [4]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

def visualize_topics(lda_model, corpus, dictionary):
    """
    Visualize the topics from a given LDA model using pyLDAvis.

    Parameters:
    - lda_model: A trained Gensim LDA model.
    - corpus: The corpus used for training the LDA model.
    - dictionary: The dictionary used for training the LDA model.

    Returns:
    - A pyLDAvis visualization object. Display this object in a Jupyter environment to see the interactive visualization.
    """
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
    return pyLDAvis.display(vis_data)


In [8]:
pdf_file = "setup_dev_env_back.pdf"
text = convert_pdf_to_text(pdf_file)
text = clean_text(text)
texts = [[word for word in doc.lower().split() if word not in french_stopwords and word not in set('.,!?()-')] for doc in text.split('\n') if doc]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

visualization = visualize_topics(lda_model, corpus, dictionary)
visualization