 Importation des modules nécessaires

In [1]:
import pdfplumber
import nltk
import PyPDF2
import spacy
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.chunk import ne_chunk
from sklearn.metrics import f1_score
import Levenshtein


Loading the PDF Document


In [2]:
# Load the PDF document
pdf_path = 'practice-standard-project-risk-management.pdf'
with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    
    # Extract text from all pages
    text = ''
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

print(text[:1000])  # Print the first 1000 characters to verify

   Project Management Institute  
  PRACTICE STANDARD 
FOR PROJECT RISK MANAGEMENT   ISBN: 978-1-933890-38-8 
 Published by: 
 Project Management Institute, Inc. 
  14 Campus Boulevard 
  Newtown Square, Pennsylvania 19073-3299 USA. 
  Phone: +610-356-4600 
  Fax: +610-356-4647 
  E-mail: customercare@pmi.org 
 Internet: www.pmi.org 
 ©2009 Project Management Institute, Inc. All rights reserved. 
 “PMI”, the PMI logo, “PMP”, the PMP logo, “PMBOK”, “PgMP”, “Project Management Journal”, “PM Network”, and the PMI 
Today logo are registered marks of Project Management Institute, Inc. The Quarter Globe Design is a trademark of the Project 
Management Institute, Inc. For a comprehensive list of PMI marks, contact the PMI Legal Department. 
 PMI Publications welcomes corrections and comments on its books. Please feel free to send comments on typographical, 
formatting, or other errors. Simply make a copy of the relevant page of the book, mark the error, and send it to: Book Editor, 
PMI Publi

Text Cleaning and Preprocessing

In [4]:
# Remove special characters and numbers
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)  # Remove references
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words with digits
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub('http\S+\s', ' ', text)
    text = re.sub('RT|cc', ' ', text)
    text = re.sub('#\S+\s', ' ', text)
    text = re.sub('@\S+', '  ', text)  
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text) 
    text = re.sub('\s+', ' ', text)
    return text

cleaned_text = clean_text(text)
print(cleaned_text[:1000])  # Display cleaned text


 Project Management Institute PRACTICE STANDARD FOR PROJECT RISK MANAGEMENT ISBN Published by Project Management Institute Inc Campus Boulevard Newtown Square Pennsylvania USA Phone Fax Email customercarepmiorg Internet wwwpmiorg Project Management Institute Inc All rights reserved PMI the PMI logo PMP the PMP logo PMBOK PgMP Project Management Journal PM Network and the PMI Today logo are registered marks of Project Management Institute Inc The Quarter Globe Design is a trademark of the Project Management Institute Inc For a comprehensive list of PMI marks contact the PMI Legal Department PMI Publications welcomes corrections and comments on its books Please feel free to send comments on typographical formatting or other errors Simply make a copy of the relevant page of the book mark the error and send it to Book Editor PMI Publications Campus Boulevard Newtown Square PA USA To inquire about discounts for resale or educational purposes please contact the PMI Book Service Center PMI Bo

  text = re.sub('http\S+\s', ' ', text)
  text = re.sub('#\S+\s', ' ', text)
  text = re.sub('@\S+', '  ', text)
  text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)


Tokenization and Normalization

In [13]:
import nltk
#nltk.download('all')



[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       tagge

True

In [14]:
try:
    tokens = word_tokenize(cleaned_text.lower())
    print(tokens[:50])  # Display the first 50 tokens
except Exception as e:
    print(f"An error occurred during tokenization: {e}")


['project', 'management', 'institute', 'practice', 'standard', 'for', 'project', 'risk', 'management', 'isbn', 'published', 'by', 'project', 'management', 'institute', 'inc', 'campus', 'boulevard', 'newtown', 'square', 'pennsylvania', 'usa', 'phone', 'fax', 'email', 'customercarepmiorg', 'internet', 'wwwpmiorg', 'project', 'management', 'institute', 'inc', 'all', 'rights', 'reserved', 'pmi', 'the', 'pmi', 'logo', 'pmp', 'the', 'pmp', 'logo', 'pmbok', 'pgmp', 'project', 'management', 'journal', 'pm', 'network']


Stopword Removal and Lemmatization

In [15]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Remove stopwords and apply lemmatization
processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

print(processed_tokens[:50])  # Display the first 50 processed tokens


['project', 'management', 'institute', 'practice', 'standard', 'project', 'risk', 'management', 'isbn', 'published', 'project', 'management', 'institute', 'inc', 'campus', 'boulevard', 'newtown', 'square', 'pennsylvania', 'usa', 'phone', 'fax', 'email', 'customercarepmiorg', 'internet', 'wwwpmiorg', 'project', 'management', 'institute', 'inc', 'right', 'reserved', 'pmi', 'pmi', 'logo', 'pmp', 'pmp', 'logo', 'pmbok', 'pgmp', 'project', 'management', 'journal', 'pm', 'network', 'pmi', 'today', 'logo', 'registered', 'mark']


Part-of-Speech (POS) Tagging

In [17]:
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger')

# Perform POS tagging
pos_tags = pos_tag(processed_tokens)

print(pos_tags[:20])  # Display the first 20 tokens and their POS tags


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\saifz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('project', 'NN'), ('management', 'NN'), ('institute', 'NN'), ('practice', 'NN'), ('standard', 'NN'), ('project', 'NN'), ('risk', 'NN'), ('management', 'NN'), ('isbn', 'NN'), ('published', 'VBN'), ('project', 'NN'), ('management', 'NN'), ('institute', 'NN'), ('inc', 'NN'), ('campus', 'NN'), ('boulevard', 'IN'), ('newtown', 'JJ'), ('square', 'NN'), ('pennsylvania', 'NN'), ('usa', 'IN')]


TF-IDF and Levenshtein Distance

In [18]:
# TF-IDF calculation
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform([' '.join(processed_tokens)])

# Display the top words by TF-IDF
tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), X.toarray()[0]))
sorted_tfidf = sorted(tfidf_scores.items(), key=lambda item: item[1], reverse=True)

print(sorted_tfidf[:20])  # Display top 20 words with highest TF-IDF scores

# Example Levenshtein distance calculation between two words
word1 = "project"
word2 = "management"
distance = Levenshtein.distance(word1, word2)
print(f'Levenshtein distance between "{word1}" and "{word2}": {distance}')


[('risk', 0.714855653041942), ('project', 0.4912494888428178), ('management', 0.29073076798356495), ('process', 0.12698093836929233), ('analysis', 0.10902403799383685), ('response', 0.10346594978238634), ('standard', 0.09534259008872792), ('plan', 0.08508150415989621), ('practice', 0.08337132317175759), ('pmp', 0.0795234159484457), ('objective', 0.0701174205136833), ('technique', 0.06413178705519815), ('may', 0.06156651557299022), ('identi', 0.06028387983188626), ('institute', 0.055153336867470405), ('chapter', 0.05130542964415852), ('action', 0.05002279390305456), ('stakeholder', 0.044037160444569394), ('impact', 0.041899434209396125), ('example', 0.04061679846829216)]
Levenshtein distance between "project" and "management": 8


Chunking for Syntactic Structure

In [19]:
# Perform chunking
chunked = ne_chunk(pos_tags)

print(chunked[:20])  # Display first 20 chunked items


[('project', 'NN'), ('management', 'NN'), ('institute', 'NN'), ('practice', 'NN'), ('standard', 'NN'), ('project', 'NN'), ('risk', 'NN'), ('management', 'NN'), ('isbn', 'NN'), ('published', 'VBN'), ('project', 'NN'), ('management', 'NN'), ('institute', 'NN'), ('inc', 'NN'), ('campus', 'NN'), ('boulevard', 'IN'), ('newtown', 'JJ'), ('square', 'NN'), ('pennsylvania', 'NN'), ('usa', 'IN')]


 Evaluating the Extraction Process