In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.11


In [None]:
import fitz
import re
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [None]:
# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

In [None]:

# Boilerplate footer printed on the pages; any line containing it is dropped.
footer_text = "©2009 Project Management Institute. Practice Standard for Project Risk Management"
FIRST_PAGE_INDEX = 13  # index of the first page that is processed
FOOTER_MARGIN = 50  # lines whose bbox[3] lies within this distance of the page height are dropped
CHAPTER_PATTERN = re.compile(r'^(CHAPTER\s+\d+)')  # captures the heading without trailing text

current_chapter = None  # Last chapter heading that was written to the output
cleaned_lines = []  # Kept lines in document order; joined once at the end

# The context manager closes the document even when extraction fails part-way.
with fitz.open('practice-standard-project-risk-management.pdf') as pdf_document:
    for page_number in range(FIRST_PAGE_INDEX, pdf_document.page_count):
        page = pdf_document[page_number]
        text_dict = page.get_text("dict")
        if not text_dict:
            continue

        footer_limit = page.rect.height - FOOTER_MARGIN
        page_lines = []
        for block in text_dict["blocks"]:
            if "lines" not in block:  # only blocks carrying "lines" hold text
                continue
            for line in block["lines"]:
                # Only the vertical position is tested: anything reaching into the
                # bottom margin of the page is treated as footer material.
                if line["bbox"][3] >= footer_limit:
                    continue
                line_text = " ".join(span["text"] for span in line["spans"])
                # Drops the footer and every line that mentions 'Figure'.
                if footer_text in line_text or 'Figure' in line_text:
                    continue
                page_lines.append(line_text)

        # Re-split on newlines so a span that itself contains '\n' yields separate lines.
        for raw_line in "\n".join(page_lines).split('\n'):
            stripped_line = raw_line.strip()

            # Skip blank lines and lines that consist only of digits.
            if not stripped_line or stripped_line.isdigit():
                continue

            chapter_match = CHAPTER_PATTERN.match(stripped_line)
            if chapter_match:
                chapter_title = chapter_match.group(1)
                # Emit a chapter heading only when it differs from the previous one.
                if chapter_title != current_chapter:
                    current_chapter = chapter_title
                    cleaned_lines.append(chapter_title)
            else:
                cleaned_lines.append(stripped_line)

# Every kept line is terminated by a newline, as before.
cleaned_text = "".join(kept_line + "\n" for kept_line in cleaned_lines)

# Output the cleaned text
print(cleaned_text)

CHAPTER 1
1.1 Purpose of the  Practice Standard for Project Risk Management
The purpose of the   Practice Standard for Project Risk Management   is to (  a   ) provide a standard for project
management practitioners and other stakeholders that deﬁ nes the aspects of Project Risk Management that
are recognized as good practice on most projects most of the time and (  b   ) provide a standard that is globally
applicable and consistently applied. This practice standard has a descriptive purpose rather than one used for
training or educational purposes.
The   Practice Standard for Project Risk Management   covers risk management as it is applied to single
projects only. Like the   PMBOK    ®    Guide   – Fourth Edition, this practice standard does not cover risk in programs
or portfolios of projects.
Chapter 11 of the   PMBOK    ®    Guide –   Fourth Edition, is the basis for the   Practice Standard for Project Risk
Management  . This practice standard is consistent with that chapter, emph

In [None]:


def divide_into_sections_and_phrases_df(text):
    """Split chapter-marked text into one DataFrame row per section.

    The text is cut at every occurrence of the literal ``CHAPTER``; whatever
    precedes the first occurrence is ignored. Within a chapter, a line that
    starts with a number (optionally ``N.M``) followed by whitespace, or with
    ``APPENDIX``, opens a new section. Lines before the first section header of
    a chapter are discarded, as are lines of at most one character and lines
    made up solely of the characters `` .-_•``.

    Args:
        text: Cleaned document text with ``CHAPTER <n>`` heading lines.

    Returns:
        A tuple ``(df, outline)``:
        * ``df`` has the columns ``Chapter`` (rest of the heading line after
          ``CHAPTER``), ``Section Header``, ``Paragraph`` (the header line and
          all following lines of the section, each followed by one space) and
          ``Sentences`` (``split_into_sentences`` applied to the paragraph).
        * ``outline`` is a string containing the chapter headings, the section
          headers and, per chapter, the paragraph of its last section.
          NOTE(review): only the last section's paragraph of each chapter is
          written to this string — confirm whether that is intended.
    """
    numbered_header = re.compile(r'^\d+(\.\d+)?\s')
    appendix_header = re.compile(r'^APPENDIX\s+\w*')

    records = []  # one dict per finished section
    outline_parts = []  # pieces of the second return value

    def add_record(chapter_title, section_header, paragraph):
        # Single place where a finished section becomes a row.
        records.append({
            'Chapter': chapter_title,
            'Section Header': section_header,
            'Paragraph': paragraph,
            'Sentences': split_into_sentences(paragraph),
        })

    # The first piece is whatever came before the first chapter marker.
    for section in text.split('CHAPTER')[1:]:
        lines = section.split('\n')
        chapter_title = lines[0].strip()
        outline_parts.append("CHAPTER " + chapter_title + "\n")

        current_section = None
        paragraph_lines = []  # lines of the section being collected (header included)

        for line in lines[1:]:
            line = line.strip()

            # Ignore empty/one-character lines and pure filler such as dot leaders.
            if len(line) <= 1 or all(char in ' .-_•' for char in line):
                continue

            if numbered_header.match(line) or appendix_header.match(line):
                if current_section is not None:
                    add_record(chapter_title, current_section,
                               "".join(part + " " for part in paragraph_lines))
                    paragraph_lines = []
                current_section = line
                outline_parts.append(line + "\n")

            # The header line itself is part of the paragraph, so this also runs for headers.
            if current_section is not None:
                paragraph_lines.append(line)

        # Flush the section that was still open when the chapter ended.
        if current_section is not None:
            paragraph = "".join(part + " " for part in paragraph_lines)
            add_record(chapter_title, current_section, paragraph)
            outline_parts.append(paragraph + "\n")

    return pd.DataFrame(records), "".join(outline_parts).strip()

def split_into_phrases(paragraph):
    """Split a paragraph into phrases.

    The paragraph is first cut at sentence-ending punctuation (``.``, ``!``,
    ``?``) and each piece is then cut at ``,``, ``:`` and ``;``. A period that
    is directly followed by a digit is not treated as a sentence end, so
    section numbers and decimals such as ``1.1`` or ``2.5`` stay intact.

    Args:
        paragraph: Text to split.

    Returns:
        List of stripped, non-empty phrases in their original order.
    """
    phrases = []
    for sentence in re.split(r'\.(?!\d)|[!?]', paragraph):
        for phrase in re.split(r'[,:;]', sentence):
            phrase = phrase.strip()
            if phrase:
                phrases.append(phrase)
    return phrases

def split_into_sentences(paragraph):
    """Split a paragraph into sentences at ``.``, ``!`` and ``?``.

    A period that is directly followed by a digit is not a split point, so
    section numbers and decimals (``1.1``, ``D.6.1.2``, ``2.5``) are no longer
    torn into separate one-character "sentences".

    Args:
        paragraph: Text to split.

    Returns:
        List of stripped, non-empty sentences without their terminating punctuation.
    """
    pieces = re.split(r'\.(?!\d)|[!?]', paragraph)
    return [piece.strip() for piece in pieces if piece.strip()]


In [None]:

df, summary = divide_into_sections_and_phrases_df(cleaned_text)


df['Sentences'][0]

['1',
 '1 Purpose of the  Practice Standard for Project Risk Management The purpose of the   Practice Standard for Project Risk Management   is to (  a   ) provide a standard for project management practitioners and other stakeholders that deﬁ nes the aspects of Project Risk Management that are recognized as good practice on most projects most of the time and (  b   ) provide a standard that is globally applicable and consistently applied',
 'This practice standard has a descriptive purpose rather than one used for training or educational purposes',
 'The   Practice Standard for Project Risk Management   covers risk management as it is applied to single projects only',
 'Like the   PMBOK    ®    Guide   – Fourth Edition, this practice standard does not cover risk in programs or portfolios of projects',
 'Chapter 11 of the   PMBOK    ®    Guide –   Fourth Edition, is the basis for the   Practice Standard for Project Risk Management',
 'This practice standard is consistent with that chap

In [None]:
# Splitting on '\n' and re-joining with '\n' returns the same string, so use it directly.
full_text = cleaned_text


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
doc = nlp(full_text)

print(doc)

CHAPTER 1
1.1 Purpose of the  Practice Standard for Project Risk Management
The purpose of the   Practice Standard for Project Risk Management   is to (  a   ) provide a standard for project
management practitioners and other stakeholders that deﬁ nes the aspects of Project Risk Management that
are recognized as good practice on most projects most of the time and (  b   ) provide a standard that is globally
applicable and consistently applied. This practice standard has a descriptive purpose rather than one used for
training or educational purposes.
The   Practice Standard for Project Risk Management   covers risk management as it is applied to single
projects only. Like the   PMBOK    ®    Guide   – Fourth Edition, this practice standard does not cover risk in programs
or portfolios of projects.
Chapter 11 of the   PMBOK    ®    Guide –   Fourth Edition, is the basis for the   Practice Standard for Project Risk
Management  . This practice standard is consistent with that chapter, emph

In [None]:
def preprocess_text(text):
    """Normalize text and return its lemmatized content tokens.

    Steps: lowercase; remove URLs, bracketed references and words containing
    digits; remove every remaining character that is not ``a-z`` or
    whitespace; run the module-level spaCy pipeline ``nlp``; keep the stripped
    lemma of every token that is not a stop word, punctuation or whitespace.

    Args:
        text: Raw text.

    Returns:
        List of non-empty lemma strings in document order.
    """
    text = text.lower()

    # The targeted removals must run before the catch-all strip below: once
    # brackets, digits and URL punctuation are gone these patterns cannot match.
    text = re.sub(r'http\S+', ' ', text)  # URLs
    text = re.sub(r'\[.*?\]', '', text)  # Bracketed references
    text = re.sub(r'\w*\d\w*', '', text)  # Words with digits
    text = re.sub(r'[^a-z\s]', '', text)  # Everything else that is not a-z or whitespace

    doc = nlp(text)

    tokens = []
    for token in doc:
        # Runs of spaces and newlines are tokens too; without the is_space test
        # they would end up as empty strings in the result.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.strip()
        if lemma:
            tokens.append(lemma)

    return tokens

In [None]:
def extract_most_common_ngrams(text, n=3, top_n=3, min_count=10):
    """Extract the most frequent all-noun n-grams (2..n tokens) from the text.

    The text is preprocessed with ``preprocess_text``; every window of 2 to
    ``n`` consecutive tokens is a candidate. A candidate is kept when it occurs
    at least ``min_count`` times and spaCy tags each of its tokens as ``NOUN``.
    When several kept n-grams share the same count, only the one with the most
    characters survives (the first seen wins a tie in length).
    NOTE(review): this per-count de-duplication discards distinct n-grams that
    merely happen to be equally frequent — confirm that this is wanted.

    Args:
        text: Raw text.
        n: Largest n-gram size; sizes start at 2.
        top_n: Maximum number of n-grams returned.
        min_count: Minimum number of occurrences for an n-gram to be kept.

    Returns:
        A tuple ``(most_common, processed_tokens)`` where ``most_common`` is a
        list of ``(ngram, count)`` pairs sorted by count, highest first, and
        ``processed_tokens`` is the token list from ``preprocess_text``.
    """
    # Local import: elsewhere in the notebook Counter is only imported in a later cell.
    from collections import Counter

    processed_tokens = preprocess_text(text)
    token_total = len(processed_tokens)

    # Count all candidates first so that each distinct n-gram is POS-tagged once
    # instead of once per occurrence.
    candidate_counts = Counter()
    for start in range(token_total):
        for size in range(2, n + 1):
            if start + size > token_total:
                break
            window = processed_tokens[start:start + size]
            # An empty token would make the joined string collapse to a shorter
            # n-gram (even a single word) and count it a second time.
            if not all(window):
                continue
            candidate_counts[" ".join(window).strip()] += 1

    # Keep the longest n-gram for each distinct count.
    longest_by_count = {}
    for ngram, count in candidate_counts.items():
        if count < min_count:
            continue
        if not all(token.pos_ == 'NOUN' for token in nlp(ngram)):
            continue
        kept = longest_by_count.get(count)
        if kept is None or len(ngram) > len(kept):
            longest_by_count[count] = ngram

    # Sort on the count (second tuple element), not on the n-gram text.
    ranked = sorted(
        ((ngram, count) for count, ngram in longest_by_count.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return ranked[:top_n], processed_tokens

In [None]:
from collections import Counter
most_common_ngrams, tokens = extract_most_common_ngrams(full_text, n=4, top_n=150)

# Display the most common n-grams
print("Most Common N-grams:")
for phrase, count in most_common_ngrams:
    print(f" ******'{phrase}' occurs {count} times \n")


In [None]:
def calculate_tfidf_scores(text, ngrams, ngram_range=(2, 4)):
    """Look up TF-IDF scores for the given n-grams.

    A ``TfidfVectorizer`` is fitted on ``text`` as a single document and each
    requested n-gram is looked up by exact string among the vectorizer's
    feature names; an n-gram that is not a feature gets the score 0.
    NOTE(review): with only one document the IDF part cannot discriminate
    between terms, so the scores presumably reflect normalized term frequency
    only — confirm this is acceptable.

    Args:
        text: Document the vectorizer is fitted on.
        ngrams: Iterable of ``(ngram, count)`` pairs; only the n-gram is used.
        ngram_range: ``(min_n, max_n)`` word n-gram sizes for the vectorizer.

    Returns:
        List of ``(ngram, score)`` pairs in the order of ``ngrams``.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    matrix = vectorizer.fit_transform([text])

    # One row only, because a single document was fitted.
    score_by_feature = dict(zip(vectorizer.get_feature_names_out(), matrix.toarray()[0]))

    return [(ngram, score_by_feature.get(ngram, 0)) for ngram, _ in ngrams]

In [None]:
ngram_tfidf_scores = calculate_tfidf_scores(full_text, most_common_ngrams)
ngram_tfidf_scores

[('weakness', 0),
 ('trigger condition', 0.00550723586437476),
 ('tool technique', 0.001101447172874952),
 ('technique example template', 0),
 ('technique', 0),
 ('strength', 0),
 ('stakeholder', 0),
 ('risk response', 0.047362228433622935),
 ('risk monitoring control', 0),
 ('risk management process', 0.0550723586437476),
 ('risk management planning', 0.015420260420249328),
 ('risk management plan', 0.04515933408787303),
 ('risk management activity', 0.001101447172874952),
 ('risk management', 0.3601732255301093),
 ('risk identi', 0),
 ('risk breakdown structure', 0.018724601938874184),
 ('risk analysis', 0.12776787205349444),
 ('risk action owner', 0.012115918901624471),
 ('risk', 0),
 ('response strategy', 0.009913024555874568),
 ('response process', 0),
 ('response', 0),
 ('purpose objective', 0),
 ('project team', 0.024231837803248942),
 ('project risk management process', 0.03634775670487342),
 ('project risk management', 0.18394167787011698),
 ('project risk', 0.2511299554154890

In [None]:
filtered_ngrams = [(ngram, score) for ngram, score in ngram_tfidf_scores if score > 0.001]
filtered_ngrams


[('trigger condition', 0.00550723586437476),
 ('tool technique', 0.001101447172874952),
 ('risk response', 0.047362228433622935),
 ('risk management process', 0.0550723586437476),
 ('risk management planning', 0.015420260420249328),
 ('risk management plan', 0.04515933408787303),
 ('risk management activity', 0.001101447172874952),
 ('risk management', 0.3601732255301093),
 ('risk breakdown structure', 0.018724601938874184),
 ('risk analysis', 0.12776787205349444),
 ('risk action owner', 0.012115918901624471),
 ('response strategy', 0.009913024555874568),
 ('project team', 0.024231837803248942),
 ('project risk management process', 0.03634775670487342),
 ('project risk management', 0.18394167787011698),
 ('project risk', 0.25112995541548905),
 ('project plan', 0.007710130210124664),
 ('project objective', 0.015420260420249328),
 ('project manager', 0.030840520840498657),
 ('project management process', 0.001101447172874952),
 ('project management plan', 0.023130390630373993),
 ('projec

In [None]:
def custom_tokenizer(nlp, most_common_phrases):
    """Create a tokenizer function that keeps the given multi-word phrases together.

    The returned function lowercases its input, rewrites every whole-word
    occurrence of a phrase so that its spaces become underscores, and then
    tokenizes the result with a spaCy ``Tokenizer`` built from the pipeline's
    default prefix, suffix and infix rules.

    Args:
        nlp: Loaded spaCy pipeline; its vocab and default rules are used.
        most_common_phrases: Iterable of ``(phrase, score)`` pairs; only the
            phrase is used.

    Returns:
        A callable ``text -> Doc``.
    """
    # Local import: elsewhere in the notebook Tokenizer is only imported in a later cell.
    from spacy.tokenizer import Tokenizer

    # Phrases with more words go first so that a shorter phrase cannot claim
    # part of a longer one.
    ordered_phrases = sorted(most_common_phrases, key=lambda x: len(x[0].split()), reverse=True)

    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)

    # Built once here rather than on every call of the returned function.
    base_tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search,
                               infix_finditer=infix_re.finditer)

    # Pre-compiled (pattern, replacement) pairs, in matching order.
    replacements = [
        (re.compile(r'\b' + re.escape(phrase.lower()) + r'\b'), phrase.replace(" ", "_"))
        for phrase, _ in ordered_phrases
    ]

    def custom_tokenizer_function(text):
        # Patterns are lowercase, so the text has to be as well.
        text = text.lower()
        for pattern, joined in replacements:
            # A function replacement inserts the string literally (no escape processing).
            text = pattern.sub(lambda _match, joined=joined: joined, text)
        return base_tokenizer(text)

    return custom_tokenizer_function

In [None]:
from spacy.tokenizer import Tokenizer

nlp.tokenizer = custom_tokenizer(nlp, filtered_ngrams)

In [None]:
def tokenize_sentences(sentences):
    """Tokenize each sentence with the module-level ``nlp`` pipeline.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        One list of token texts per sentence; punctuation, stop-word and
        whitespace tokens are left out.
    """
    def is_content(token):
        return not (token.is_punct or token.is_stop or token.is_space)

    return [
        [token.text for token in nlp(sentence) if is_content(token)]
        for sentence in sentences
    ]

In [None]:
df['Tokenized Sentences'] = df['Sentences'].apply(tokenize_sentences)

In [None]:
df['Tokenized Sentences'][0]

In [None]:
csv_file_path = 'dataframe.csv'
df.to_csv(csv_file_path, sep=';', index=False)

In [None]:
# Reload the frame from the ';'-separated CSV file. Values that were Python lists when
# written come back as strings, which is why later code parses them with ast.literal_eval.
df = pd.read_csv("dataframe.csv", delimiter=';')

In [None]:
df

Unnamed: 0,Chapter,Section Header,Paragraph,Sentences,Tokenized Sentences
0,1,1.1 Purpose of the Practice Standard for Proj...,1.1 Purpose of the Practice Standard for Proj...,"['1', '1 Purpose of the Practice Standard for...","[['1'], ['1', 'purpose', 'practice', 'standard..."
1,1,1.2 Project Risk Management Deﬁ nition,1.2 Project Risk Management Deﬁ nition The deﬁ...,"['1', '2 Project Risk Management Deﬁ nition Th...","[['1'], ['2', 'project_risk_management', 'deﬁ'..."
2,1,1.3 Role of Project Risk Management in Project...,1.3 Role of Project Risk Management in Project...,"['1', '3 Role of Project Risk Management in Pr...","[['1'], ['3', 'role', 'project_risk_management..."
3,1,1.4 Good Risk Management Practice,1.4 Good Risk Management Practice Project Risk...,"['1', '4 Good Risk Management Practice Project...","[['1'], ['4', 'good', 'risk_management', 'prac..."
4,1,1.5 Critical Success Factors for Project Risk ...,1.5 Critical Success Factors for Project Risk ...,"['1', '5 Critical Success Factors for Project ...","[['1'], ['5', 'critical', 'success', 'factors'..."
...,...,...,...,...,...
83,9,APPENDIX D,APPENDIX D Technique Strengths Weaknesses CSFs...,['APPENDIX D Technique Strengths Weaknesses CS...,"[['appendix', 'd', 'technique', 'strengths', '..."
84,9,APPENDIX D,APPENDIX D Technique Strengths Weaknesses CSFs...,['APPENDIX D Technique Strengths Weaknesses CS...,"[['appendix', 'd', 'technique', 'strengths', '..."
85,9,APPENDIX D,APPENDIX D D.6.1.2 Risk Audits Risk audits are...,"['APPENDIX D D', '6', '1', '2 Risk Audits Risk...","[['appendix', 'd', 'd'], ['6'], ['1'], ['2', '..."
86,9,APPENDIX D,APPENDIX D Risks responded to in the last peri...,['APPENDIX D Risks responded to in the last pe...,"[['appendix', 'd', 'risks', 'responded', 'peri..."


In [None]:
df['Tokenized Sentences'][0]

"[['1'], ['1', 'purpose', 'practice', 'standard', 'project_risk_management', 'purpose', 'practice', 'standard', 'project_risk_management', 'provide', 'standard', 'project_management', 'practitioners', 'stakeholders', 'deﬁ', 'nes', 'aspects', 'project_risk_management', 'recognized', 'good', 'practice', 'projects', 'time', 'b', 'provide', 'standard', 'globally', 'applicable', 'consistently', 'applied'], ['practice', 'standard', 'descriptive', 'purpose', 'training', 'educational', 'purposes'], ['practice', 'standard', 'project_risk_management', 'covers', 'risk_management', 'applied', 'single', 'projects'], ['like', 'pmbok', '®', 'guide', 'fourth', 'edition', 'practice', 'standard', 'cover', 'risk', 'programs', 'portfolios', 'projects'], ['chapter', '11', 'pmbok', '®', 'guide', 'fourth', 'edition', 'basis', 'practice', 'standard', 'project_risk_management'], ['practice', 'standard', 'consistent', 'chapter', 'emphasizing', 'concepts', 'principles', 'relating', 'project_risk_management'], ['

In [None]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers, keybert
Successfully installed keybert-0.8.5 sentence-transformers-3.1.1


In [None]:
from keybert import KeyBERT
from nltk.corpus import wordnet
import nltk

# Download WordNet data
nltk.download('wordnet')
nltk.download('omw-1.4')

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import pandas as pd
from keybert import KeyBERT
from nltk.corpus import wordnet
import nltk
import ast

# Ensure WordNet is downloaded
nltk.download('wordnet')

# Initialize KeyBERT model
model = KeyBERT()

# Function to get synonyms from WordNet
def get_synonyms(keyword, max_synonyms=5):
    """Return up to ``max_synonyms`` distinct WordNet lemma names for a keyword.

    Names are collected over all synsets of ``keyword`` in the order WordNet
    yields them, so the selection is the same on every run (slicing a ``set``
    is not). The result can contain the keyword itself.

    Args:
        keyword: Word to look up.
        max_synonyms: Maximum number of names returned.

    Returns:
        List of lemma names; empty when WordNet has no synset for the keyword.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    names = dict.fromkeys(
        lemma.name()
        for syn in wordnet.synsets(keyword)
        for lemma in syn.lemmas()
    )
    return list(names)[:max_synonyms]

# Function to extract keywords and their synonyms from tokenized sentences
def extract_keywords_and_synonyms(tokenized_sentences, verbose=True):
    """Extract KeyBERT keywords and their WordNet synonyms for each sentence.

    Args:
        tokenized_sentences: Sentences as lists of tokens, given either as a
            list/tuple or as the string representation of such a list (the
            form the column has after a CSV round trip).
        verbose: When true, print the text passed to KeyBERT and the keywords
            it returned for every sentence.

    Returns:
        A tuple ``(keywords_per_sentence, synonyms_per_sentence)``. Both lists
        have one entry per sentence: a list of keywords, and a list holding one
        synonym list per keyword. Sentences without text yield empty lists.
    """
    # Parse only when the value is not a list already; anything else goes
    # through literal_eval as before.
    if not isinstance(tokenized_sentences, (list, tuple)):
        tokenized_sentences = ast.literal_eval(tokenized_sentences)

    keywords_per_sentence = []
    synonyms_per_sentence = []

    for sentence_tokens in tokenized_sentences:
        text = ' '.join(sentence_tokens).strip()

        if verbose:
            print(f"Text for keyword extraction: '{text}'")

        if not text:
            # Keep the per-sentence alignment with empty placeholders.
            keywords_per_sentence.append([])
            synonyms_per_sentence.append([])
            continue

        # NOTE(review): per the KeyBERT documentation `diversity` only takes
        # effect together with use_mmr=True — confirm which strategy is wanted.
        keywords = model.extract_keywords(text, top_n=5, use_maxsum=True, diversity=0.7)
        if verbose:
            print(f"Extracted keywords: {keywords}")

        # Each result is a (keyword, score) tuple; only the keyword is kept.
        keywords_list = [kw[0] for kw in keywords]
        keywords_per_sentence.append(keywords_list)
        synonyms_per_sentence.append([get_synonyms(kw) for kw in keywords_list])

    return keywords_per_sentence, synonyms_per_sentence

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
df['Keywords'], df['Synonyms'] = zip(*df['Tokenized Sentences'].apply(extract_keywords_and_synonyms))

Text for keyword extraction: '1'
Extracted keywords: []
Text for keyword extraction: '1 purpose practice standard project_risk_management purpose practice standard project_risk_management provide standard project_management practitioners stakeholders deﬁ nes aspects project_risk_management recognized good practice projects time b provide standard globally applicable consistently applied'
Extracted keywords: [('nes', -0.0293), ('consistently', 0.0693), ('provide', 0.0862), ('aspects', 0.1599), ('project_risk_management', 0.6121)]
Text for keyword extraction: 'practice standard descriptive purpose training educational purposes'
Extracted keywords: [('standard', 0.2118), ('descriptive', 0.322), ('practice', 0.3741), ('purposes', 0.4388), ('training', 0.5164)]
Text for keyword extraction: 'practice standard project_risk_management covers risk_management applied single projects'
Extracted keywords: [('single', 0.0522), ('standard', 0.1659), ('practice', 0.1818), ('covers', 0.2196), ('projec

In [None]:
df['Keywords'][0]

[[],
 ['nes', 'consistently', 'provide', 'aspects', 'project_risk_management'],
 ['standard', 'descriptive', 'practice', 'purposes', 'training'],
 ['single', 'standard', 'practice', 'covers', 'project_risk_management'],
 ['fourth', 'like', 'practice', 'pmbok', 'portfolios'],
 ['11', 'practice', 'pmbok', 'basis', 'project_risk_management'],
 ['relating',
  'emphasizing',
  'standard',
  'practice',
  'project_risk_management'],
 [],
 [],
 ['practice', 'standard', 'main', 'organized', 'sections'],
 ['including', 'ned', 'introductory', 'pmbok', 'project_risk_management'],
 [],
 ['fourth', 'underlying', 'processes', 'pmbok', 'project_risk_management'],
 ['monitor', 'perform', 'plan', 'qualitative', 'risk_management'],
 ['addresses', 'factors', 'critical', 'success', 'documenting'],
 [],
 [],
 ['fundamental',
  'successful',
  'standard',
  'practice',
  'project_risk_management'],
 ['stated', 'general', 'level', 'reasons', 'principles'],
 ['agreed', 'expected', 'valid', 'future', 'principl

In [None]:
df['Tokenized Sentences'][0]

"[['1'], ['1', 'purpose', 'practice', 'standard', 'project_risk_management', 'purpose', 'practice', 'standard', 'project_risk_management', 'provide', 'standard', 'project_management', 'practitioners', 'stakeholders', 'deﬁ', 'nes', 'aspects', 'project_risk_management', 'recognized', 'good', 'practice', 'projects', 'time', 'b', 'provide', 'standard', 'globally', 'applicable', 'consistently', 'applied'], ['practice', 'standard', 'descriptive', 'purpose', 'training', 'educational', 'purposes'], ['practice', 'standard', 'project_risk_management', 'covers', 'risk_management', 'applied', 'single', 'projects'], ['like', 'pmbok', '®', 'guide', 'fourth', 'edition', 'practice', 'standard', 'cover', 'risk', 'programs', 'portfolios', 'projects'], ['chapter', '11', 'pmbok', '®', 'guide', 'fourth', 'edition', 'basis', 'practice', 'standard', 'project_risk_management'], ['practice', 'standard', 'consistent', 'chapter', 'emphasizing', 'concepts', 'principles', 'relating', 'project_risk_management'], ['

In [None]:
df['Synonyms'][0]

[[],
 [['Ne', 'Cornhusker_State', 'northeast', 'neon', 'atomic_number_10'],
  ['systematically', 'consistently'],
  ['supply', 'leave', 'put_up', 'cater', 'render'],
  ['view', 'facial_expression', 'aspect', 'look', 'facet'],
  []],
 [['monetary_standard', 'received', 'measure', 'banner', 'stock'],
  ['descriptive'],
  ['recitation', 'exercise', 'apply', 'commit', 'do'],
  ['purpose', 'aim', 'propose', 'purport', 'resolve'],
  ['breeding', 'develop', 'train', 'school', 'rail']],
 [['individual', 'I', 'single', 'unmarried', 'undivided'],
  ['monetary_standard', 'received', 'measure', 'banner', 'stock'],
  ['recitation', 'exercise', 'apply', 'commit', 'do'],
  ['covering', 'embrace', 'shroud', 'covert', 'handle'],
  []],
 [['4th', 'one-quarter', 'one-fourth', 'fourth_part', 'fourth'],
  ['ilk', 'alike', 'the_likes_of', 'comparable', 'same'],
  ['recitation', 'exercise', 'apply', 'commit', 'do'],
  [],
  ['portfolio']],
 [['eleven', '11', 'XI', 'xi'],
  ['recitation', 'exercise', 'apply',

In [None]:
csv_file_path = 'dataframeKeyWords.csv'
df.to_csv(csv_file_path, sep=';', index=False)