In [6]:
import fitz  # PyMuPDF
import re
import spacy
import pymupdf
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pdfplumber
import pandas as pd
from tqdm import tqdm

In [7]:
df=pd.read_csv('df.csv')

-------------------------data cleaning--------------------------

In [None]:
import re

def remove_special_characters_and_brackets(text):
    # Remove content inside brackets, including the brackets themselves
    text_no_brackets = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
    
    # Regular expression to match special characters except punctuation
    pattern = r"[^a-zA-Z0-9\s.,!?;:]"
    
    # Replace special characters with an empty string
    cleaned_text = re.sub(pattern, "", text_no_brackets)
    
    return cleaned_text

In [None]:
df['processed_paragraphs'] = df['paragraphs'].apply(remove_special_characters_and_brackets)

In [None]:
# Function to remove sentences containing the specified phrases
def remove_unwanted_statements(paragraph):
    # Split the paragraph into sentences
    sentences = paragraph.split('. ')  # You can adjust the delimiter based on the text format
    
    # Keep only sentences that don't contain the unwanted phrases
    filtered_sentences = [sentence for sentence in sentences if 
                          "Project Management Institute" not in sentence and 
                          "Practice Standard for Project Risk Management" not in sentence and
                          "Chapter" not in sentence and 
                          "CHAPTER" not in sentence  and
                          "Figure" not in sentence ]
    
    # Recombine the sentences into a paragraph
    return '. '.join(filtered_sentences)

# Apply the function to each row in the preprocessed_paragraph column
df['processed_paragraphs'] = df['processed_paragraphs'].apply(remove_unwanted_statements)

In [None]:
# Remove paragraphs that match "noparagraph" or contain fewer than 2 sentences
df['processed_paragraphs'] = df['processed_paragraphs'].apply(lambda paragraph: None if paragraph == "noparagraph"
                                                               or len(paragraph.split('. ')) < 2 else paragraph)

# Drop rows where 'processed_paragraphs' is None (i.e., removed)
df = df.dropna(subset=['processed_paragraphs']).reset_index(drop=True)
df=pd.DataFrame(df)

-----------------------extract bigrams & trigrams----------------------------

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
from nltk.corpus import stopwords
# nltk.download('all')

# Example sentence

def extract_all_bigrams_and_trigrams(dataframe):
# Tokenize the sentence
        list_bigrams = []
        list_trigrams = []
        stop_words = set(stopwords.words('english'))
        for i in range(len(dataframe)):
            tokens = word_tokenize(dataframe.processed_paragraphs[i].lower())
            # Remove stop words from tokens
            filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
            # Generate bigrams (n=2)
            bigrams = list(ngrams(filtered_tokens, 2))
            
            # print("Bigrams:", bigrams[:10])

            # Generate trigrams (n=3)
            trigrams = list(ngrams(filtered_tokens, 3))
            
            # print("Trigrams:", trigrams[:10])

            # Count the frequency of bigrams
            bigram_freq = Counter(bigrams)
            # print("Bigram Frequencies:", bigram_freq)
            trigrams_freq = Counter(trigrams)
            list_bigrams.append(bigram_freq)
            list_trigrams.append(trigrams_freq)
        return list_bigrams,list_trigrams
            # print("trigrams Frequencies:", trigrams_freq)

In [9]:
import spacy
from collections import Counter

# Load spaCy English model for POS tagging
nlp = spacy.load("en_core_web_trf")

# Step 1: Function to filter bigrams containing only nouns
def filter_ngrams_nouns(bigram_counter):
    filtered_bigrams = Counter()  # Initialize an empty Counter to store filtered bigrams
    
    for bigram, freq in bigram_counter.items():
        doc = nlp(" ".join(bigram))  # POS tag the bigram
        
        # Check if both tokens in the bigram are nouns
        if all(token.pos_ == "NOUN" for token in doc):
            filtered_bigrams[bigram] = freq  # Add only noun-based bigrams to the new Counter
    
    return filtered_bigrams


  model.load_state_dict(torch.load(filelike, map_location=device))


In [10]:
list_bigrams,list_trigrams=extract_all_bigrams_and_trigrams(df)
list_filtered_bigrams = []
list_filtered_trigrams = []
for counter in list_bigrams:
    filtered_bigrams = filter_ngrams_nouns(counter)
    list_filtered_bigrams.append(filtered_bigrams)
for counter in list_trigrams:
    filtered_trigrams = filter_ngrams_nouns(counter)
    list_filtered_trigrams.append(filtered_trigrams)

print(list_filtered_bigrams)
print(list_filtered_trigrams)



In [11]:
df['bigram_dict'] = list_filtered_bigrams
df['trigram_dict'] = list_filtered_trigrams

In [23]:
def segment_paragraph_into_sentences(paragraph):
    doc = nlp(paragraph)
    sentences = [sent.text for sent in doc.sents]
    sentences = list(filter(lambda s: s.strip(), sentences))
    return sentences

# Apply the segmentation to each paragraph in the DataFrame
df['sentences'] = df['processed_paragraphs'].apply(segment_paragraph_into_sentences)

In [24]:
df.to_csv('df.csv')

In [12]:
def combine_all_bigrams(df, bigram_col):
    combined_counter = Counter()
    
    for dic_bigram in df[bigram_col]:
        combined_counter.update(dic_bigram)
    
    return combined_counter

combine_all_bigrams(df, "bigram_dict")

Counter({('risk', 'management'): 239,
         ('project', 'risk'): 155,
         ('project', 'management'): 63,
         ('risk', 'analysis'): 53,
         ('management', 'process'): 49,
         ('management', 'plan'): 44,
         ('risk', 'response'): 35,
         ('plan', 'risk'): 33,
         ('project', 'objectives'): 32,
         ('management', 'processes'): 30,
         ('risk', 'responses'): 29,
         ('practice', 'standard'): 26,
         ('project', 'manager'): 24,
         ('project', 'stakeholders'): 19,
         ('risks', 'process'): 18,
         ('project', 'team'): 16,
         ('risk', 'owner'): 16,
         ('response', 'planning'): 16,
         ('management', 'activities'): 15,
         ('breakdown', 'structure'): 15,
         ('analysis', 'process'): 15,
         ('templates', 'examples'): 15,
         ('management', 'project'): 13,
         ('probability', 'impact'): 12,
         ('management', 'planning'): 11,
         ('projects', 'objectives'): 11,
         

In [13]:
combine_all_bigrams(df, "trigram_dict")

Counter({('project', 'risk', 'management'): 120,
         ('risk', 'management', 'process'): 46,
         ('risk', 'management', 'plan'): 29,
         ('risk', 'management', 'processes'): 18,
         ('plan', 'risk', 'management'): 18,
         ('project', 'management', 'plan'): 15,
         ('risk', 'management', 'activities'): 14,
         ('risk', 'response', 'planning'): 14,
         ('plan', 'risk', 'responses'): 13,
         ('risk', 'analysis', 'process'): 12,
         ('risk', 'management', 'planning'): 11,
         ('project', 'management', 'processes'): 11,
         ('risk', 'breakdown', 'structure'): 10,
         ('risk', 'responses', 'process'): 9,
         ('effect', 'projects', 'objectives'): 7,
         ('risk', 'action', 'owner'): 7,
         ('effectiveness', 'project', 'risk'): 5,
         ('objectives', 'project', 'risk'): 4,
         ('risk', 'response', 'actions'): 4,
         ('management', 'project', 'risk'): 4,
         ('risk', 'management', 'project'): 4,
   

In [12]:
df.head()

Unnamed: 0,sections,ids,paragraphs,bigram_dict,trigram_dict
0,1.1 Purpose of the Practice Standard for Proje...,4,The purpose of the Practice Standard for Pro...,"{('purpose', 'practice'): 1, ('practice', 'sta...","{('purpose', 'practice', 'standard'): 1, ('pro..."
1,1.2 Project Risk Management Deﬁ nition,66,"The deﬁ nition of Project Risk Management, as...","{('project', 'risk'): 5, ('risk', 'management'...","{('nition', 'project', 'risk'): 1, ('project',..."
2,1.3 Role of Project Risk Management in Project...,79,Project Risk Management is not an optional ac...,"{('project', 'risk'): 11, ('risk', 'management...","{('project', 'risk', 'management'): 8, ('proje..."
3,1.4 Good Risk Management Practice,124,Project Risk Management is a valuable compone...,"{('project', 'risk'): 15, ('risk', 'management...","{('project', 'risk', 'management'): 15, ('comp..."
4,1.5 Critical Success Factors for Project Risk ...,157,Figure 1-2. Critical Success Factors for Proj...,"{('success', 'factors'): 2, ('factors', 'proje...","{('success', 'factors', 'project'): 2, ('proje..."


In [14]:
def combine_bigrams_trigrams(df):
    combined_counter = Counter()
    
    # Combine all bigrams and trigrams from each row
    for idx, row in df.iterrows():
        bigrams = row['bigram_dict']  # Access the Counter object directly
        trigrams = row['trigram_dict']
        
        # Update the combined Counter with bigrams and trigrams
        combined_counter.update(bigrams)
        combined_counter.update(trigrams)
    
    return combined_counter

In [15]:
combined_ngrams = combine_bigrams_trigrams(df)
combined_ngrams

Counter({('risk', 'management'): 239,
         ('project', 'risk'): 155,
         ('project', 'risk', 'management'): 120,
         ('project', 'management'): 63,
         ('risk', 'analysis'): 53,
         ('management', 'process'): 49,
         ('risk', 'management', 'process'): 46,
         ('management', 'plan'): 44,
         ('risk', 'response'): 35,
         ('plan', 'risk'): 33,
         ('project', 'objectives'): 32,
         ('management', 'processes'): 30,
         ('risk', 'responses'): 29,
         ('risk', 'management', 'plan'): 29,
         ('practice', 'standard'): 26,
         ('project', 'manager'): 24,
         ('project', 'stakeholders'): 19,
         ('risk', 'management', 'processes'): 18,
         ('plan', 'risk', 'management'): 18,
         ('risks', 'process'): 18,
         ('project', 'team'): 16,
         ('risk', 'owner'): 16,
         ('response', 'planning'): 16,
         ('project', 'management', 'plan'): 15,
         ('management', 'activities'): 15,
     

In [17]:
def get_top_ngrams(ngram_counter, top_n=20):
    return ngram_counter.most_common(top_n)

In [18]:
top_ngrams = get_top_ngrams(combined_ngrams, top_n=50)
top_ngrams

[(('risk', 'management'), 239),
 (('project', 'risk'), 155),
 (('project', 'risk', 'management'), 120),
 (('project', 'management'), 63),
 (('risk', 'analysis'), 53),
 (('management', 'process'), 49),
 (('risk', 'management', 'process'), 46),
 (('management', 'plan'), 44),
 (('risk', 'response'), 35),
 (('plan', 'risk'), 33),
 (('project', 'objectives'), 32),
 (('management', 'processes'), 30),
 (('risk', 'responses'), 29),
 (('risk', 'management', 'plan'), 29),
 (('practice', 'standard'), 26),
 (('project', 'manager'), 24),
 (('project', 'stakeholders'), 19),
 (('risk', 'management', 'processes'), 18),
 (('plan', 'risk', 'management'), 18),
 (('risks', 'process'), 18),
 (('project', 'team'), 16),
 (('risk', 'owner'), 16),
 (('response', 'planning'), 16),
 (('project', 'management', 'plan'), 15),
 (('management', 'activities'), 15),
 (('breakdown', 'structure'), 15),
 (('analysis', 'process'), 15),
 (('templates', 'examples'), 15),
 (('risk', 'management', 'activities'), 14),
 (('risk'

In [18]:
df =df.copy()

In [19]:
--------------------------


In [25]:
def get_entities(sent):
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""

    #############################################################

    for tok in nlp(sent):
        # Skip punctuation marks
        if tok.dep_ != "punct":
            # Handle compound words (e.g., "risk management")
            if tok.dep_ == "compound":
                prefix = tok.text
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

            # Handle modifiers (e.g., "important project")
            if tok.dep_.endswith("mod"):
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            # Detect subject entities
            if "subj" in tok.dep_:
                ent1 = (modifier + " " + prefix + " " + tok.text).strip()
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # Detect object entities
            if "obj" in tok.dep_:
                ent2 = (modifier + " " + prefix + " " + tok.text).strip()

            # Update previous token info
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    #############################################################
    
    # Return cleaned entity strings
    return [ent1.strip(), ent2.strip()]


In [26]:
get_entities(" Project Risk Management is essential to successful project management.")

['Project Risk Management', 'successful project management']

In [25]:
df['sentences'][0] 

[' The purpose of the  Practice Standard for Project Risk Management  is to  provide a standard for project  management practitioners and other stakeholders that de nes the aspects of Project Risk Management that  are recognized as good practice on most projects most of the time and  provide a standard that is globally  applicable and consistently applied. This practice standard has a descriptive purpose rather than one used for  training or educational purposes.   The  Practice Standard for Project Risk Management  covers risk management as it is applied to single  projects only.',
 'Like the  PMBOK    Guide   Fourth Edition, this practice standard does not cover risk in programs  or portfolios of projects.',
 '  Chapter 11 of the  PMBOK    Guide   Fourth Edition, is the basis for the  Practice Standard for Project Risk  Management .',
 'This practice standard is consistent with that chapter, emphasizing the concepts and principles  relating to Project Risk Management.',
 'It is align

In [27]:
Entity_pairs =[]
for sents in tqdm(df["sentences"][0]):
    Entity_pairs.append(get_entities(sents))
len(Entity_pairs)

100%|██████████| 18/18 [00:01<00:00, 16.44it/s]


18

In [28]:

entity_pairs = []
for i in range (len(df["sentences"])):
  for i in tqdm(df["sentences"][i]):
    entity_pairs.append(get_entities(i))
entity_pairs

100%|██████████| 18/18 [00:01<00:00, 14.15it/s]
100%|██████████| 6/6 [00:00<00:00, 16.14it/s]
100%|██████████| 24/24 [00:01<00:00, 16.67it/s]
100%|██████████| 13/13 [00:00<00:00, 17.78it/s]
100%|██████████| 11/11 [00:00<00:00, 15.24it/s]
100%|██████████| 2/2 [00:00<00:00, 17.80it/s]
100%|██████████| 2/2 [00:00<00:00, 18.18it/s]
100%|██████████| 19/19 [00:01<00:00, 18.90it/s]
100%|██████████| 10/10 [00:00<00:00, 18.77it/s]
100%|██████████| 10/10 [00:00<00:00, 18.79it/s]
100%|██████████| 8/8 [00:00<00:00, 15.90it/s]
100%|██████████| 7/7 [00:00<00:00, 17.85it/s]
100%|██████████| 9/9 [00:00<00:00, 19.36it/s]
100%|██████████| 7/7 [00:00<00:00, 12.86it/s]
100%|██████████| 17/17 [00:00<00:00, 18.86it/s]
100%|██████████| 49/49 [00:02<00:00, 18.07it/s]
100%|██████████| 38/38 [00:02<00:00, 18.57it/s]
100%|██████████| 2/2 [00:00<00:00, 14.46it/s]
100%|██████████| 10/10 [00:00<00:00, 17.79it/s]
100%|██████████| 4/4 [00:00<00:00, 17.81it/s]
100%|██████████| 4/4 [00:00<00:00, 19.16it/s]
100%|███████

[['practice standard', 'rather  training'],
 ['practice standard', 'projects'],
 ['practice standard', 'Project Risk Management'],
 ['It', 'other PMI practice standards'],
 ['Introductory PMBOK Guide', 'Fourth  Edition'],
 ['Guide', 'Fourth Edition'],
 ['six  processes', 'Quantitative Risk Risks'],
 ['six  that', 'critical success process'],
 ['which', 'practice standard'],
 ['that', 'effective Project Risk Management'],
 ['several  Principles', 'now  future'],
 ['constantly  principles', 'more  stability'],
 ['Different  projects', 'different Project Risk Management'],
 ['that', 'large  projects'],
 ['more  practice', 'organizational  culture'],
 ['different risk that', 'practice standard'],
 ['often project principles', 'herein organizations processes'],
 ['Practitioners', 'good Risk Management practice'],
 ['Project Risk Management', 'identi risk management project'],
 ['also  objectives', 'negative Project Risk project'],
 ['it', 'positive projects objectives'],
 ['Project objectiv

In [41]:
import spacy
from spacy.matcher import Matcher

# Load the model
nlp = spacy.load('en_core_web_trf')

def get_longest_relation(sent):
    doc = nlp(sent)

    # Matcher class object 
    matcher = Matcher(nlp.vocab)

    # Define multiple patterns to capture different relation structures
    patterns = [
        [{'DEP': 'ROOT'}, {'DEP': 'prep', 'OP': "?"}, {'DEP': 'agent', 'OP': "?"}, {'POS': 'ADJ', 'OP': "?"}],
        [{'DEP': 'nsubj'}, {'DEP': 'ROOT'}, {'DEP': 'dobj'}],  # Subject-Verb-Object pattern
        [{'DEP': 'nsubjpass'}, {'DEP': 'ROOT'}, {'DEP': 'prep'}, {'DEP': 'pobj'}],  # Passive voice pattern
        [{'DEP': 'nsubj'}, {'DEP': 'ROOT'}, {'DEP': 'attr'}],  # Subject-Verb-Attribute pattern
    ]

    # Add each pattern to the matcher
    for i, pattern in enumerate(patterns):
        matcher.add(f"relation_pattern_{i+1}", [pattern])

    # Apply the matcher to the doc
    matches = matcher(doc)
    
    longest_relation = None
    max_length = 0

    # Iterate over all matches to find the longest one
    for match_id, start, end in matches:
        span = doc[start:end]  # Extract the matched span
        span_length = len(span.text)

        # Check if the current span is longer than the previous longest span
        if span_length > max_length:
            longest_relation = span.text
            max_length = span_length

    return longest_relation if longest_relation else None




In [29]:
from spacy.matcher import Matcher 
from spacy.tokens import Span 
def get_relation(sent):
    doc = nlp(sent)

    # Matcher class object 
    matcher = Matcher(nlp.vocab)

    # Define the pattern
    pattern = [{'DEP': 'ROOT'}, 
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},  
               {'POS': 'ADJ', 'OP': "?"}]

    # Add the pattern to the matcher
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    
    if matches:
        k = len(matches) - 1  # Get the last match
        span = doc[matches[k][1]:matches[k][2]]  # Extract the matched span
        return span.text
    else:
        return None

In [42]:
list_of_relations = []
for sents in df['sentences'][0]:
    list_of_relations.append(get_longest_relation(sents))

print(len(list_of_relations)) 

list_of_relations

18


['has',
 'cover',
 'is consistent',
 'aligned with other',
 'material',
 None,
 'are',
 'described in',
 None,
 'emphasizes',
 'stated at',
 'have more',
 'require different',
 'is',
 'be',
 'are many',
 'are applicable',
 'establish']

In [47]:
from tqdm import tqdm

list_of_relations = []

# Wrap the outer loop with tqdm to track progress
for i in tqdm(range(len(df["sentences"])), desc="Processing sentences"):
    for sents in df['sentences'][i]:
        relation = get_longest_relation(sents)
        list_of_relations.append(relation)  # Append the actual relation, not the tqdm object






















Processing sentences: 100%|██████████| 163/163 [00:54<00:00,  2.97it/s]


In [49]:
list_of_relations[0:50]

['has',
 'cover',
 'is consistent',
 'aligned with other',
 'material',
 None,
 'are',
 'described in',
 None,
 'emphasizes',
 'stated at',
 'have more',
 'require different',
 'is',
 'be',
 'are many',
 'are applicable',
 'establish',
 'includes',
 'states',
 'is',
 'objectives include scope',
 'aims',
 'orientation requires consideration',
 'is essential',
 'becomes',
 'address',
 'assume',
 'addresses',
 'builds upon',
 'scheduling provides dates',
 'explores',
 'requires',
 'adds',
 'provides',
 'is at',
 'mean',
 'be different',
 'be',
 'is applicable',
 'be',
 'continues',
 'balance between',
 'needs',
 'is true',
 'monitor',
 'plays',
 'reviewed',
 'is',
 'conducted in']

In [None]:
from tqdm import tqdm

# Assuming relations_list contains items to process
relations = []
for rel in tqdm(relations_list):  # Wrap tqdm around the iterable for the progress bar
    processed_rel = process_relation(rel)  # Process each relation (some function you define)
    relations.append(processed_rel)  # Append the processed relation to the list


In [33]:
len(list_of_relations)

899

In [36]:
pd.Series(list_of_relations).value_counts()[:50]




















 95%|█████████▌| 19/20 [00:17<00:00,  1.09it/s]A[A[A[A
 89%|████████▉ | 8/9 [01:22<00:10, 10.34s/it]
 50%|█████     | 1/2 [00:16<00:16, 16.12s/it]
 67%|██████▋   | 2/3 [00:16<00:08,  8.04s/it]
 86%|████████▌ | 6/7 [00:16<00:02,  2.67s/it]
 80%|████████  | 4/5 [00:15<00:03,  3.99s/it]
 80%|████████  | 4/5 [00:15<00:03,  3.98s/it]
 67%|██████▋   | 2/3 [00:15<00:07,  7.93s/it]
 80%|████████  | 4/5 [00:15<00:03,  3.95s/it]
 50%|█████     | 1/2 [00:15<00:15, 15.75s/it]
 92%|█████████▏| 11/12 [00:15<00:01,  1.42s/it]
 94%|█████████▍| 15/16 [00:15<00:01,  1.04s/it]
 67%|██████▋   | 2/3 [00:15<00:07,  7.77s/it]
 89%|████████▉ | 8/9 [00:15<00:01,  1.94s/it]
 89%|████████▉ | 8/9 [00:15<00:01,  1.93s/it]
 88%|████████▊ | 7/8 [00:15<00:02,  2.19s/it]
 75%|███████▌  | 3/4 [00:15<00:05,  5.09s/it]
 89%|████████▉ | 8/9 [00:15<00:01,  1.91s/it]
 75%|███████▌  | 3/4 [00:15<00:05,  5.07s/it]
 86%|████████▌ | 6/7 [00:15<00:02,  2.53s/it]
 67%|██████▋   | 2/3 [00:15<00:07,  7.56s/it]

is                                                              2
Hierarchy                                                       1
of                                                              1
organized in                                                    1
are                                                             1
3                                                               1
glossary of                                                     1
emphasizes                                                      1
stated at                                                       1
have more                                                       1
require different                                               1
be                                                              1
are many                                                        1
are applicable                                                  1
(A, t, t, e, n, t, i, o, n,  , t, o,  , o, v, e, r, a, l, l)    1
(I, n, s, 