# A Knowledge based Recommendation System for Project Mangement Ontology learning approach

The procedure of ontology construction can be done in one of three ways: manual construction; cooperative construction (need the human intervention during the constructing process) and (semi-) automatic construction which considered the Ontology Learning (OL) approach. OL from text is the process for acquiring and representing knowledge from text [structured (database), semi-structured (XML file) and unstructured (.txt, pdf, etc)] to be in machine-understandable form [OWL, RDF (Resource Description Framework), or RDFS (Resource Description Framework Schema)], by applying a set of methods and techniques (NLP, data mining, and machine learning).

-Natural Language Processing via NLTK and Spacy matcher linguistic-based preprocessing technique: (1)Tokénization and normalization (2) part-of –speech tagging (POS), (3)posTagger, (4) stopwords, Lemmatization (Stemming), (5) chunking. levenshetein measure, TF-IDF measure, leveithen measure, cosine similarity measure, topic modeling LDA, n-gram, -Recommenadtions techniques based on ML : hiaachical clutering, classification, KNN, etc -Performance measures: Precision, Recall and F measure, -Programmation: Python, java; -Semantic web language/Tool: OWL2, RDF and SWRL

# Imports

In [1]:
import sys, fitz
import pandas as pd
import re
import string
import pandas as pd

import spacy
from spacy.matcher import Matcher
from spacy.tokens import span
from spacy import displacy


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer,SnowballStemmer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag, RegexpParser

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer 


from nltk.corpus import stopwords

from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, OWL, RDF, RDFS, SKOS, VOID, XMLNS
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.extras import describer
from rdflib.namespace import XSD


nltk.download('conll2000')
from nltk.corpus import conll2000
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from keybert import KeyBERT


#kw_model = KeyBERT(model='all-mpnet-base-v2')


#stop = stopwords.words('english')

#nltk.download('omw-1.4')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\Farjo\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


# Util Funtions For Text-Preprocessing

In [2]:
def remove_punctuation(text):
    """
        Remove the punctuation
    """
    return re.sub(r'[]!"$%&\'()*+/:.;=#@?[\\^_`{|}~-]+', " ", text)

In [3]:
def remove_number(text):
    """
        Remove the number
    """
    pattern = r'[0-9]'
    # Match all digits in the string and replace them with an empty string
    new_string = re.sub(pattern, ' ', text)

    return new_string

In [4]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r' ', text)

In [5]:
def remove_lineBreak(text):
    """
        Remove line break
    """
    return re.sub("\n"," ",text)

In [6]:
def lowerCase(text):
    """
        Transform all the text to lower case
    """
    return text.lower()

In [7]:
def remove_extra_whitespaces_func(text):
    '''
    Removes extra whitespaces from a string, if present
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        Clean string without extra whitespaces
    ''' 
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

In [8]:
def remove_uu(text):
    """
        Remove the uu  
    """
    return re.sub(r'u u',' ', text)

In [9]:
def find_title(text):
    '''
    Find the title presented in the text with the regex pattern (Exp: 5.1 Plan scope mangement)
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        the title found in the text 
    '''
    return re.findall("[0-9].*[A-Z]\n",text)

In [10]:
def get_section(text):
    '''
    Get the sections in a text (Exp: section 4.1.2)
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        the list of the section founded in the paragraph 
    '''
    return re.findall('Section\s[0-9].{1,7}',text)

In [11]:
def get_figure(text):
    '''
    Get the figures in a text (Exp: section 4.1.2)
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        the list of the figures founded in the paragraph 
    '''
    return re.findall("Figure\s[0-9]-[0-9]{1,2}.\s[A-Z]{1}.*",text)

In [12]:
def get_def(text):
    if re.findall('^Described in Section',text)  :
        return ''
    else : 
        return re.sub('The inputs, tools and techniques, and outputs of this process are depicted in (.*) [0-9]{3}',' ',text)
    #return text

In [13]:
def get_clean_figure(text):
    return re.sub('(Figure\s\d-\d{1,2}.\sFigure\s[0-9].*)', ' ', text) 

In [14]:
def get_process_of_concept(df):
    '''
    Get the topic of each concept  (Exp: concept = PROJECT CHARTER -- topic = PLAN SCOPE MANAGEMENT )
    
    Args:
        df (str): the concept column to which the function is to be applied, string
    
    Returns:
        the topic of the concept passed 
    '''
    index = []
    for i in range(0,len(df['content'])) :
        if df['content'][i] == ' ':
            index.append(i)
    df['topic'] = ''
    for j in range(0,len(index)):
        if j == len(index)-1:
            df['topic'][index[j]:] = df['concept'][index[j]]
            break;
        else :
            df['topic'][index[j]:index[j+1]] = df['concept'][index[j]]
    return df['topic']

In [15]:
def create_columns_from_each_chapter(text,title,content,figures):
    '''
    Creation of the dataframe from the text passed
    
    Args:
        text (str): the text from the pmbok that will be preprocced, string
        title (list): the list that will contain the title 
        content (str): the str that will contain the content
        figures (list): the list that contain the figures
    Returns:
        A dataframe with the following columns : concept, topic, figure, section, type,content 
    '''
    dic = {}
    title = find_title(text)
    for i in range(0,len(title)):
        value = re.sub('\n',' ',title[i])
        title[i] = value.strip()
    
    text = remove_lineBreak(text)
    
    for i in range(0,len(title)):
        if i == (len(title)-1) : 
            content.append(re.findall(title[-1]+'(.*)'+'endchapter',text))
            break;
        else:
            content.append(re.findall(title[i]+'(.*)'+title[i+1],text))
    for i in range(0,len(content)):
        dic[title[i]] = content[i][0]
        
    df = pd.DataFrame(list(dic.items()),columns=['concept','content'])
    
    df['section'] = df['content'].apply(lambda x : get_section(x))
    df['figure'] = None
    for i in range(0,len(figures)):
        df['figure'][i] = figures[i] #df['definition'].apply(lambda x : get_figure(x))
    df['type'] = get_type(df)
    
    
    
    return df #title,content

In [16]:
def get_type(df):
    
    '''
    Get the type of each concept  (Exp: concept = PROJECT CHARTER -- type =  inputs of PLAN SCOPE MANAGEMENT )
    
    Args:
        df (str): the concept column to which the function is to be applied, string
    
    Returns:
        the type of each concept passed 
    '''
    
    start_input = []
    start_tools_and_techniques = []
    start_output = []
    
    title_list = df['concept'].apply(lambda x : lowerCase(x))
    for i in range(0,len(title_list)):
        if 'inputs' in title_list[i] :
            start_input.append(i)
        if 'tools and techniques' in title_list[i] :
            start_tools_and_techniques.append(i)
        if 'outputs' in title_list[i] :
            start_output.append(i)
    for i in range(0,len(start_input)) :
        if i == len(start_input)-1 :
            df.loc[start_input[-1]:start_tools_and_techniques[-1],'type'] = 'inputs'
            df.loc[start_tools_and_techniques[-1]:start_output[-1],'type'] = 'tools_and_techniques'
            df.loc[start_output[-1]:,'type'] = 'outputs'
            break
        df.loc[start_input[i]:start_tools_and_techniques[i],'type'] = 'inputs'
        df.loc[start_tools_and_techniques[i]:start_output[i],'type'] = 'tools_and_techniques'
        df.loc[start_output[i]:start_input[i+1],'type'] = 'outputs'
    return df['type']

In [17]:
def clean_topic(text):
    '''
    remove the unwanted word from the topics
    
    Args:
        text (str): the concept column to which the function is to be applied, string
    
    Returns:
        the cleaned text 
    '''
    if re.findall(': INPUTS',text)  :
        return re.sub(': INPUTS',' ',text)
    elif re.findall(': OUTPUTS',text) :
        return re.sub(': OUTPUTS',' ',text)
    elif re.findall(': TOOLS AND TECHNIQUES',text) :
        return re.sub(': TOOLS AND TECHNIQUES',' ',text)
    else :
        return text

In [18]:
def clean_dataFrame(df,columns):
    
    
    for col in columns:
        df[col] = df[col].apply(lambda x : remove_punctuation(x))
        df[col] = df[col].apply(lambda x : remove_non_ascii(x))
        df[col] = df[col].apply(lambda x : remove_number(x))
        df[col] = df[col].apply(lambda x : remove_extra_whitespaces_func(x))
        df[col] = df[col].apply(lambda x : lowerCase(x))
    
    #df = df.apply(lambda x : remove_point(x))
    return df 

# Import Data From The PDF File

## 1. Retrive The Text From The PMBOK

In [19]:
# Get The text from the pdf file with the library fitz
fdoc = fitz.open("PMBOK6-2017.pdf")
header = "Header"  # text in header
footer = "Page %i of %i"  # text in footer
page = []
for i  in range(0,573):
    page.append(fdoc[i].get_text())  # insert header

## 2. Get the text for each chapter

### 2.1 Initialize Variables

In [20]:
# variable represent each chapter that we will work with 
scope = []
schedule = []
cost = []

# variable represent title that we get from each chapter and the variable with contient the whole text that we will work with 
scope_title = []
scope_title_without_number = []
scope_text = ' '
scope_content = []

schedule_title = []
schedule_title_without_number = []
schedule_text = ' '
schedule_content = []

cost_title = []
cost_title_without_number = []
cost_text = ' '
cost_content = []

figures_scope = []
figures_schedule = []
figures_cost = []

### 2.2 Project scope management

In [21]:
for i in range(164,207):
    p = re.sub('Not For Distribution, Sale or Reproduction.', ' ', page[i])
    p = re.sub('Part 1 - Guide', ' ', p)
    scope.append(p)
    scope_title.append(re.findall("[0-9].*[A-Z]\n",p))
    figures_scope.append(re.findall("Figure\s[0-9]-[0-9]{1,2}.\s[A-Z]{1}.*",p))

In [22]:
for i in range(0,len(scope)):
    scope_text = scope_text + scope[i]
scope_text = scope_text + ' endchapter'

In [23]:
df_scope = create_columns_from_each_chapter(scope_text,scope_title,scope_content,figures_scope)
df_scope['topic'] = ''
df_scope['topic'] = get_process_of_concept(df_scope)
df_scope['content'] = df_scope['content'].apply(lambda x : remove_extra_whitespaces_func(x))
df_scope['def'] = df_scope['content'].apply(lambda x : get_def(x))
df_scope['ref'] = df_scope['section'] + df_scope['figure']


In [24]:
df_scope.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,5.4 Create WBS,—The process of subdividing project deliverabl...,[],[],,,—The process of subdividing project deliverabl...,[]
1,5.1 PLAN SCOPE MANAGEMENT,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2...",[Figure 5-1. Project Scope Management Overview],,,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2..."
2,5.1.1 PLAN SCOPE MANAGEMENT: INPUTS,,[],[],inputs,5.1.1 PLAN SCOPE MANAGEMENT: INPUTS,,[]
3,5.1.1.1 PROJECT CHARTER,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,5.1.1 PLAN SCOPE MANAGEMENT: INPUTS,,[Section 4.1.3.1.]
4,5.1.1.2 PROJECT MANAGEMENT PLAN,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 8.1.3.1.]",[],inputs,5.1.1 PLAN SCOPE MANAGEMENT: INPUTS,,"[Section 4.2.3.1., Section 8.1.3.1.]"


In [25]:
columns_to_clean = ['concept','topic']

df_scope['topic'] = df_scope['topic'].apply(lambda x : clean_topic(x))
df_scope['def'] = df_scope['def'].apply(lambda x : remove_non_ascii(x))

df_scope = clean_dataFrame(df_scope,columns_to_clean)

In [26]:
df_scope.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,create wbs,—The process of subdividing project deliverabl...,[],[],,,The process of subdividing project deliverabl...,[]
1,plan scope management,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2...",[Figure 5-1. Project Scope Management Overview],,,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2..."
2,plan scope management inputs,,[],[],inputs,plan scope management,,[]
3,project charter,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,plan scope management,,[Section 4.1.3.1.]
4,project management plan,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 8.1.3.1.]",[],inputs,plan scope management,,"[Section 4.2.3.1., Section 8.1.3.1.]"


In [27]:
df_scope['topic'].unique()

array(['', 'plan scope management', 'collect requirements',
       'define scope', 'create wbs', 'validate scope', 'control scope'],
      dtype=object)

### 2.3 Project schedule management

In [28]:
for i in range(208,265):
    p = re.sub('Not For Distribution, Sale or Reproduction.', ' ', page[i])
    p = re.sub('Part 1 - Guide', ' ', p) 
    schedule.append(p)
    schedule_title.append(re.findall("[0-9].*[A-Z]\n",p))
    figures_schedule.append(re.findall("Figure\s[0-9]-[0-9]{1,2}.\s[A-Z]{1}.*",p))

In [29]:
for i in range(0,len(schedule)):
    schedule_text = schedule_text + schedule[i]
schedule_text = schedule_text + ' endchapter'

In [30]:
df_schedule = create_columns_from_each_chapter(schedule_text,schedule_title,schedule_content,figures_schedule)
df_schedule['topic'] = ''
df_schedule['topic'] = get_process_of_concept(df_schedule)
df_schedule['content'] = df_schedule['content'].apply(lambda x : remove_extra_whitespaces_func(x))
df_schedule['def'] = df_schedule['content'].apply(lambda x : get_def(x))
df_schedule['ref'] = df_schedule['section'] + df_schedule['figure']


In [31]:
df_schedule.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,6.1 PLAN SCHEDULE MANAGEMENT,Plan Schedule Management is the process of est...,[],[],,,Plan Schedule Management is the process of est...,[]
1,6.1.1 PLAN SCHEDULE MANAGEMENT: INPUTS,,[],[Figure 6-1. Project Schedule Management Overv...,inputs,6.1.1 PLAN SCHEDULE MANAGEMENT: INPUTS,,[Figure 6-1. Project Schedule Management Overv...
2,6.1.1.1 PROJECT CHARTER,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,6.1.1 PLAN SCHEDULE MANAGEMENT: INPUTS,,[Section 4.1.3.1.]
3,6.1.1.2 PROJECT MANAGEMENT PLAN,Described in Section 4.3.2.1. Project manageme...,"[Section 4.3.2.1., Section 5.1.3.1., Section 4...",[Figure 6-2. Scheduling Overview],inputs,6.1.1 PLAN SCHEDULE MANAGEMENT: INPUTS,,"[Section 4.3.2.1., Section 5.1.3.1., Section 4..."
4,6.1.1.3 ENTERPRISE ENVIRONMENTAL FACTORS,The enterprise environmental factors that can ...,[],[],inputs,6.1.1 PLAN SCHEDULE MANAGEMENT: INPUTS,The enterprise environmental factors that can ...,[]


In [32]:
columns_to_clean = ['concept','topic']

df_schedule['topic'] = df_schedule['topic'].apply(lambda x : clean_topic(x))
df_schedule['def'] = df_schedule['def'].apply(lambda x : remove_non_ascii(x))

df_schedule = clean_dataFrame(df_schedule,columns_to_clean)


In [33]:
df_schedule.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,plan schedule management,Plan Schedule Management is the process of est...,[],[],,,Plan Schedule Management is the process of est...,[]
1,plan schedule management inputs,,[],[Figure 6-1. Project Schedule Management Overv...,inputs,plan schedule management,,[Figure 6-1. Project Schedule Management Overv...
2,project charter,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,plan schedule management,,[Section 4.1.3.1.]
3,project management plan,Described in Section 4.3.2.1. Project manageme...,"[Section 4.3.2.1., Section 5.1.3.1., Section 4...",[Figure 6-2. Scheduling Overview],inputs,plan schedule management,,"[Section 4.3.2.1., Section 5.1.3.1., Section 4..."
4,enterprise environmental factors,The enterprise environmental factors that can ...,[],[],inputs,plan schedule management,The enterprise environmental factors that can ...,[]


In [34]:
df_schedule['topic'].unique()

array(['', 'plan schedule management', 'define activities',
       'sequence activities', 'estimate activity durations',
       'develop schedule', 'd', 'b', 'g', 'p', 'control schedule'],
      dtype=object)

In [35]:
index_p = df_schedule.index[df_schedule['topic']== 'p'].tolist()
index_b = df_schedule.index[df_schedule['topic']== 'b'].tolist()
index_d = df_schedule.index[df_schedule['topic']== 'd'].tolist()
index_g = df_schedule.index[df_schedule['topic']== 'g'].tolist()
index = index_p + index_d + index_b + index_g
index

[88, 89, 90, 91, 92, 93, 94, 95, 81, 84, 85, 82, 83, 86, 87]

In [36]:
df_schedule = df_schedule.drop(index = index, axis=1)
df_schedule.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,plan schedule management,Plan Schedule Management is the process of est...,[],[],,,Plan Schedule Management is the process of est...,[]
1,plan schedule management inputs,,[],[Figure 6-1. Project Schedule Management Overv...,inputs,plan schedule management,,[Figure 6-1. Project Schedule Management Overv...
2,project charter,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,plan schedule management,,[Section 4.1.3.1.]
3,project management plan,Described in Section 4.3.2.1. Project manageme...,"[Section 4.3.2.1., Section 5.1.3.1., Section 4...",[Figure 6-2. Scheduling Overview],inputs,plan schedule management,,"[Section 4.3.2.1., Section 5.1.3.1., Section 4..."
4,enterprise environmental factors,The enterprise environmental factors that can ...,[],[],inputs,plan schedule management,The enterprise environmental factors that can ...,[]


In [37]:
df_schedule['topic'].unique()

array(['', 'plan schedule management', 'define activities',
       'sequence activities', 'estimate activity durations',
       'develop schedule', 'control schedule'], dtype=object)

### 2.4 Project cost management

In [38]:
for i in range(266,306):
    p = re.sub('Not For Distribution, Sale or Reproduction.', ' ', page[i])
    p = re.sub('Part 1 - Guide', ' ', p)
    cost.append(p)
    cost_title.append(re.findall("[0-9].*[A-Z]\n",p))
    figures_cost.append(re.findall("Figure\s[0-9]-[0-9]{1,2}.\s[A-Z]{1}.*",p))

In [39]:
for i in range(0,len(cost)):
    cost_text = cost_text + cost[i]
cost_text = cost_text + ' endchapter'

In [40]:
df_cost = create_columns_from_each_chapter(cost_text,cost_title,cost_content,figures_cost)
df_cost['topic'] = ''
df_cost['topic'] = get_process_of_concept(df_cost)
df_cost['content'] = df_cost['content'].apply(lambda x : remove_extra_whitespaces_func(x))
df_cost['def'] = df_cost['content'].apply(lambda x : get_def(x))
df_cost['ref'] = df_cost['section'] + df_cost['figure']


In [41]:
df_cost.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,7.1 PLAN COST MANAGEMENT,Plan Cost Management is the process of deﬁning...,[],[],,,Plan Cost Management is the process of deﬁning...,[]
1,7.1.1 PLAN COST MANAGEMENT: INPUTS,,[],[Figure 7-1. Project Cost Management Overview],inputs,7.1.1 PLAN COST MANAGEMENT: INPUTS,,[Figure 7-1. Project Cost Management Overview]
2,7.1.1.1 PROJECT CHARTER,Described in Section 4.2.3.1. The project char...,[Section 4.2.3.1.],[],inputs,7.1.1 PLAN COST MANAGEMENT: INPUTS,,[Section 4.2.3.1.]
3,7.1.1.2 PROJECT MANAGEMENT PLAN,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 6.1.3.1., Section 1...",[],inputs,7.1.1 PLAN COST MANAGEMENT: INPUTS,,"[Section 4.2.3.1., Section 6.1.3.1., Section 1..."
4,7.1.1.3 ENTERPRISE ENVIRONMENTAL FACTORS,The enterprise environmental factors that can ...,[],"[Figure 7-2. Figure 7-3 depicts the data ﬂow ,...",inputs,7.1.1 PLAN COST MANAGEMENT: INPUTS,The enterprise environmental factors that can ...,"[Figure 7-2. Figure 7-3 depicts the data ﬂow ,..."


In [42]:
columns_to_clean = ['concept','topic']

df_cost['topic'] = df_cost['topic'].apply(lambda x : clean_topic(x))
df_cost['def'] = df_cost['def'].apply(lambda x : remove_non_ascii(x))

df_cost = clean_dataFrame(df_cost,columns_to_clean)


In [43]:
df_cost.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,plan cost management,Plan Cost Management is the process of deﬁning...,[],[],,,Plan Cost Management is the process of de ning...,[]
1,plan cost management inputs,,[],[Figure 7-1. Project Cost Management Overview],inputs,plan cost management,,[Figure 7-1. Project Cost Management Overview]
2,project charter,Described in Section 4.2.3.1. The project char...,[Section 4.2.3.1.],[],inputs,plan cost management,,[Section 4.2.3.1.]
3,project management plan,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 6.1.3.1., Section 1...",[],inputs,plan cost management,,"[Section 4.2.3.1., Section 6.1.3.1., Section 1..."
4,enterprise environmental factors,The enterprise environmental factors that can ...,[],"[Figure 7-2. Figure 7-3 depicts the data ﬂow ,...",inputs,plan cost management,The enterprise environmental factors that can ...,"[Figure 7-2. Figure 7-3 depicts the data ﬂow ,..."


In [44]:
df_cost['topic'].unique()

array(['', 'plan cost management', 'estimate costs', 'determine budget',
       'control costs'], dtype=object)

### 2.5 Merge All three DataFrame:

In [45]:
final_df = df_scope.append(df_schedule)
final_df = final_df.append(df_cost)
final_df = final_df.drop(index=0,axis=1)
final_df = final_df.reset_index(drop=True)

In [46]:
final_df.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,plan scope management,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2...",[Figure 5-1. Project Scope Management Overview],,,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2..."
1,plan scope management inputs,,[],[],inputs,plan scope management,,[]
2,project charter,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,plan scope management,,[Section 4.1.3.1.]
3,project management plan,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 8.1.3.1.]",[],inputs,plan scope management,,"[Section 4.2.3.1., Section 8.1.3.1.]"
4,enterprise environmental factors,The enterprise environmental factors that can ...,[],"[Figure 5-2. Figure 5-3 depicts , Figure 5-2. ...",inputs,plan scope management,The enterprise environmental factors that can ...,"[Figure 5-2. Figure 5-3 depicts , Figure 5-2. ..."


In [47]:
final_df = final_df.fillna(value='')
final_df

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,plan scope management,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2...",[Figure 5-1. Project Scope Management Overview],,,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2..."
1,plan scope management inputs,,[],[],inputs,plan scope management,,[]
2,project charter,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,plan scope management,,[Section 4.1.3.1.]
3,project management plan,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 8.1.3.1.]",[],inputs,plan scope management,,"[Section 4.2.3.1., Section 8.1.3.1.]"
4,enterprise environmental factors,The enterprise environmental factors that can ...,[],"[Figure 5-2. Figure 5-3 depicts , Figure 5-2. ...",inputs,plan scope management,The enterprise environmental factors that can ...,"[Figure 5-2. Figure 5-3 depicts , Figure 5-2. ..."
...,...,...,...,...,...,...,...,...
245,work performance information,Described in Section 4.5.1.3. Work performance...,"[Section 4.5.1.3., Section 4.5.3.1)]",,outputs,control costs,,
246,cost forecasts,Either a calculated EAC value or a bottom-up E...,[],,outputs,control costs,Either a calculated EAC value or a bottom-up E...,
247,change requests,Described in Section 4.3.3.4. Analysis of proj...,"[Section 4.3.3.4., Section 4.6). ]",,outputs,control costs,,
248,project management plan updates,Any change to the project management plan goes...,"[Section 7.1.3.1., Section 7.3.3.1., Section 4...",,outputs,control costs,Any change to the project management plan goes...,


In [48]:
final_df['topic'].unique()

array(['', 'plan scope management', 'collect requirements',
       'define scope', 'create wbs', 'validate scope', 'control scope',
       'plan schedule management', 'define activities',
       'sequence activities', 'estimate activity durations',
       'develop schedule', 'control schedule', 'plan cost management',
       'estimate costs', 'determine budget', 'control costs'],
      dtype=object)

In [49]:
final_df['concept'].unique()

array(['plan scope management', 'plan scope management inputs',
       'project charter', 'project management plan',
       'enterprise environmental factors',
       'organizational process assets',
       'plan scope management tools and techniques', 'expert judgment',
       'data analysis', 'meetings', 'plan scope management outputs',
       'scope management plan', 'requirements management plan',
       'collect requirements', 'collect requirements inputs',
       'project documents', 'business documents', 'agreements',
       'collect requirements tools and techniques', 'data gathering',
       'decision making', 'data representation',
       'interpersonal and team skills', 'context diagram', 'prototypes',
       'collect requirements outputs', 'requirements documentation',
       'requirements traceability matrix', 'define scope',
       'define scope inputs', 'define scope tools and techniques',
       'product analysis', 'define scope outputs',
       'project scope statement

In [50]:
index = final_df.index[final_df['content'] == ''].tolist()
final_df = final_df.drop(index = index, axis=1)

In [51]:
final_df['concept'].unique()

array(['plan scope management', 'project charter',
       'project management plan', 'enterprise environmental factors',
       'organizational process assets', 'expert judgment',
       'data analysis', 'meetings', 'scope management plan',
       'requirements management plan', 'collect requirements',
       'project documents', 'business documents', 'agreements',
       'data gathering', 'decision making', 'data representation',
       'interpersonal and team skills', 'context diagram', 'prototypes',
       'requirements documentation', 'requirements traceability matrix',
       'define scope', 'product analysis', 'project scope statement',
       'project documents updates', 'create wbs', 'decomposition',
       'scope baseline', 'validate scope', 'verified deliverables',
       'work performance data', 'inspection', 'accepted deliverables',
       'work performance information', 'change requests', 'control scope',
       'project management plan updates', 'schedule management plan'

In [52]:
final_df.head()

Unnamed: 0,concept,content,section,figure,type,topic,def,ref
0,plan scope management,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2...",[Figure 5-1. Project Scope Management Overview],,,Plan Scope Management is the process of creati...,"[Section 4.2.3.1), Section 2.3), an, Section 2..."
2,project charter,Described in Section 4.1.3.1. The project char...,[Section 4.1.3.1.],[],inputs,plan scope management,,[Section 4.1.3.1.]
3,project management plan,Described in Section 4.2.3.1. Project manageme...,"[Section 4.2.3.1., Section 8.1.3.1.]",[],inputs,plan scope management,,"[Section 4.2.3.1., Section 8.1.3.1.]"
4,enterprise environmental factors,The enterprise environmental factors that can ...,[],"[Figure 5-2. Figure 5-3 depicts , Figure 5-2. ...",inputs,plan scope management,The enterprise environmental factors that can ...,"[Figure 5-2. Figure 5-3 depicts , Figure 5-2. ..."
5,organizational process assets,The organizational process assets that can inﬂ...,[],[],inputs,plan scope management,The organizational process assets that can in ...,[]


In [53]:
final_df['content'][0]

'Plan Scope Management is the process of creating a scope management plan that documents how the project and product scope will be deﬁned, validated, and controlled. The key beneﬁt of this process is that it provides guidance and direction on how scope will be managed throughout the project. This process is performed once or at predeﬁned points in the project. The inputs, tools and techniques, and outputs of this process are depicted in Figure 5-2. Figure 5-3 depicts the data ﬂow diagram of the process. Figure 5-2. Plan Scope Management: Inputs, Tools & Techniques, and Outputs Figure 5-3. Plan Scope Management: Data Flow Diagram Tools & Techniques Inputs Outputs Plan Scope Management .1 Expert judgment .2 Data analysis • Alternatives analysis .3 Meetings .1 Project charter .2 Project management plan • Quality management plan • Project life cycle description • Development approach .3 Enterprise environmental factors .4 Organizational process assets .1 Scope management plan .2 Requiremen

# Text Preprocessing

## 1. Prepare the data ( Removing stop words, lowercasing ,etc....)

In [54]:
data_to_process = final_df[['concept','content']]
data_to_process['content'] = data_to_process['content'].apply(lambda x : re.sub('include but are not limited to','include',x))
data_to_process.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_process['content'] = data_to_process['content'].apply(lambda x : re.sub('include but are not limited to','include',x))


Unnamed: 0,concept,content
0,plan scope management,Plan Scope Management is the process of creati...
2,project charter,Described in Section 4.1.3.1. The project char...
3,project management plan,Described in Section 4.2.3.1. Project manageme...
4,enterprise environmental factors,The enterprise environmental factors that can ...
5,organizational process assets,The organizational process assets that can inﬂ...


In [55]:
# remove bias u u 
data_to_process['content'] = data_to_process['content'].apply(lambda x : remove_uu(x))
# lower cassing 
data_to_process['content'] = data_to_process['content'].apply(lambda x : lowerCase(x))
# remove extra white space 
data_to_process['content'] = data_to_process['content'].apply(lambda x : remove_extra_whitespaces_func(x))
# remove line break 
data_to_process['content'] = data_to_process['content'].apply(lambda x : remove_lineBreak(x))
# remove non ascii text
data_to_process['content'] = data_to_process['content'].apply(lambda x : remove_non_ascii(x))

data_to_process['content'] = data_to_process['content'].apply(lambda x : re.sub('that can in uence','influence',x))
data_to_process['content'] = data_to_process['content'].apply(lambda x : re.sub('scope','Scope',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_process['content'] = data_to_process['content'].apply(lambda x : remove_uu(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_process['content'] = data_to_process['content'].apply(lambda x : lowerCase(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_process['content'] = da

## 2. Extract each sentence from content

In [56]:
# intialize nlp
nlp = spacy.load("en_core_web_sm")

In [57]:
def extract_sentence(text):
    sentence = []
    doc = nlp(text)
    for sent in doc.sents :
        sentence.append(sent.text)
    return sentence

In [58]:
def lemm(text):
    text = nlp(text)
    l = []
    for word in text:
        lemma = word.lemma_
        l.append(lemma)
    return l

In [59]:
data_to_process['lemma_content'] = data_to_process['content'].apply(lambda x : ' '.join(lemm(x)))
data_to_process['lemma_sentence'] = data_to_process['lemma_content'].apply(lambda x : extract_sentence(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_process['lemma_content'] = data_to_process['content'].apply(lambda x : ' '.join(lemm(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_process['lemma_sentence'] = data_to_process['lemma_content'].apply(lambda x : extract_sentence(x))


In [60]:
data_to_process.head()

Unnamed: 0,concept,content,lemma_content,lemma_sentence
0,plan scope management,plan Scope management is the process of creati...,plan Scope management be the process of create...,[plan Scope management be the process of creat...
2,project charter,described in section 4.1.3.1. the project char...,describe in section 4.1.3.1 . the project char...,"[describe in section 4.1.3.1 ., the project ch..."
3,project management plan,described in section 4.2.3.1. project manageme...,describe in section 4.2.3.1 . project manageme...,"[describe in section 4.2.3.1 ., project manage..."
4,enterprise environmental factors,the enterprise environmental factors influence...,the enterprise environmental factor influence ...,[the enterprise environmental factor influence...
5,organizational process assets,the organizational process assets influence th...,the organizational process asset influence the...,[the organizational process asset influence th...


In [61]:
data_to_process['lemma_sentence'][2]

['describe in section 4.1.3.1 .',
 'the project charter document the project purpose , high - level project description , assumption , constraint , and high - level requirement that the project be intend to satisfy .']

In [62]:
cleared_sentence = remove_punctuation(data_to_process['lemma_sentence'][2][1])
cleared_sentence = remove_extra_whitespaces_func(cleared_sentence)
cleared_sentence

'the project charter document the project purpose , high level project description , assumption , constraint , and high level requirement that the project be intend to satisfy'

In [63]:
'''
the following funtion represent the chunking model that will be used for the 
relation extraction between the concept
'''

def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
#Define the chunker class
class NGramTagChunker(ChunkParserI):
    def __init__(self,train_sentences,tagger_classes=[UnigramTagger,BigramTagger]):
        train_sent_tags=conll_tag_chunks(train_sentences)
        self.chunk_tagger=combined_tagger(train_sent_tags,tagger_classes)
    def parse(self,tagged_sentence):
        if not tagged_sentence:
            return None
        pos_tags=[tag for word, tag in tagged_sentence]
        chunk_pos_tags=self.chunk_tagger.tag(pos_tags)
        chunk_tags=[chunk_tag for (pos_tag,chunk_tag) in chunk_pos_tags]
        wpc_tags=[(word,pos_tag,chunk_tag) for ((word,pos_tag),chunk_tag) in zip(tagged_sentence,chunk_tags)]
        return conlltags2tree(wpc_tags)

In [64]:
#Here conll2000 corpus for training shallow parser model
data= conll2000.chunked_sents()
train_data=data[:10900]
test_data=data[10900:]

from nltk.chunk.util import tree2conlltags,conlltags2tree
wtc=tree2conlltags(train_data[1])
wtc
tree=conlltags2tree(wtc)

#train chunker model
ntc=NGramTagChunker(train_data)
#evaluate chunker model performance
print(ntc.evaluate(test_data))

ChunkParse score:
    IOB Accuracy:  90.0%%
    Precision:     82.1%%
    Recall:        86.3%%
    F-Measure:     84.1%%


In [65]:
t = tuple()
def get_the_rigth_chunking(chunk_tree):
    try :
        begin = len(chunk_tree)-1
        for i in range(begin,0,-1):
            if type(chunk_tree[i]) == type(t):
                chunk_tree.pop(i)
    except IndexError:
        return chunk_tree
            
    return chunk_tree

In [66]:
def postag(text):
    #text = nlp(text)
    l = []
    for word, tag in pos_tag(text):
        if not tag in ['DT','IN','RB','CC','WRB','RB','WDT'] : 
            l.append(word)
    return l

In [153]:
t = tuple()
NP=[]
VB=[]  
def get_relation_from_definition(text):

    nltk_pos_tagged = nltk.pos_tag(text.split())
    chunk_tree = ntc.parse(nltk_pos_tagged)
    chunk_tree = get_the_rigth_chunking(chunk_tree)
    for k in range(len(chunk_tree)):
        chunk = chunk_tree[k]
        #print(chunk)
        if chunk == 'NP':
            print('test')
            noun = ' '.join(word for word, pos in chunk.leaves())
            NP.append('asba')
        elif chunk == 'VP':
            verb = ' '.join(word for word, pos in chunk.leaves())
            VB.append(verb)
    return NP,VB

In [154]:
cleared_sentence

'the project charter document the project purpose , high level project description , assumption , constraint , and high level requirement that the project be intend to satisfy'

In [155]:
NP,VB = get_relation_from_definition(cleared_sentence)

(NP the/DT project/NN charter/NN)
(VP document/VBD)
(NP the/DT project/NN purpose/NN)
(NP high/JJ level/NN project/NN description/NN)
(NP assumption/NN)
(NP constraint/NN)
(NP high/JJ level/NN requirement/NN)
(PP that/IN)
(NP the/DT project/NN)
(VP be/VB intend/VBN to/TO satisfy/VB)


In [156]:
NP,VB

([], [])

# Final DataFrame

# OWL File Creation

# Model Creation 