In [503]:
#IMPORT ALL LIBRARIES

import spacy 
import pandas as pd
import textacy
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import re
import nltk
import wordninja
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import os

In [550]:
#LIST OF VARIABLES THAT CAN BE MODIFIED
# These module-level names are read by the cells below; edit them here rather than inline.

spacy_pipeline = 'en_core_web_sm' # pipeline name passed to spacy.load; visit https://spacy.io/models to checkout other options
raw_text_file = r'AT&T (Analysis)-Cleaned data.csv' # CSV read with pandas; replace with file for which knowledge graph needs to be created
description_column = 'Description' # the column from which the semantic triplets need to be identified (rows where it is null are dropped)
number_of_semantic_triplets = 3900 # number of rows for which semantic triplets need to be created
typedb_database = 'test' # name of the TypeDB database the loader opens a session on
number_of_triplets_to_load = 10 # number of semantic triplets to load into typedb database

In [551]:
#LOAD THE SPACY PIPELINE
# `nlp` is the shared pipeline object; get_entities and get_relation call it on every sentence.
nlp = spacy.load(spacy_pipeline)
# Matcher is instantiated inside get_relation to locate the ROOT-based predicate span.
from spacy.matcher import Matcher 


In [552]:

# Read the raw CSV (latin1-encoded) and keep only rows that have a description.
df = (
    pd.read_csv(raw_text_file, encoding='latin1')
    .dropna(axis=0, subset=[description_column])  # drop null values
)


In [553]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a piece of text into a space-separated string of cleaned tokens.

    Steps, in order:
      1. ``str(text)`` is lower-cased, stripped, and every character that is
         neither a word character nor whitespace is removed.
      2. The result is split on whitespace.
      3. If ``lst_stopwords`` is given, tokens contained in it are dropped.
      4. If ``flg_stemm`` is true, tokens are stemmed with NLTK's PorterStemmer.
      5. If ``flg_lemm`` is true, tokens are lemmatised with NLTK's
         WordNetLemmatizer.

    Args:
        text: Value to clean; converted with ``str`` first.
        flg_stemm: Apply Porter stemming when equal to ``True``.
        flg_lemm: Apply WordNet lemmatisation when equal to ``True``.
        lst_stopwords: Optional container of tokens to remove.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    normalised = str(text).lower().strip()
    tokens = re.sub(r'[^\w\s]', '', normalised).split()

    # Stopword filtering happens before stemming/lemmatisation, so only the
    # surface forms listed in lst_stopwords are removed.
    if lst_stopwords is not None:
        tokens = [token for token in tokens if token not in lst_stopwords]

    if flg_stemm == True:
        stemmer = nltk.stem.porter.PorterStemmer()
        tokens = list(map(stemmer.stem, tokens))

    if flg_lemm == True:
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        tokens = list(map(lemmatizer.lemmatize, tokens))

    return " ".join(tokens)



In [554]:
def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit (per ``str.isdigit``)."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

In [555]:
# Stopwords: NLTK's English list plus tokens specific to this ticket data set.
lst_stopwords = nltk.corpus.stopwords.words("english") + [
    "ipmon", "alert", "description", "problem", "attribute", "please",
    "follow up", "reach", "me", "via", "thank", "help", "affect",
]

# Clean every description (lemmatise, no stemming, stopwords removed).
texts = df[description_column].apply(
    lambda raw_description: utils_preprocess_text(
        raw_description,
        flg_stemm=False,
        flg_lemm=True,
        lst_stopwords=lst_stopwords,
    )
)

# Underscores are word characters for the cleaning regex, so they survive
# preprocessing; turn each one into a space here.
new_texts = [cleaned.replace('_', ' ') for cleaned in texts]
# text = ' '.join(wordninja.split(text))

texts = new_texts
print(texts[0:5]) # show sample of text loaded into dataframe

['asset a1111111 affected end user empl id e000000 affected end user name teresa sachs affected end user telephone 8586948756 affected end user low org 46462 street city 3989 ruffin road floor 2 room 2057 street city 3989 ruffin road floor 2 room 2057 item change display name itrack owner 8586948756 teresa sachs', 'affected end user empl id e000000 affected end user name lt patrick shannon affected end user telephone 7607514401 affected end user low org 39550 street city 28201 n lake wohlford rd floor 1 room 1 item display update', 'update caller id 6195313764 caller id read tamara clark change tamra clark', 'asset hps23711 affected end user empl id e042134 affected end user name lopezalba affected end user telephone 6197445141 affected end user low org 46442 street city 1130 10th st floor 0 room 0 street city 1130 10th st floor 0 room 0 item change caller id ownership phone', 'primary point contact information name marie osgood email marieosgoodsdcountycagov add detail general add tot

In [556]:
# object and subject extraction

def get_entities(sent):
    """Extract a ``[subject, object]`` pair of entity strings from ``sent``.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    walked token by token:

    * tokens containing a digit, and punctuation tokens, are skipped entirely;
    * a ``compound`` token is remembered as ``prefix`` (joined with the
      previous token when that one was a compound as well);
    * a token whose dependency label ends in ``mod`` is remembered as
      ``modifier`` (joined with the previous token when that was a compound);
    * a token whose dependency label contains ``subj`` becomes the subject,
      built as "modifier prefix token", after which prefix/modifier are reset;
    * a token whose dependency label contains ``obj`` becomes the object,
      built the same way (prefix/modifier are not reset here).

    When several subject/object tokens occur, the last one wins.

    Args:
        sent: Text to parse.

    Returns:
        list[str]: ``[subject, object]``, each stripped; either may be ``""``
        when no matching token was found.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Skipped tokens do not update the "previous token" bookkeeping.
        if has_numbers(tok.text):
            continue
        if tok.dep_ == "punct":
            continue

        # check: token is a compound word or not
        if tok.dep_ == "compound":
            prefix = tok.text
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # check: token is a modifier or not
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            # if the previous word was a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Substring test instead of `find(...) == True`: the old comparison was
        # only true when the match started at index 1 ("nsubj", "dobj"), so a
        # label that starts with the substring (e.g. "obj") was silently missed.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""

        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        # update variables
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [557]:
def sort_coo(coo_matrix):
    """Return the ``(column, value)`` pairs of ``coo_matrix`` sorted by value, then column, both descending.

    Only the ``col`` and ``data`` attributes of ``coo_matrix`` are read.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` ``(index, score)`` items to ``{feature name: rounded score}``.

    Args:
        feature_names: Sequence indexed by the item's index to obtain its name.
        sorted_items: Sequence of ``(index, score)`` pairs, already ordered.
        topn: Number of leading items to keep.

    Returns:
        dict: Feature name -> score rounded to 3 decimals, in input order.
    """
    results = {}
    for feature_idx, score in sorted_items[:topn]:
        results[feature_names[feature_idx]] = round(score, 3)
    return results

    


In [558]:
# tf -idf approach
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)

# Vocabulary and raw term counts over all cleaned texts; terms that occur in
# more than 85% of the documents are ignored.
cv=CountVectorizer(max_df=0.85)
word_count_vector=cv.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names=cv.get_feature_names_out()
else:
    feature_names=cv.get_feature_names()

tfidf_transformer.fit(word_count_vector)
tf_idf_keywords = []
entity_pairs = []
# Never index past the end of `texts` when fewer rows than requested were loaded.
tf_idf_row_count = min(number_of_semantic_triplets, len(texts))
for i in tqdm(range(0, tf_idf_row_count)):
    tf_idf_vector=tfidf_transformer.transform(cv.transform([texts[i]]))
    sorted_items= sort_coo(tf_idf_vector.tocoo())
    #extract only the top n; n here is 30
    keywords=extract_topn_from_vector(feature_names,sorted_items,30)
    # Keywords ordered by descending tf-idf score, joined into one pseudo-sentence
    # that is then fed to the subject/object extractor.
    lst_text = list(keywords)
    text_ = " ".join(lst_text)
    tf_idf_keywords.append(text_)
    entity_pairs.append(get_entities(text_))
    
print(tf_idf_keywords[:5])

100%|██████████| 3900/3900 [00:28<00:00, 135.18it/s]

['8586948756 sachs 2057 teresa 3989 ruffin road affected end a1111111 46462 city street room itrack floor name owner e000000 empl item low org telephone change display id', 'affected end wohlford 7607514401 39550 28201 lake shannon patrick lt rd e000000 empl item city street low org room floor telephone update name display id', 'clark tamra 6195313764 tamara caller id read update change', '1130 10th affected end lopezalba hps23711 e042134 6197445141 st 46442 city street room floor ownership id empl caller item low org telephone change name phone', '7607403668 affected end wmission porterdenise e069446 marieosgoodsdcountycagov osgood item porter marie 46065 denise 649 add telephone name id ave none 000 cost total feature empl detail general point type caller']





In [521]:
# get_entities(text)
# entity_pairs = []

# for i in tqdm(texts[0:number_of_semantic_triplets]):
#   entity_pairs.append(get_entities(i))

In [559]:
# relation or predicate extraction
def get_relation(sent):
    """Return the relation (predicate) text found in ``sent``.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and a
    ``Matcher`` looks for the ROOT token optionally followed by a preposition,
    an agent and an adjective. The text of the last match reported by the
    matcher is returned.

    Args:
        sent: Text to parse.

    Returns:
        str: Text of the matched span, or ``""`` when the matcher finds
        nothing (for example for an empty document).
    """
    doc = nlp(sent)
    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern
    pattern = [{'DEP':'ROOT'},
               {'DEP':'prep','OP':"?"},
               {'DEP':'agent','OP':"?"},
               {'POS':'ADJ','OP':"?"}]

    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    # Without this guard an empty match list raised IndexError on matches[-1].
    if not matches:
        return ""

    # Each match is (match_id, start, end); use the last one, as before.
    _, start, end = matches[-1]
    return doc[start:end].text

In [560]:
# get_relation(text)
# One relation string per cleaned text, limited to the configured number of rows.
relations = []
for cleaned_text in tqdm(texts[0:number_of_semantic_triplets]):
    relations.append(get_relation(cleaned_text))

100%|██████████| 3900/3900 [01:11<00:00, 54.65it/s] 


In [563]:
# Subject and object of every extracted pair.
source = [pair[0] for pair in entity_pairs]
target = [pair[1] for pair in entity_pairs]

# Row-level frame: one row per processed text, plus an "edge source target" label.
kg_original = pd.DataFrame({
    'text': texts[0:number_of_semantic_triplets],
    'source': source,
    'target': target,
    'edge': relations,
})
kg_original['labels'] = kg_original['edge'] + ' ' + kg_original['source'] + ' ' + kg_original['target']

# How often each subject occurs, most frequent first.
kg_new = (
    kg_original
    .groupby(['source'])
    .size()
    .reset_index(name='subject_count')
    .sort_values(by=['subject_count'], ascending=False)
)
kg_new.to_csv('at&t_top_labels_tfidf.csv')

# Distinct (edge, source, target) triplets with their frequency, most frequent first.
kg_df = (
    kg_original
    .groupby(['edge', 'source', 'target'])
    .size()
    .reset_index(name='object_count')
    .sort_values(by=['object_count'], ascending=False)
)
labels = kg_df['edge'] + ' ' + kg_df['source'] + ' ' + kg_df['target']
kg_df['labels'] = labels
print(kg_original.shape)

kg_df.to_csv('at&t_new_labels.csv')

(3900, 5)


In [564]:
# Topic Modelling Approach

def sent_to_words(sentences):
    """Lazily yield one token list per sentence, produced by gensim's ``simple_preprocess``.

    Each sentence is converted with ``str`` first. ``deacc=True`` asks gensim
    to strip accent marks from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
# Clean the triplet labels with the same preprocessing used for the raw text,
# then tokenise them for the topic model.
preprocessed_labels = kg_df['labels'].apply(
    lambda label: utils_preprocess_text(
        label,
        flg_stemm=False,
        flg_lemm=True,
        lst_stopwords=lst_stopwords,
    )
)
data = preprocessed_labels.values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1][0][:30])

['impacted', 'note', 'activity', 'existing', 'imar']


In [565]:
# Token <-> id mapping built from the tokenised labels.
id2word = corpora.Dictionary(data_words)
# The corpus is built from the same token lists.
texts_words = data_words
# Bag-of-words (token id, count) representation of every document.
corpus = list(map(id2word.doc2bow, texts_words))
# Peek at the first document.
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [566]:
def convert_tuple_to_dictionary(tup, di):
    """Group the ``(key, value)`` pairs of ``tup`` into ``di`` as ``key -> list of values``.

    ``di`` is updated in place (values are appended to any list already
    stored under a key) and also returned.
    """
    for key, value in tup:
        if key not in di:
            di[key] = []
        di[key].append(value)
    return di

In [567]:
# Clean labels regex
def clean_labels(label):
    """Keep only ASCII letters from ``label``, turning every ``+`` into a single space."""
    letters_and_plus = re.sub(r'[^a-zA-Z\+]', '', label)
    return ' '.join(letters_and_plus.split('+'))

In [569]:
from pprint import pprint
import pickle
# number of topics
num_topics = 50

# Build LDA model - Uncomment the line below if you wish to generate a new Topic Model
# lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                        id2word=id2word,
#                                        num_topics=num_topics)
filename = 'topicModelAt.sav'
# topics = lda_model.show_topics(num_topics = 50, num_words=2, log=True, formatted=True)
# pickle.dump(lda_model, open(filename, 'wb')) # Save the model

# NOTE(review): pickle.load can execute arbitrary code; only load a model file you created yourself.
# The context manager closes the file handle, which the bare open() call never did.
with open(filename, 'rb') as model_file:
    loaded_lda_model = pickle.load(model_file)
# Use the configured topic count instead of repeating the literal 50.
topics = loaded_lda_model.show_topics(num_topics=num_topics, num_words=2, log=True, formatted=True)
print(topics)
# topic id -> list of formatted topic strings
dict_topics = {}
print (convert_tuple_to_dictionary(topics, dict_topics))
# topic_dict = dict(topics)
# print(topic_dict)
doc_lda = loaded_lda_model[corpus]
# print(doc_lda)
complete_list = pd.DataFrame()
total_list = []
# Stay inside the data that actually exists, even if fewer rows were processed than configured.
topic_row_count = min(number_of_semantic_triplets, len(tf_idf_keywords), len(texts))
for i in range(0, topic_row_count):
    # Local name on purpose: reusing `data_words` here overwrote the token lists
    # that the dictionary/corpus cell depends on, breaking any re-run of it.
    keyword_tokens = tf_idf_keywords[i].split()
    ques_vec = id2word.doc2bow(keyword_tokens)
    topic_vec = loaded_lda_model[ques_vec]
    # Most probable topic first.
    topic_vec.sort(key=lambda topic_prob: topic_prob[1], reverse=True)
    topic_vec_mapped = [
        clean_labels(str(dict_topics.get(topic[0])))
        for topic in topic_vec
    ]
    print(topic_vec)
    # Named `topic_record` rather than `dict`, which rebound the builtin for the whole notebook.
    topic_record = { "text": texts[i], "pred":topic_vec_mapped}
    total_list.append(topic_record)
pd.DataFrame(total_list).to_csv("At&t_get_topics_for_doc.csv")

[(0, '0.044*"point" + 0.027*"add"'), (1, '0.035*"affected" + 0.029*"org"'), (2, '0.047*"name" + 0.047*"org"'), (3, '0.060*"add" + 0.047*"org"'), (4, '0.042*"item" + 0.037*"note"'), (5, '0.057*"org" + 0.043*"phone"'), (6, '0.033*"org" + 0.031*"note"'), (7, '0.039*"display" + 0.035*"email"'), (8, '0.032*"org" + 0.029*"item"'), (9, '0.042*"add" + 0.036*"activity"'), (10, '0.031*"item" + 0.029*"change"'), (11, '0.047*"telephone" + 0.042*"org"'), (12, '0.042*"item" + 0.038*"org"'), (13, '0.065*"note" + 0.057*"imar"'), (14, '0.068*"information" + 0.045*"affected"'), (15, '0.038*"item" + 0.038*"email"'), (16, '0.055*"note" + 0.047*"activity"'), (17, '0.051*"note" + 0.047*"activity"'), (18, '0.035*"name" + 0.032*"activity"'), (19, '0.038*"email" + 0.029*"org"'), (20, '0.050*"affected" + 0.040*"add"'), (21, '0.046*"email" + 0.035*"affected"'), (22, '0.041*"email" + 0.038*"low"'), (23, '0.059*"phone" + 0.041*"note"'), (24, '0.076*"note" + 0.072*"activity"'), (25, '0.051*"end" + 0.039*"update"'),

In [95]:
# Create the knowledge graph

# Directed multigraph: one edge per row of kg_df from "source" to "target",
# carrying the remaining columns as edge attributes.
G = nx.from_pandas_edgelist(
    kg_df,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

In [306]:
# Plot the network
# plt.figure(figsize=(12,12))

# pos = nx.spring_layout(G)
# nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
# print(G.edges)
# plt.show()

In [254]:
# SAVE THE 50 MOST FREQUENT VERBS/PREDICATES/RELATIONS TO CSV
relation_counts = pd.Series(relations).value_counts()
relation_counts.iloc[:50].to_csv("top_50_verbs.csv")

In [255]:

# Each entry of entity_pairs is a [subject, object] list; lists are unhashable, so
# counting them directly is unreliable (TypeError on current pandas). Count tuples instead.
pd.Series([tuple(pair) for pair in entity_pairs]).value_counts().head(50).to_csv("top_50_pairs.csv")

In [2]:
#TYPEDB CLIENT THAT INTERACTS WITH THE TYPEDB SERVER TO INSERT THE IDENTIFIED SEMANTIC TRIPLETS
from typedb.client import *

def build_knowledge_graph():
    """Connect to the TypeDB server at localhost:1729 and load ``kg_df`` through a DATA session.

    The session is opened on the database named by ``typedb_database``; both
    the client and the session are closed when the block exits.
    """
    with TypeDB.core_client("localhost:1729") as client, \
            client.session(typedb_database, SessionType.DATA) as session:
        print("Loading from python into TypeDB ...")
        load_data_into_typedb(kg_df, session)

# ...

def _typeql_string(value):
    """Render ``value`` as a double-quoted TypeQL string literal, escaping backslashes and double quotes."""
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'


def _run_typeql_insert(session, typeql_insert_query):
    """Log and execute one insert query in its own WRITE transaction, then commit it."""
    with session.transaction(TransactionType.WRITE) as transaction:
        print("Executing TypeQL Query: " + typeql_insert_query)
        transaction.query().insert(typeql_insert_query)
        transaction.commit()


def load_data_into_typedb(input, session):
    """Insert subject, object and relation for the first triplets of ``input`` into TypeDB.

    For every selected row three queries are run, each in its own committed
    WRITE transaction: insert the subject, insert the object, then match both
    and insert the ``mined-relation`` carrying the verb.

    Args:
        input: Table with "source", "target" and "edge" columns, indexable as
            ``input[column][i]`` (``kg_df`` in this notebook).
        session: Open TypeDB session used to create the transactions.

    At most ``number_of_triplets_to_load`` rows are loaded, and never more
    than ``len(input)``.

    NOTE(review): rows are still looked up by index label ``i``. ``kg_df`` is
    sorted after ``reset_index`` without resetting again, so label ``i`` is not
    necessarily the i-th row in sorted order - confirm which rows are intended.
    """
    rows_to_load = min(number_of_triplets_to_load, len(input))
    for i in range(0, rows_to_load):
        # Values are escaped so a double quote or backslash inside a token cannot
        # terminate the string literal and break (or alter) the query.
        subject_token = _typeql_string(input["source"][i])
        object_token = _typeql_string(input["target"][i])
        verb_token = _typeql_string(input["edge"][i])

        _run_typeql_insert(
            session,
            'insert $subject isa subject, has token ' + subject_token + ';',
        )
        _run_typeql_insert(
            session,
            'insert $object isa object, has token ' + object_token + ';',
        )
        _run_typeql_insert(
            session,
            'match $subject isa subject, has token ' + subject_token
            + '; $object isa object, has token ' + object_token
            + '; insert $verb (subject: $subject, object: $object) isa mined-relation; $verb has verb '
            + verb_token + ';',
        )

In [68]:
# Load the triplets into TypeDB; build_knowledge_graph connects to a server at localhost:1729.
build_knowledge_graph()

Loading from python into TypeDB ...
Executing TypeQL Query: insert $subject isa subject, has token "southeastasia lzseapscilogdc02:/var File system usage";
Executing TypeQL Query: insert $object isa object, has token "ScienceLogic EM7 Data Collector";
Executing TypeQL Query: match $subject isa subject, has token "southeastasia lzseapscilogdc02:/var File system usage"; $object isa object, has token "ScienceLogic EM7 Data Collector"; insert $verb (subject: $subject, object: $object) isa mined-relation; $verb has verb "exceeded critical";
Executing TypeQL Query: insert $subject isa subject, has token "Group policy update";
Executing TypeQL Query: insert $object isa object, has token "";
Executing TypeQL Query: match $subject isa subject, has token "Group policy update"; $object isa object, has token ""; insert $verb (subject: $subject, object: $object) isa mined-relation; $verb has verb "failing";
Executing TypeQL Query: insert $subject isa subject, has token "40x7597  stock";
Executing T