In [1]:
from google.colab import drive
# drive.mount('/content/gdrive')
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
import os
import glob
import pandas as pd
import pathlib
from itertools import combinations 
from collections import defaultdict 

In [3]:
#Node structure for graph
class Node:

    def __init__(self,src,dest,wt):
        self.src = src
        self.dest = dest
        self.wt = wt


#Class to represent an un-directed graph using adjacency list representation 
class Graph: 
   
    def __init__(self,vertices): 
        self.V = vertices #No. of vertices 
        self.V_org = vertices 
        self.graph = defaultdict(list) # default dictionary to store graph 
        
        
    # function to add an edge to graph 
    def addEdge(self,u,v,w): 
        self.graph[u].append(Node(u,v,w))
        self.graph[v].append(Node(v,u,w))

        
    #function to print graph
    def printGraph(self):
        s = ""
        for i in self.graph:
            s = s + str(i) + " is connected to "
            print(str(i) + " is connected to ")
            for node in self.graph[i]:
                s = s + str(node.dest) + "(Weight = " + str(node.wt) + ")" + " "
                print(str(node.dest) + "(Weight = " + str(node.wt) + ")" + " ")
            s = s + "\n"
            print("\n")
        return s




class DataHandler:
    def __init__(self, path):
        self.type = pathlib.Path(path).suffix
        self.dataset_location = path
        
#         Use the pandas dataframe to get columns 
        if self.type == '.csv':
#             self.df = pd.read_csv(location, nrows=100)
            self.df = pd.read_csv(path)
            self.df = self.df[self.df['Domain'] == "CS "]
        elif self.type == '.xlsx':
            self.df = pd.read_excel(path)
            self.df = self.df[self.df['Domain'] == "CS "]

    def get_dataframe(self):
        return self.df

            
class keywordNodeNetGen:
    
    def __init__(self, path):
        self.path = path
        self.subsetSize = 2
        self.keysPair = []
        self.keywordsList = []
        self.dictKeywords = {}
        self.dictKeyVsPid = {}
        self.idList = []
        self.colName = []
    #     self.wbObj = openpyxl.load_workbook(self.path)
    #     self.sheetObj = self.wbObj.active    

    # #function to extract list of column headers from .xlsx
    # def extractListCol(self):
    #     maxCol = self.sheetObj.max_column    
    #     # Loop will print all columns name 
    #     for i in range(1, maxCol + 1): 
    #         cellObj = self.sheetObj.cell(row = 1, column = i) 
    #         self.colName.append(cellObj.value) 
    

    # #function to extract column index from its header
    # def extractColNumber(self, strIn):
    #     self.extractListCol()
    #     for x in self.colName:
    #         if x == strIn:
    #             colIndex = self.colName.index(x)+1
    #     return colIndex



    #function to extract list of keywords in all the research papers
    def extractDictOfKeywords(self):
        dh = DataHandler(self.path)
        df = dh.get_dataframe()
        # maxRow = 1000
        # keyIndex = self.extractColNumber('keywords')
        rowIndices = df.index.tolist()
        i=0
        # for rowIndex in df.index.tolist()
        for keyList in df['keywords']:
            if(i<1000): 
                self.dictKeywords[rowIndices[i]] = [keyword.strip() for keyword in re.split(";", keyList) if keyword]
                i=i+1

        return self.dictKeywords


    #function to create list of keywords in all papers
    def generateKeywordsList(self):
        self.dictKeywords = self.extractDictOfKeywords()
        # print(len(self.dictKeywords))
        self.keywordsList = list(set().union(*self.dictKeywords.values()))
        return self.keywordsList
    
    
    #function to find combinations of any 2 keywords
    def rSubset(self): 
        self.keywordsList = self.generateKeywordsList()
        self.keysPair = list(combinations(self.keywordsList, self.subsetSize)) 
        return self.keysPair
    
    
    #function to extract weight
    def pathWeight(self, x, y, dictKeywords, idList):
        edgeWeight = 0
        for idNo in idList:
            if(x in dictKeywords[idNo] and y in dictKeywords[idNo]):
                edgeWeight = edgeWeight + 1
        return edgeWeight
    
    
    #function to generate dictionary having keywords as keys and list of paper ids as value list 
    def generateDictKeyVsPid(self):
        self.keywordsList = self.generateKeywordsList()
        self.idList = self.dictKeywords.keys()
        self.dictKeyVsPid = {key:[pid for pid in self.idList if key in self.dictKeywords[pid]] for key in self.keywordsList}
        return self.dictKeyVsPid

In [4]:
# Driver Function 
if __name__ == "__main__": 
    path = "drive/My Drive/Colab Notebooks/Data.xlsx"
    netGen = keywordNodeNetGen(path)
    dictKeyVsPid = netGen.generateDictKeyVsPid()
    dictKeywords = netGen.extractDictOfKeywords()
    idList = dictKeywords.keys()
    nodesCount = len(netGen.generateKeywordsList())
    print(nodesCount)
    keyPairList = netGen.rSubset()
    g = Graph(nodesCount) 
    for i in keyPairList:
        edgeWeight = netGen.pathWeight(i[0],i[1],dictKeywords,idList)
        if edgeWeight>1:
            g.addEdge(i[0], i[1], edgeWeight)
        
        
    g.printGraph()

4381
DNA methylation is connected to 
gene expression(Weight = 2) 


gene expression is connected to 
DNA methylation(Weight = 2) 


security is connected to 
cloud computing(Weight = 4) 
privacy(Weight = 2) 


cloud computing is connected to 
security(Weight = 4) 
privacy(Weight = 2) 
virtualization(Weight = 2) 


privacy is connected to 
security(Weight = 2) 
cloud computing(Weight = 2) 


Resting-state fMRI is connected to 
Alzheimer's disease (AD)(Weight = 2) 
Mild cognitive impairment (MCI)(Weight = 2) 
Machine learning approach(Weight = 2) 


Alzheimer's disease (AD) is connected to 
Resting-state fMRI(Weight = 2) 
Mild cognitive impairment (MCI)(Weight = 2) 
Machine learning approach(Weight = 2) 


Mild cognitive impairment (MCI) is connected to 
Resting-state fMRI(Weight = 2) 
Alzheimer's disease (AD)(Weight = 2) 
Machine learning approach(Weight = 2) 


Machine learning approach is connected to 
Resting-state fMRI(Weight = 2) 
Alzheimer's disease (AD)(Weight = 2) 
Mild cogniti

In [5]:
#Node structure for graph
class Node:

    def __init__(self,src,dest,wt):
        self.src = src
        self.dest = dest
        self.wt = wt


#Class to represent an un-directed graph using adjacency list representation 
class Graph: 
   
    def __init__(self,vertices): 
        self.V = vertices #No. of vertices 
        self.V_org = vertices 
        self.graph = defaultdict(list) # default dictionary to store graph 
        
        
    # function to add an edge to graph 
    def addEdge(self,u,v,w): 
        self.graph[u].append(Node(u,v,w))
        self.graph[v].append(Node(v,u,w))

        
    #function to print graph
    def printGraph(self):
        s = ""
        for i in self.graph:
            s = s + str(i) + " is connected to "
            print(str(i) + " is connected to ")
            for node in self.graph[i]:
                s = s + str(node.dest) + "(Weight = " + str(node.wt) + ")" + " "
                print(str(node.dest) + "(Weight = " + str(node.wt) + ")" + " ")
            s = s + "\n"
            print("\n")
        return s




class DataHandler:
    def __init__(self, path):
        self.type = pathlib.Path(path).suffix
        self.dataset_location = path
        
#         Use the pandas dataframe to get columns 
        if self.type == '.csv':
#             self.df = pd.read_csv(location, nrows=100)
            self.df = pd.read_csv(path)
            self.df = self.df[self.df['Domain'] == "CS "]
        elif self.type == '.xlsx':
            self.df = pd.read_excel(path)
            self.df = self.df[self.df['Domain'] == "CS "]

    def get_dataframe(self):
        return self.df

            
class keywordNodeNetGen:
    
    def __init__(self, path):
        self.path = path
        self.subsetSize = 2
        self.keysPair = []
        self.keywordsList = []
        self.dictKeywords = {}
        self.dictKeyVsPid = {}
        self.idList = []
        self.colName = []
    #     self.wbObj = openpyxl.load_workbook(self.path)
    #     self.sheetObj = self.wbObj.active    

    # #function to extract list of column headers from .xlsx
    # def extractListCol(self):
    #     maxCol = self.sheetObj.max_column    
    #     # Loop will print all columns name 
    #     for i in range(1, maxCol + 1): 
    #         cellObj = self.sheetObj.cell(row = 1, column = i) 
    #         self.colName.append(cellObj.value) 
    

    # #function to extract column index from its header
    # def extractColNumber(self, strIn):
    #     self.extractListCol()
    #     for x in self.colName:
    #         if x == strIn:
    #             colIndex = self.colName.index(x)+1
    #     return colIndex



    #function to extract list of keywords in all the research papers
    def extractDictOfKeywords(self):
        dh = DataHandler(self.path)
        df = dh.get_dataframe()
        # maxRow = 1000
        # keyIndex = self.extractColNumber('keywords')
        rowIndices = df.index.tolist()
        i=0
        # for rowIndex in df.index.tolist()
        for keyList in df['Yake keywords']:
            if(i<1000): 
                self.dictKeywords[rowIndices[i]] = [keyword.strip() for keyword in re.split(";", keyList) if keyword]
                i=i+1

        return self.dictKeywords


    #function to create list of keywords in all papers
    def generateKeywordsList(self):
        self.dictKeywords = self.extractDictOfKeywords()
        # print(len(self.dictKeywords))
        self.keywordsList = list(set().union(*self.dictKeywords.values()))
        return self.keywordsList
    
    
    #function to find combinations of any 2 keywords
    def rSubset(self): 
        self.keywordsList = self.generateKeywordsList()
        self.keysPair = list(combinations(self.keywordsList, self.subsetSize)) 
        return self.keysPair
    
    
    #function to extract weight
    def pathWeight(self, x, y, dictKeywords, idList):
        edgeWeight = 0
        for idNo in idList:
            if(x in dictKeywords[idNo] and y in dictKeywords[idNo]):
                edgeWeight = edgeWeight + 1
        return edgeWeight
    
    
    #function to generate dictionary having keywords as keys and list of paper ids as value list 
    def generateDictKeyVsPid(self):
        self.keywordsList = self.generateKeywordsList()
        self.idList = self.dictKeywords.keys()
        self.dictKeyVsPid = {key:[pid for pid in self.idList if key in self.dictKeywords[pid]] for key in self.keywordsList}
        return self.dictKeyVsPid

In [6]:
# Driver Function 
if __name__ == "__main__": 
    path = "drive/My Drive/Colab Notebooks/Data.xlsx"
    netGenyake = keywordNodeNetGen(path)
    dictKeyVsPidyake = netGenyake.generateDictKeyVsPid()
    dictKeywordsyake = netGenyake.extractDictOfKeywords()
    idListyake = dictKeywordsyake.keys()
    nodesCountyake = len(netGenyake.generateKeywordsList())
    print(nodesCountyake)
    keyPairListyake = netGenyake.rSubset()
    gyake = Graph(nodesCountyake) 
    for i in keyPairListyake:
        edgeWeightyake = netGenyake.pathWeight(i[0],i[1],dictKeywordsyake,idListyake)
        if edgeWeightyake>1:
            gyake.addEdge(i[0], i[1], edgeWeightyake)
        
        
    gyake.printGraph()

4222
series data is connected to 
time series(Weight = 2) 


time series is connected to 
series data(Weight = 2) 


language processing is connected to 
natural language(Weight = 2) 


natural language is connected to 
language processing(Weight = 2) 


feature is connected to 
selection methods(Weight = 2) 
feature selection(Weight = 2) 
selection(Weight = 2) 


selection methods is connected to 
feature(Weight = 2) 
feature selection(Weight = 2) 
selection(Weight = 2) 


feature selection is connected to 
feature(Weight = 2) 
selection methods(Weight = 2) 
selection(Weight = 2) 


selection is connected to 
feature(Weight = 2) 
selection methods(Weight = 2) 
feature selection(Weight = 2) 


security is connected to 
network security(Weight = 2) 
network(Weight = 3) 


network security is connected to 
security(Weight = 2) 
intrusion detection(Weight = 2) 
network(Weight = 2) 


network is connected to 
security(Weight = 3) 
sdn(Weight = 2) 
algorithm(Weight = 2) 
network security(We

In [7]:
#Node structure for graph
class Node:

    def __init__(self,src,dest,wt):
        self.src = src
        self.dest = dest
        self.wt = wt


#Class to represent an un-directed graph using adjacency list representation 
class Graph: 
   
    def __init__(self,vertices): 
        self.V = vertices #No. of vertices 
        self.V_org = vertices 
        self.graph = defaultdict(list) # default dictionary to store graph 
        
        
    # function to add an edge to graph 
    def addEdge(self,u,v,w): 
        self.graph[u].append(Node(u,v,w))
        self.graph[v].append(Node(v,u,w))

        
    #function to print graph
    def printGraph(self):
        s = ""
        for i in self.graph:
            s = s + str(i) + " is connected to "
            print(str(i) + " is connected to ")
            for node in self.graph[i]:
                s = s + str(node.dest) + "(Weight = " + str(node.wt) + ")" + " "
                print(str(node.dest) + "(Weight = " + str(node.wt) + ")" + " ")
            s = s + "\n"
            print("\n")
        return s




class DataHandler:
    def __init__(self, path):
        self.type = pathlib.Path(path).suffix
        self.dataset_location = path
        
#         Use the pandas dataframe to get columns 
        if self.type == '.csv':
#             self.df = pd.read_csv(location, nrows=100)
            self.df = pd.read_csv(path)
            self.df = self.df[self.df['Domain'] == "CS "]
        elif self.type == '.xlsx':
            self.df = pd.read_excel(path)
            self.df = self.df[self.df['Domain'] == "CS "]

    def get_dataframe(self):
        return self.df

            
class keywordNodeNetGen:
    
    def __init__(self, path):
        self.path = path
        self.subsetSize = 2
        self.keysPair = []
        self.keywordsList = []
        self.dictKeywords = {}
        self.dictKeyVsPid = {}
        self.idList = []
        self.colName = []
    #     self.wbObj = openpyxl.load_workbook(self.path)
    #     self.sheetObj = self.wbObj.active    

    # #function to extract list of column headers from .xlsx
    # def extractListCol(self):
    #     maxCol = self.sheetObj.max_column    
    #     # Loop will print all columns name 
    #     for i in range(1, maxCol + 1): 
    #         cellObj = self.sheetObj.cell(row = 1, column = i) 
    #         self.colName.append(cellObj.value) 
    

    # #function to extract column index from its header
    # def extractColNumber(self, strIn):
    #     self.extractListCol()
    #     for x in self.colName:
    #         if x == strIn:
    #             colIndex = self.colName.index(x)+1
    #     return colIndex



    #function to extract list of keywords in all the research papers
    def extractDictOfKeywords(self):
        dh = DataHandler(self.path)
        df = dh.get_dataframe()
        # maxRow = 1000
        # keyIndex = self.extractColNumber('keywords')
        rowIndices = df.index.tolist()
        i=0
        # for rowIndex in df.index.tolist()
        for keyList in df['Rake keywords']:
            if(i<1000): 
                self.dictKeywords[rowIndices[i]] = [keyword.strip() for keyword in re.split(";", keyList) if keyword]
                i=i+1

        return self.dictKeywords


    #function to create list of keywords in all papers
    def generateKeywordsList(self):
        self.dictKeywords = self.extractDictOfKeywords()
        # print(len(self.dictKeywords))
        self.keywordsList = list(set().union(*self.dictKeywords.values()))
        return self.keywordsList
    
    
    #function to find combinations of any 2 keywords
    def rSubset(self): 
        self.keywordsList = self.generateKeywordsList()
        self.keysPair = list(combinations(self.keywordsList, self.subsetSize)) 
        return self.keysPair
    
    
    #function to extract weight
    def pathWeight(self, x, y, dictKeywords, idList):
        edgeWeight = 0
        for idNo in idList:
            if(x in dictKeywords[idNo] and y in dictKeywords[idNo]):
                edgeWeight = edgeWeight + 1
        return edgeWeight
    
    
    #function to generate dictionary having keywords as keys and list of paper ids as value list 
    def generateDictKeyVsPid(self):
        self.keywordsList = self.generateKeywordsList()
        self.idList = self.dictKeywords.keys()
        self.dictKeyVsPid = {key:[pid for pid in self.idList if key in self.dictKeywords[pid]] for key in self.keywordsList}
        return self.dictKeyVsPid

In [8]:
# Driver Function 
if __name__ == "__main__": 
    path = "drive/My Drive/Colab Notebooks/Data.xlsx"
    netGen = keywordNodeNetGen(path)
    dictKeyVsPid = netGen.generateDictKeyVsPid()
    dictKeywords = netGen.extractDictOfKeywords()
    idList = dictKeywords.keys()
    nodesCount = len(netGen.generateKeywordsList())
    print(nodesCount)
    keyPairList = netGen.rSubset()
    g = Graph(nodesCount) 
    for i in keyPairList:
        edgeWeight = netGen.pathWeight(i[0],i[1],dictKeywords,idList)
        if edgeWeight>1:
            g.addEdge(i[0], i[1], edgeWeight)
        
        
    g.printGraph()

4222
series data is connected to 
time series(Weight = 2) 


time series is connected to 
series data(Weight = 2) 


language processing is connected to 
natural language(Weight = 2) 


natural language is connected to 
language processing(Weight = 2) 


feature is connected to 
selection methods(Weight = 2) 
feature selection(Weight = 2) 
selection(Weight = 2) 


selection methods is connected to 
feature(Weight = 2) 
feature selection(Weight = 2) 
selection(Weight = 2) 


feature selection is connected to 
feature(Weight = 2) 
selection methods(Weight = 2) 
selection(Weight = 2) 


selection is connected to 
feature(Weight = 2) 
selection methods(Weight = 2) 
feature selection(Weight = 2) 


security is connected to 
network security(Weight = 2) 
network(Weight = 3) 


network security is connected to 
security(Weight = 2) 
intrusion detection(Weight = 2) 
network(Weight = 2) 


network is connected to 
security(Weight = 3) 
sdn(Weight = 2) 
algorithm(Weight = 2) 
network security(We