In [16]:
from google.colab import drive
# drive.mount('/content/gdrive')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import re
import os
import glob
import pandas as pd
import pathlib
from itertools import combinations 
from collections import defaultdict 

In [18]:
#Node structure for graph
class Node:

    def __init__(self,src,dest,wt):
        self.src = src
        self.dest = dest
        self.wt = wt


#Class to represent an un-directed graph using adjacency list representation 
class Graph: 
   
    def __init__(self,vertices): 
        self.V = vertices #No. of vertices 
        self.V_org = vertices 
        self.graph = defaultdict(list) # default dictionary to store graph 
        
        
    # function to add an edge to graph 
    def addEdge(self,u,v,w): 
        self.graph[u].append(Node(u,v,w))
        self.graph[v].append(Node(v,u,w))

        
    #function to print graph
    def printGraph(self):
        s = ""
        for i in self.graph:
            s = s + str(i) + " is connected to "
            print(str(i) + " is connected to ")
            for node in self.graph[i]:
                s = s + str(node.dest) + "(Weight = " + str(node.wt) + ")" + " "
                print(str(node.dest) + "(Weight = " + str(node.wt) + ")" + " ")
            s = s + "\n"
            print("\n")
        return s
    

    #function to get BFS results for a given node till the given level
    def BFS(self, s, max_levels):
        visited = set()
 
        queue = []
 
        queue.append(s)
        visited.add(s)
        level = 0
        result = {}
        while queue:
            aux = []
            result[level] = []
            
            while queue:
                s = queue.pop(0)
                visited.add(s)
                result[level].append(s)
                for node in self.graph[s]:
                    if node.dest not in visited:
                        aux.append(node.dest)
                        visited.add(node.dest)
            level += 1
            if level > max_levels:
                break
            for node in aux:
                queue.append(node)
            
        return result   




class DataHandler:
    def __init__(self, path):
        self.type = pathlib.Path(path).suffix
        self.dataset_location = path
        
#         Use the pandas dataframe to get columns 
        if self.type == '.csv':
#             self.df = pd.read_csv(location, nrows=100)
            self.df = pd.read_csv(path)
            self.df = self.df[self.df['Domain'] == "CS "]
        elif self.type == '.xlsx':
            self.df = pd.read_excel(path)
            self.df = self.df[self.df['Domain'] == "CS "]

    def get_dataframe(self):
        return self.df

            
class keywordNodeNetGen:
    
    def __init__(self, path):
        self.path = path
        self.subsetSize = 2
        self.keysPair = []
        self.keywordsList = []
        self.dictKeywords = {}
        self.dictKeyVsPid = {}
        self.idList = []
        self.colName = []
    #     self.wbObj = openpyxl.load_workbook(self.path)
    #     self.sheetObj = self.wbObj.active    

    # #function to extract list of column headers from .xlsx
    # def extractListCol(self):
    #     maxCol = self.sheetObj.max_column    
    #     # Loop will print all columns name 
    #     for i in range(1, maxCol + 1): 
    #         cellObj = self.sheetObj.cell(row = 1, column = i) 
    #         self.colName.append(cellObj.value) 
    

    # #function to extract column index from its header
    # def extractColNumber(self, strIn):
    #     self.extractListCol()
    #     for x in self.colName:
    #         if x == strIn:
    #             colIndex = self.colName.index(x)+1
    #     return colIndex



    #function to extract list of keywords in all the research papers
    def extractDictOfKeywords(self):
        dh = DataHandler(self.path)
        df = dh.get_dataframe()
        # maxRow = 1000
        # keyIndex = self.extractColNumber('keywords')
        rowIndices = df.index.tolist()
        i=0
        # for rowIndex in df.index.tolist()
        for keyList in df['Yake keywords']:
            if(i<1000): 
                self.dictKeywords[rowIndices[i]] = [keyword.strip().lower() for keyword in re.split(";", keyList) if keyword]
                i=i+1

        return self.dictKeywords


    #function to create list of keywords in all papers
    def generateKeywordsList(self):
        self.dictKeywords = self.extractDictOfKeywords()
        # print(len(self.dictKeywords))
        self.keywordsList = list(set().union(*self.dictKeywords.values()))
        return self.keywordsList
    
    
    #function to find combinations of any 2 keywords
    def rSubset(self): 
        self.keywordsList = self.generateKeywordsList()
        self.keysPair = list(combinations(self.keywordsList, self.subsetSize)) 
        return self.keysPair
    
    
    #function to extract weight
    def pathWeight(self, x, y, dictKeywords, idList):
        edgeWeight = 0
        for idNo in idList:
            if(x in dictKeywords[idNo] and y in dictKeywords[idNo]):
                edgeWeight = edgeWeight + 1
        return edgeWeight
    
    
    #function to generate dictionary having keywords as keys and list of paper ids as value list 
    def generateDictKeyVsPid(self):
        self.keywordsList = self.generateKeywordsList()
        self.idList = self.dictKeywords.keys()
        self.dictKeyVsPid = {key:[pid for pid in self.idList if key in self.dictKeywords[pid]] for key in self.keywordsList}
        return self.dictKeyVsPid

In [19]:
# Driver Function 
if __name__ == "__main__": 
    path = "drive/My Drive/Colab Notebooks/Data.xlsx"
    netGenyake = keywordNodeNetGen(path)
    dictKeyVsPidyake = netGenyake.generateDictKeyVsPid()
    dictKeywordsyake = netGenyake.extractDictOfKeywords()
    idListyake = dictKeywordsyake.keys()
    nodesCountyake = len(netGenyake.generateKeywordsList())
    print(nodesCountyake)
    keyPairListyake = netGenyake.rSubset()
    gyake = Graph(nodesCountyake) 
    for i in keyPairListyake:
        edgeWeightyake = netGenyake.pathWeight(i[0],i[1],dictKeywordsyake,idListyake)
        if edgeWeightyake>1:
            gyake.addEdge(i[0], i[1], edgeWeightyake)
        
        
    gyake.printGraph()

4222
cloud computing is connected to 
computing(Weight = 7) 
information technology(Weight = 3) 
distributed computing(Weight = 2) 
internet(Weight = 2) 
cloud(Weight = 10) 


computing is connected to 
cloud computing(Weight = 7) 
information technology(Weight = 2) 
cloud(Weight = 6) 


information technology is connected to 
cloud computing(Weight = 3) 
computing(Weight = 2) 
cloud(Weight = 3) 


distributed computing is connected to 
cloud computing(Weight = 2) 
cloud(Weight = 2) 


internet is connected to 
cloud computing(Weight = 2) 


cloud is connected to 
cloud computing(Weight = 10) 
computing(Weight = 6) 
information technology(Weight = 3) 
distributed computing(Weight = 2) 


wireless sensor is connected to 
sensor network(Weight = 2) 
sensor networks(Weight = 2) 


sensor network is connected to 
wireless sensor(Weight = 2) 


sensor networks is connected to 
wireless sensor(Weight = 2) 


lung is connected to 
lung cancer(Weight = 2) 
cancer(Weight = 2) 


lung cancer is 

In [34]:
queryKeyYake = 'cloud computing'
queryKeyYake = queryKeyYake.strip().lower()
queryDictYake = gyake.BFS(queryKeyYake, 3)

querySetYake = []

querySetYake.extend([item for sublist in queryDictYake.values() for item in sublist])

In [35]:
print(querySetYake)

['cloud computing', 'computing', 'information technology', 'distributed computing', 'internet', 'cloud']


In [23]:
dh = DataHandler('drive/My Drive/Colab Notebooks/Data.xlsx')
dataset = dh.get_dataframe()
dataset = dataset.reset_index()

In [31]:
# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and rake extracted keywords from abstract
def predict_rake(input_keywords, common_words_count):
    res = {}
    ran = len(dataset['Rake keywords'])
    for i in range(1000):
        val = dataset['Rake keywords'][i]
        keywords = val.split(";")
        com = calc_common(input_keywords, keywords)
        if com >= common_words_count:
            res[i] = com
    res = dict(sorted(res.items(), key = lambda kv:(kv[1], kv[0]), reverse = True))
    for key in res:
        res[key] = [res[key], dataset['area'][key].strip().lower()]
    return res

# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and yake extracted keywords from abstract
def predict_yake(input_keywords, common_words_count):
    res = {}
    ran = len(dataset['Yake keywords'])
    for i in range(1000):
        val = dataset['Yake keywords'][i]
        keywords = val.split(";")
        com = calc_common(input_keywords, keywords)
        if com >= common_words_count:
            res[i] = com
    res = dict(sorted(res.items(), key = lambda kv:(kv[1], kv[0]), reverse = True))
    for key in res:
        res[key] = [res[key], dataset['area'][key].strip().lower()]
    return res

# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and author labelled keywords from abstract
def predict_author_labelled(input_keywords, common_words_count):
    res = {}
    ran = len(dataset['keywords'])
    for i in range(1000):
        val = dataset['keywords'][i]
        keywords = val.split(";")
        com = calc_common(input_keywords, keywords)
        if com >= common_words_count:
            res[i] = com
    res = dict(sorted(res.items(), key = lambda kv:(kv[1], kv[0]), reverse = True))
    for key in res:
        res[key] = [res[key], dataset['area'][key].strip().lower()]
    return res

def calc_common(input_keywords, keywords):
    cou = 0
    for w1 in input_keywords:
        for w2 in keywords:
            if(w1.strip().lower() == w2.strip().lower()):
                cou+=1
    return cou

In [39]:
resultYake_3 = predict_yake(querySetYake, 3)

In [40]:
print(resultYake_3)

{911: [4, 'network security'], 902: [4, 'distributed computing'], 912: [3, 'distributed computing'], 909: [3, 'distributed computing'], 907: [3, 'distributed computing'], 905: [3, 'network security'], 900: [3, 'distributed computing'], 899: [3, 'network security'], 895: [3, 'distributed computing']}


In [41]:
resultYake_1 = predict_yake(querySetYake, 1)

In [42]:
print(resultYake_1)

{911: [4, 'network security'], 902: [4, 'distributed computing'], 912: [3, 'distributed computing'], 909: [3, 'distributed computing'], 907: [3, 'distributed computing'], 905: [3, 'network security'], 900: [3, 'distributed computing'], 899: [3, 'network security'], 895: [3, 'distributed computing'], 913: [2, 'distributed computing'], 910: [2, 'distributed computing'], 906: [2, 'distributed computing'], 903: [2, 'distributed computing'], 898: [2, 'operating systems'], 897: [2, 'distributed computing'], 924: [1, 'parallel computing'], 917: [1, 'relational databases'], 916: [1, 'algorithm design'], 914: [1, 'network security'], 908: [1, 'network security'], 904: [1, 'distributed computing'], 896: [1, 'distributed computing'], 894: [1, 'distributed computing'], 805: [1, 'cryptography'], 790: [1, 'distributed computing'], 788: [1, 'distributed computing'], 603: [1, 'network security'], 572: [1, 'distributed computing'], 511: [1, 'distributed computing'], 451: [1, 'distributed computing'], 4