In [34]:
# Colab-only setup: mount Google Drive at /content/drive so later cells can
# read the dataset from it.
from google.colab import drive
# Earlier mount point, kept for reference:
# drive.mount('/content/gdrive')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
import re
import os
import glob
import pandas as pd
import pathlib
from itertools import combinations 
from collections import defaultdict 

In [36]:
#Node structure for graph
class Node:
    """One adjacency-list entry: an edge leaving ``src`` towards ``dest``.

    Attributes:
        src:  label of the vertex the edge starts from.
        dest: label of the vertex the edge points to.
        wt:   weight attached to the edge.
    """

    def __init__(self, src, dest, wt):
        # The three values are stored exactly as given; no validation is done.
        self.src, self.dest, self.wt = src, dest, wt


#Class to represent an un-directed graph using adjacency list representation 
class Graph:
    """Undirected weighted graph kept as an adjacency list.

    Attributes:
        V:      number of vertices passed to the constructor (not updated by
                ``addEdge``).
        V_org:  copy of the constructor's vertex count.
        graph:  ``defaultdict(list)`` mapping a vertex label to the list of
                ``Node`` records for the edges leaving that vertex.
    """

    def __init__(self, vertices):
        self.V = vertices  # No. of vertices
        self.V_org = vertices
        self.graph = defaultdict(list)  # vertex label -> list of Node edges

    def addEdge(self, u, v, w):
        """Add an undirected edge ``u <-> v`` of weight ``w``.

        The edge is recorded twice, once in each endpoint's adjacency list.
        """
        self.graph[u].append(Node(u, v, w))
        self.graph[v].append(Node(v, u, w))

    def printGraph(self):
        """Print every vertex with its neighbours and return the same text.

        Returns:
            str: one line per vertex of the form
            ``"<vertex> is connected to <dest>(Weight = <wt>) ..."``.
        """
        pieces = []
        for vertex, edges in self.graph.items():
            header = str(vertex) + " is connected to "
            print(header)
            pieces.append(header)
            for edge in edges:
                entry = str(edge.dest) + "(Weight = " + str(edge.wt) + ")" + " "
                print(entry)
                pieces.append(entry)
            print("\n")
            pieces.append("\n")
        return "".join(pieces)

    def BFS(self, s, max_levels):
        """Breadth-first search from ``s``, grouped by distance.

        Args:
            s: label of the start vertex. It is always reported at level 0,
                even when it has no edges in the graph.
            max_levels: deepest level (number of hops from ``s``) to report.

        Returns:
            dict: ``{level: [vertices first reached at that level]}`` for
            levels ``0 .. max_levels``. Levels nobody reaches are omitted.
        """
        visited = {s}
        frontier = [s]
        result = {}
        level = 0
        while True:
            result[level] = frontier
            if level >= max_levels:
                break

            next_frontier = []
            for vertex in frontier:
                # .get() rather than [] so that looking up an unknown vertex
                # does not insert an empty adjacency list into the defaultdict.
                for edge in self.graph.get(vertex, ()):
                    if edge.dest not in visited:
                        visited.add(edge.dest)
                        next_frontier.append(edge.dest)

            if not next_frontier:
                break
            frontier = next_frontier
            level += 1

        return result




class DataHandler:
    """Load a ``.csv`` or ``.xlsx`` dataset and keep only the CS-domain rows.

    Attributes:
        type:             file suffix of ``path`` exactly as written.
        dataset_location: the ``path`` given to the constructor.
        df:               DataFrame of the rows whose ``'Domain'`` column
                          equals ``"CS "`` (note the trailing space).
    """

    def __init__(self, path):
        """Read ``path`` with pandas and filter it to the CS domain.

        Args:
            path: location of a ``.csv`` or ``.xlsx`` file containing a
                ``'Domain'`` column.

        Raises:
            ValueError: if the file suffix is neither ``.csv`` nor ``.xlsx``.
        """
        self.type = pathlib.Path(path).suffix
        self.dataset_location = path

        # Compare case-insensitively so "Data.XLSX" is accepted as well.
        suffix = self.type.lower()
        if suffix == '.csv':
            frame = pd.read_csv(path)
        elif suffix == '.xlsx':
            frame = pd.read_excel(path)
        else:
            # Fail here instead of leaving self.df undefined, which would only
            # surface later as an AttributeError in get_dataframe().
            raise ValueError(
                "Unsupported dataset type {!r} for {!r}; expected .csv or .xlsx".format(
                    self.type, str(path)
                )
            )

        self.df = frame[frame['Domain'] == "CS "]

    def get_dataframe(self):
        """Return the filtered DataFrame built by the constructor."""
        return self.df

            
class keywordNodeNetGen:
    """Derive keyword co-occurrence data from the ``'keywords'`` column of a dataset.

    The dataset is loaded through ``DataHandler(path)``. Every call to
    ``extractDictOfKeywords`` (and therefore to ``generateKeywordsList``,
    ``rSubset`` and ``generateDictKeyVsPid``) reads the file again.

    Attributes:
        path:          dataset location handed to ``DataHandler``.
        maxRows:       number of leading rows to use, or ``None`` for all rows.
        subsetSize:    size of the keyword combinations built by ``rSubset`` (2).
        keysPair:      list of keyword tuples produced by ``rSubset``.
        keywordsList:  list of distinct keywords across the processed rows.
        dictKeywords:  ``{row index: [keyword, ...]}``.
        dictKeyVsPid:  ``{keyword: [row index, ...]}``.
        idList:        keys view of ``dictKeywords`` after ``generateDictKeyVsPid``.
        colName:       unused by the methods in this class.
    """

    def __init__(self, path, maxRows=1000):
        self.path = path
        self.maxRows = maxRows
        self.subsetSize = 2
        self.keysPair = []
        self.keywordsList = []
        self.dictKeywords = {}
        self.dictKeyVsPid = {}
        self.idList = []
        self.colName = []

    @staticmethod
    def _parseKeywords(rawKeywords):
        """Split a ``;``-separated cell into stripped, lower-cased keywords.

        Empty and whitespace-only pieces are dropped. Non-string cells (for
        example NaN for a missing value) yield an empty list.
        """
        if not isinstance(rawKeywords, str):
            return []
        cleaned = (piece.strip().lower() for piece in rawKeywords.split(";"))
        return [keyword for keyword in cleaned if keyword]

    def extractDictOfKeywords(self):
        """Load the dataset and map each row index to its list of keywords.

        Only the first ``maxRows`` rows are used (all rows when ``maxRows`` is
        ``None``).

        Returns:
            dict: ``{row index: [keyword, ...]}``, also stored on
            ``self.dictKeywords``.
        """
        df = DataHandler(self.path).get_dataframe()
        keywordColumn = df['keywords']
        if self.maxRows is not None:
            keywordColumn = keywordColumn.head(self.maxRows)

        self.dictKeywords = {
            rowIndex: self._parseKeywords(rawKeywords)
            for rowIndex, rawKeywords in keywordColumn.items()
        }
        return self.dictKeywords

    def generateKeywordsList(self):
        """Return the distinct keywords over all processed rows.

        The list comes from a set, so its order is arbitrary.
        """
        self.dictKeywords = self.extractDictOfKeywords()
        self.keywordsList = list(set().union(*self.dictKeywords.values()))
        return self.keywordsList

    def rSubset(self):
        """Return every combination of ``subsetSize`` distinct keywords."""
        self.keywordsList = self.generateKeywordsList()
        self.keysPair = list(combinations(self.keywordsList, self.subsetSize))
        return self.keysPair

    def pathWeight(self, x, y, dictKeywords, idList):
        """Count the ids in ``idList`` whose keyword list contains both ``x`` and ``y``.

        Args:
            x, y: the two keywords.
            dictKeywords: ``{id: [keyword, ...]}``.
            idList: iterable of ids to inspect; each must be a key of
                ``dictKeywords``.
        """
        return sum(
            1
            for idNo in idList
            if x in dictKeywords[idNo] and y in dictKeywords[idNo]
        )

    def generateDictKeyVsPid(self):
        """Build the inverted index ``{keyword: [row index, ...]}``.

        Keys follow the order of ``keywordsList``; each value lists the rows
        containing the keyword in ``dictKeywords`` order, one entry per row.
        """
        self.keywordsList = self.generateKeywordsList()
        self.idList = self.dictKeywords.keys()

        keywordToPids = {key: [] for key in self.keywordsList}
        for pid, keywords in self.dictKeywords.items():
            # dict.fromkeys drops repeats within a row so a paper id is
            # listed once per keyword.
            for key in dict.fromkeys(keywords):
                keywordToPids[key].append(pid)

        self.dictKeyVsPid = keywordToPids
        return self.dictKeyVsPid

In [37]:
# Driver Function 
if __name__ == "__main__":
    path = "drive/My Drive/Colab Notebooks/Data.xlsx"
    netGen = keywordNodeNetGen(path)

    # generateDictKeyVsPid() loads the dataset and also fills
    # netGen.dictKeywords / netGen.keywordsList, so those are read back from
    # the instance instead of re-reading the spreadsheet for each of them.
    dictKeyVsPid = netGen.generateDictKeyVsPid()
    dictKeywords = netGen.dictKeywords
    idList = dictKeywords.keys()
    nodesCount = len(netGen.keywordsList)
    print(nodesCount)

    keyPairList = netGen.rSubset()

    # Papers per keyword as sets: the number of papers containing both
    # keywords of a pair is then the size of a set intersection, rather than
    # a scan over every paper for every pair.
    paperSets = {keyword: set(pids) for keyword, pids in dictKeyVsPid.items()}

    g = Graph(nodesCount)
    for firstKey, secondKey in keyPairList:
        edgeWeight = len(paperSets[firstKey] & paperSets[secondKey])
        # Only keep keyword pairs that co-occur in more than one paper.
        if edgeWeight > 1:
            g.addEdge(firstKey, secondKey, edgeWeight)

    g.printGraph()

4160
image segmentation is connected to 
feature extraction(Weight = 2) 


feature extraction is connected to 
image segmentation(Weight = 2) 
medical image processing(Weight = 2) 


decryption is connected to 
encryption(Weight = 2) 
cryptography(Weight = 2) 


encryption is connected to 
decryption(Weight = 2) 


cryptography is connected to 
decryption(Weight = 2) 
authentication(Weight = 2) 
chaos(Weight = 3) 


mapreduce is connected to 
big data(Weight = 2) 
machine learning(Weight = 2) 
hadoop(Weight = 4) 


big data is connected to 
mapreduce(Weight = 2) 
nosql(Weight = 4) 
hadoop(Weight = 4) 
relational database(Weight = 2) 
structured data(Weight = 2) 
cloud computing(Weight = 4) 
parallel computing(Weight = 3) 


machine learning is connected to 
mapreduce(Weight = 2) 
pattern recognition(Weight = 2) 
deep learning(Weight = 2) 
feature selection(Weight = 2) 
image analysis(Weight = 2) 
cell segmentation(Weight = 2) 
classification(Weight = 2) 
data mining(Weight = 3) 
suppor

In [49]:
# Normalise the query keyword the same way the graph's keywords were
# normalised (stripped, lower-cased).
queryKeyAuthor = 'Cloud computing'.strip().lower()

# Keywords reachable from the query within 3 hops, grouped by hop distance.
queryDictAuthor = g.BFS(queryKeyAuthor, 3)

# Flatten the per-level lists into one list, nearest levels first.
querySetAuthor = [
    keyword
    for levelKeywords in queryDictAuthor.values()
    for keyword in levelKeywords
]

In [50]:
# Show the flattened list of keywords reached by the BFS from the query keyword.
print(querySetAuthor)

['cloud computing', 'optimization', 'virtualization', 'privacy', 'big data', 'security', 'distributed computing', 'compliance', 'performance', 'resource sharing', 'mapreduce', 'nosql', 'hadoop', 'relational database', 'structured data', 'parallel computing', 'simulation', 'authentication', 'network', 'reachability', 'machine learning', 'domain decomposition', 'software', 'cryptography']


In [51]:
# Load the CS-only rows again for the prediction functions defined below.
dh = DataHandler('drive/My Drive/Colab Notebooks/Data.xlsx')
dataset = dh.get_dataframe()
# reset_index() renumbers the rows 0..n-1 (the previous index is kept as a
# column), which is what lets the predict_* functions address rows as
# dataset[column][i] with i counted from 0.
dataset = dataset.reset_index()

In [72]:
# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and rake extracted keywords from abstract
def _predict_by_column(column, input_keywords, common_words_count, max_rows=1000):
    """Rank rows of the global ``dataset`` by keyword overlap with ``input_keywords``.

    Args:
        column: name of the ``dataset`` column holding ``;``-separated keywords.
        input_keywords: iterable of query keywords.
        common_words_count: minimum number of matches (as counted by
            ``calc_common``) a row needs to be included.
        max_rows: only rows ``0 .. max_rows - 1`` are examined.

    Returns:
        dict: ``{row label: [match count, area]}`` ordered by match count and
        then row label, both descending. ``area`` is the row's ``'area'``
        value, stripped and lower-cased.
    """
    matches = {}
    # Clamp to the real row count so frames shorter than max_rows do not
    # raise on a missing row label.
    row_count = min(len(dataset[column]), max_rows)
    for i in range(row_count):
        val = dataset[column][i]
        # Cells without text (e.g. NaN for a missing value) have no keywords.
        if not isinstance(val, str):
            continue
        com = calc_common(input_keywords, val.split(";"))
        if com >= common_words_count:
            matches[i] = com

    ranked = sorted(matches.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    return {
        row: [count, dataset['area'][row].strip().lower()]
        for row, count in ranked
    }


def predict_rake(input_keywords, common_words_count):
    """Rank papers by matches between ``input_keywords`` and the 'Rake keywords' column.

    Returns a dict ``{row: [match count, area]}`` sorted by match count, then
    row, both descending; rows with fewer than ``common_words_count`` matches
    are left out.
    """
    return _predict_by_column('Rake keywords', input_keywords, common_words_count)


def predict_yake(input_keywords, common_words_count):
    """Rank papers by matches between ``input_keywords`` and the 'Yake keywords' column.

    Returns a dict ``{row: [match count, area]}`` sorted by match count, then
    row, both descending; rows with fewer than ``common_words_count`` matches
    are left out.
    """
    return _predict_by_column('Yake keywords', input_keywords, common_words_count)


def predict_author_labelled(input_keywords, common_words_count):
    """Rank papers by matches between ``input_keywords`` and the author-labelled 'keywords' column.

    Returns a dict ``{row: [match count, area]}`` sorted by match count, then
    row, both descending; rows with fewer than ``common_words_count`` matches
    are left out.
    """
    return _predict_by_column('keywords', input_keywords, common_words_count)

def calc_common(input_keywords, keywords):
    """Count matching (input keyword, keyword) pairs, ignoring case and outer whitespace.

    Every pairing counts, so repeated entries on either side are counted with
    their multiplicity.

    Args:
        input_keywords: iterable of query keyword strings.
        keywords: iterable of keyword strings to compare against.

    Returns:
        int: number of pairs whose stripped, lower-cased forms are equal.
    """
    # Normalise and tally the candidate keywords once instead of re-stripping
    # every one of them for each input keyword.
    occurrences = {}
    for word in keywords:
        token = word.strip().lower()
        occurrences[token] = occurrences.get(token, 0) + 1

    return sum(occurrences.get(word.strip().lower(), 0) for word in input_keywords)

In [73]:
# Rank papers that share at least 1 author-labelled keyword with the
# BFS-expanded query keyword list.
resultAuthor = predict_author_labelled(querySetAuthor, 1)

In [74]:
# Show the ranking as {row: [match count, area]}.
print(resultAuthor)

{791: [5, 'relational databases'], 914: [4, 'network security'], 902: [4, 'distributed computing'], 583: [4, 'relational databases'], 919: [3, 'distributed computing'], 912: [3, 'distributed computing'], 786: [3, 'distributed computing'], 785: [3, 'relational databases'], 525: [3, 'distributed computing'], 401: [3, 'network security'], 326: [3, 'image processing'], 320: [3, 'data structures'], 318: [3, 'distributed computing'], 929: [2, 'parallel computing'], 921: [2, 'distributed computing'], 917: [2, 'relational databases'], 913: [2, 'distributed computing'], 910: [2, 'distributed computing'], 905: [2, 'network security'], 901: [2, 'distributed computing'], 900: [2, 'distributed computing'], 897: [2, 'distributed computing'], 877: [2, 'symbolic computation'], 792: [2, 'operating systems'], 788: [2, 'distributed computing'], 784: [2, 'distributed computing'], 782: [2, 'relational databases'], 779: [2, 'distributed computing'], 765: [2, 'cryptography'], 720: [2, 'relational databases']