In [1]:
from google.colab import drive
# drive.mount('/content/gdrive')
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install jsonpickle

Collecting jsonpickle
  Downloading https://files.pythonhosted.org/packages/bb/1a/f2db026d4d682303793559f1c2bb425ba3ec0d6fd7ac63397790443f2461/jsonpickle-2.0.0-py2.py3-none-any.whl
Installing collected packages: jsonpickle
Successfully installed jsonpickle-2.0.0


In [3]:
import re
import os
import glob
import json
import jsonpickle 
import math
import nltk
import pandas as pd
import pathlib
import numpy as np # linear algebra
from itertools import combinations 
from collections import defaultdict 
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from scipy import spatial
from nltk.metrics import edit_distance
from collections import defaultdict 

In [4]:
nltk.download('stopwords')  
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
def tokenize(q1, q2):
    """
        q1 and q2 are sentences/questions. Function returns a list of tokens for both.
    """
    return word_tokenize(q1), word_tokenize(q2)


def posTag(q1, q2):
    """
        q1 and q2 are lists. Function returns a list of POS tagged tokens for both.
    """
    return nltk.pos_tag(q1), nltk.pos_tag(q2)


def stemmer(tag_q1, tag_q2):
    """
        tag_q = tagged lists. Function returns a stemmed list.
    """

    stem_q1 = []
    stem_q2 = []

    for token in tag_q1:
        stem_q1.append(stem(token))

    for token in tag_q2:
        stem_q2.append(stem(token))

    return stem_q1, stem_q2

In [6]:
class Lesk(object):

    def __init__(self, sentence):
        self.sentence = sentence
        self.meanings = {}
        for word in sentence:
            self.meanings[word] = ''

    def getSenses(self, word):
        # print word
        return wn.synsets(word.lower())

    def getGloss(self, senses):

        gloss = {}

        for sense in senses:
            gloss[sense.name()] = []

        for sense in senses:
            gloss[sense.name()] += word_tokenize(sense.definition())

        return gloss

    def getAll(self, word):
        senses = self.getSenses(word)

        if senses == []:
            return {word.lower(): senses}

        return self.getGloss(senses)

    def Score(self, set1, set2):
        # Base
        overlap = 0

        # Step
        for word in set1:
            if word in set2:
                overlap += 1

        return overlap

    def overlapScore(self, word1, word2):

        gloss_set1 = self.getAll(word1)
        if self.meanings[word2] == '':
            gloss_set2 = self.getAll(word2)
        else:
            # print 'here'
            gloss_set2 = self.getGloss([wn.synset(self.meanings[word2])])

        # print gloss_set2

        score = {}
        for i in gloss_set1.keys():
            score[i] = 0
            for j in gloss_set2.keys():
                score[i] += self.Score(gloss_set1[i], gloss_set2[j])

        bestSense = None
        max_score = 0
        for i in gloss_set1.keys():
            if score[i] > max_score:
                max_score = score[i]
                bestSense = i

        return bestSense, max_score

    def lesk(self, word, sentence):
        maxOverlap = 0
        context = sentence
        word_sense = []
        meaning = {}

        senses = self.getSenses(word)

        for sense in senses:
            meaning[sense.name()] = 0

        for word_context in context:
            if not word == word_context:
                score = self.overlapScore(word, word_context)
                if score[0] == None:
                    continue
                meaning[score[0]] += score[1]

        if senses == []:
            return word, None, None

        self.meanings[word] = max(meaning.keys(), key=lambda x: meaning[x])

        return word, self.meanings[word], wn.synset(self.meanings[word]).definition()

In [14]:
def path(set1, set2):
    return wn.path_similarity(set1, set2)


def wup(set1, set2):
    return wn.wup_similarity(set1, set2)


def edit(word1, word2):
    if float(edit_distance(word1, word2)) == 0.0:
        return 0.0
    return 1.0 / float(edit_distance(word1, word2))

def computePath(q1, q2):

    R = np.zeros((len(q1), len(q2)))

    for i in range(len(q1)):
        for j in range(len(q2)):
            if q1[i][1] == None or q2[j][1] == None:
                sim = edit(q1[i][0], q2[j][0])
            else:
                sim = path(wn.synset(q1[i][1]), wn.synset(q2[j][1]))

            if sim == None:
                sim = edit(q1[i][0], q2[j][0])

            R[i, j] = sim

    # print R

    return R

def computeWup(q1, q2):

    R = np.zeros((len(q1), len(q2)))

    for i in range(len(q1)):
        for j in range(len(q2)):
            if q1[i][1] == None or q2[j][1] == None:
                sim = edit(q1[i][0], q2[j][0])
            else:
                sim = wup(wn.synset(q1[i][1]), wn.synset(q2[j][1]))

            if sim == None:
                sim = edit(q1[i][0], q2[j][0])

            R[i, j] = sim

    # print R

    return R

def overallSim(q1, q2, R):

    sum_X = 0.0
    sum_Y = 0.0

    for i in range(len(q1)):
        max_i = 0.0
        for j in range(len(q2)):
            if R[i, j] > max_i:
                max_i = R[i, j]
        sum_X += max_i

    for i in range(len(q1)):
        max_j = 0.0
        for j in range(len(q2)):
            if R[i, j] > max_j:
                max_j = R[i, j]
        sum_Y += max_j
        
    if (float(len(q1)) + float(len(q2))) == 0.0:
        return 0.0
        
    overall = (sum_X + sum_Y) / (2 * (float(len(q1)) + float(len(q2))))

    return overall

STOP_WORDS = nltk.corpus.stopwords.words()
def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")

    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)

    sentence = " ".join(sentence)
    return sentence

In [15]:
def semanticSimilarity(q1, q2):

    tokens_q1, tokens_q2 = tokenize(q1, q2)
    # stem_q1, stem_q2 = stemmer(tokens_q1, tokens_q2)
    tag_q1, tag_q2 = posTag(tokens_q1, tokens_q2)

    sentence = []
    for i, word in enumerate(tag_q1):
        if 'NN' in word[1] or 'JJ' in word[1] or 'VB' in word[1]:
            sentence.append(word[0])

    sense1 = Lesk(sentence)
    sentence1Means = []
    for word in sentence:
        sentence1Means.append(sense1.lesk(word, sentence))

    sentence = []
    for i, word in enumerate(tag_q2):
        if 'NN' in word[1] or 'JJ' in word[1] or 'VB' in word[1]:
            sentence.append(word[0])

    sense2 = Lesk(sentence)
    sentence2Means = []
    for word in sentence:
        sentence2Means.append(sense2.lesk(word, sentence))
    # for i, word in enumerate(sentence1Means):
    #     print sentence1Means[i][0], sentence2Means[i][0]

    R1 = computePath(sentence1Means, sentence2Means)
    R2 = computeWup(sentence1Means, sentence2Means)

    R = (R1 + R2) / 2

    # print R

    return overallSim(sentence1Means, sentence2Means, R)

In [9]:
#Node structure for graph
class Node:

    def __init__(self,src,dest,wt):
        self.src = src
        self.dest = dest
        self.wt = wt


#Class to represent an un-directed graph using adjacency list representation 
class Graph: 
   
    def __init__(self,vertices): 
        self.V = vertices #No. of vertices 
        self.V_org = vertices 
        self.graph = defaultdict(list) # default dictionary to store graph 
        
        
    # function to add an edge to graph 
    def addEdge(self,u,v,w): 
        self.graph[u].append(Node(u,v,w))
        self.graph[v].append(Node(v,u,w))

        
    #function to print graph
    def printGraph(self):
        s = ""
        for i in self.graph:
            s = s + str(i) + " is connected to "
            print(str(i) + " is connected to ")
            for node in self.graph[i]:
                s = s + str(node.dest) + "(Weight = " + str(node.wt) + ")" + " "
                print(str(node.dest) + "(Weight = " + str(node.wt) + ")" + " ")
            s = s + "\n"
            print("\n")
        return s
    

    #function to get BFS results for a given node till the given level
    def BFS(self, s, max_levels):
        visited = set()
 
        queue = []
 
        queue.append(s)
        visited.add(s)
        level = 0
        result = {}
        while queue:
            aux = []
            result[level] = []
            
            while queue:
                s = queue.pop(0)
                visited.add(s)
                result[level].append(s)
                for node in self.graph[s]:
                    if node.dest not in visited:
                        aux.append(node.dest)
                        visited.add(node.dest)
            level += 1
            if level > max_levels:
                break
            for node in aux:
                queue.append(node)
            
        return result   



    def BFSi(self, s, max_levels):
        visited = set()
 
        queue = []
 
        queue.append((s,0,0,1))
        visited.add(s)
        level = 0
        result = {}
        while queue:
            aux = []
            result[level] = []
            
            while queue:
                s = queue.pop(0)
                visited.add(s[0])
                result[level].append(s)
                for node in self.graph[s[0]]:
                    if node.dest not in visited:
                        
#                         Wordnet Similarity
                        q1 = clean_sentence(s[0])
                        q2 = clean_sentence(node.dest)
                        sim = 0
                        sim = semanticSimilarity(q1, q2)

                        sumOfCooccurence = 0
                        for chi in self.graph[node.dest]:
                            if chi.dest in visited:
                                sumOfCooccurence += chi.wt
                        aux.append((node.dest, sumOfCooccurence, level+1, sim))        
                        visited.add(node.dest)
            level += 1
            if level > max_levels:
                break
            for node in aux:
                queue.append(node)
            solution = []
            for key in result:
                for tup in result[key]:
                    if tup[2] != 0 and (tup[1] + np.exp(tup[3]))/np.exp(tup[2]) >= 2:
                        solution.append( ( tup[0], (tup[1] + np.exp(tup[3]))/np.exp(tup[2]) ) )
        return solution            
    
    def export_network(self, filename = "output"):
        filename += ".json"
        obj = jsonpickle.encode(self.graph)
        with open(filename, "w") as outfile: 
            json.dump(obj, outfile)

    def import_network(self, filename = "output"):
        filename += ".json"
        with open(filename) as json_file:
            data = json.load(json_file)
            self.graph = jsonpickle.decode(data)
            self.V = len(self.graph)
            self.V_org = len(self.graph)



class DataHandler:
    def __init__(self, path):
        self.type = pathlib.Path(path).suffix
        self.dataset_location = path
        
#         Use the pandas dataframe to get columns 
        if self.type == '.csv':
#             self.df = pd.read_csv(location, nrows=100)
            self.df = pd.read_csv(path)
            self.df = self.df[self.df['Domain'] == "CS "]
        elif self.type == '.xlsx':
            self.df = pd.read_excel(path)
            self.df = self.df[self.df['Domain'] == "CS "]

    def get_dataframe(self):
        return self.df

            
class keywordNodeNetGen:
    
    def __init__(self, path):
        self.path = path
        self.subsetSize = 2
        self.keysPair = []
        self.keywordsList = []
        self.dictKeywords = {}
        self.dictKeyVsPid = {}
        self.idList = []
        self.colName = []
    #     self.wbObj = openpyxl.load_workbook(self.path)
    #     self.sheetObj = self.wbObj.active    

    # #function to extract list of column headers from .xlsx
    # def extractListCol(self):
    #     maxCol = self.sheetObj.max_column    
    #     # Loop will print all columns name 
    #     for i in range(1, maxCol + 1): 
    #         cellObj = self.sheetObj.cell(row = 1, column = i) 
    #         self.colName.append(cellObj.value) 
    

    # #function to extract column index from its header
    # def extractColNumber(self, strIn):
    #     self.extractListCol()
    #     for x in self.colName:
    #         if x == strIn:
    #             colIndex = self.colName.index(x)+1
    #     return colIndex



    #function to extract list of keywords in all the research papers
    def extractDictOfKeywords(self):
        dh = DataHandler(self.path)
        df = dh.get_dataframe()
        # maxRow = 1000
        # keyIndex = self.extractColNumber('keywords')
        rowIndices = df.index.tolist()
        i=0
        # for rowIndex in df.index.tolist()
        for keyList in df['Yake keywords']:
            if(i<2000): 
                self.dictKeywords[rowIndices[i]] = [keyword.strip().lower() for keyword in re.split(";", keyList) if keyword]
                i=i+1

        return self.dictKeywords


    #function to create list of keywords in all papers
    def generateKeywordsList(self):
        self.dictKeywords = self.extractDictOfKeywords()
        # print(len(self.dictKeywords))
        self.keywordsList = list(set().union(*self.dictKeywords.values()))
        return self.keywordsList
    
    
    #function to find combinations of any 2 keywords
    def rSubset(self): 
        self.keywordsList = self.generateKeywordsList()
        self.keysPair = list(combinations(self.keywordsList, self.subsetSize)) 
        return self.keysPair
    
    
    #function to extract weight
    def pathWeight(self, x, y, dictKeywords, idList):
        edgeWeight = 0
        for idNo in idList:
            if(x in dictKeywords[idNo] and y in dictKeywords[idNo]):
                edgeWeight = edgeWeight + 1
        return edgeWeight
    
    
    #function to generate dictionary having keywords as keys and list of paper ids as value list 
    def generateDictKeyVsPid(self):
        self.keywordsList = self.generateKeywordsList()
        self.idList = self.dictKeywords.keys()
        self.dictKeyVsPid = {key:[pid for pid in self.idList if key in self.dictKeywords[pid]] for key in self.keywordsList}
        return self.dictKeyVsPid

In [10]:
dh = DataHandler('drive/My Drive/Colab Notebooks/Data.xlsx')
dataset = dh.get_dataframe()
dataset = dataset.reset_index()

In [11]:
# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and rake extracted keywords from abstract
def predict_rake(input_keywords, common_words_count):
    res = {}
    ran = len(dataset['Rake keywords'])
    for i in range(1000):
        val = dataset['Rake keywords'][i]
        keywords = val.split(";")
        com = calc_common(input_keywords, keywords)
        if com >= common_words_count:
            res[i] = com
    res = dict(sorted(res.items(), key = lambda kv:(kv[1], kv[0]), reverse = True))
    for key in res:
        res[key] = [res[key], dataset['area'][key].strip().lower()]
    return res

# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and yake extracted keywords from abstract
def predict_yake(input_keywords, common_words_count):
    res = {}
    ran = len(dataset['Yake keywords'])
    for i in range(1000):
        val = dataset['Yake keywords'][i]
        keywords = val.split(";")
        com = calc_common(input_keywords, keywords)
        if com >= common_words_count:
            res[i] = com
    res = dict(sorted(res.items(), key = lambda kv:(kv[1], kv[0]), reverse = True))
    for key in res:
        res[key] = [res[key], dataset['area'][key].strip().lower()]
    return res

# Gives sorted dictionary result of papers and number of matching keywords 
# between input keywords and author labelled keywords from abstract
def predict_author_labelled(input_keywords, common_words_count):
    res = {}
    ran = len(dataset['keywords'])
    for i in range(1000):
        val = dataset['keywords'][i]
        keywords = val.split(";")
        com = calc_common(input_keywords, keywords)
        if com >= common_words_count:
            res[i] = com
    res = dict(sorted(res.items(), key = lambda kv:(kv[1], kv[0]), reverse = True))
    for key in res:
        res[key] = [res[key], dataset['area'][key].strip().lower()]
    return res

def calc_common(input_keywords, keywords):
    cou = 0
    for w1 in input_keywords:
        for w2 in keywords:
            if(w1.strip().lower() == w2.strip().lower()):
                cou+=1
    return cou

In [12]:
ga1 = Graph(10000)
ga1.import_network("drive/My Drive/Colab Notebooks/author/output")

ga2 = Graph(10000)
ga2.import_network("drive/My Drive/Colab Notebooks/author/output2k")

gr1 = Graph(10000)
gr1.import_network("drive/My Drive/Colab Notebooks/rake/output")

gr2 = Graph(10000)
gr2.import_network("drive/My Drive/Colab Notebooks/rake/output2k")

gy1 = Graph(10000)
gy1.import_network("drive/My Drive/Colab Notebooks/yake/output")

gy2 = Graph(10000)
gy2.import_network("drive/My Drive/Colab Notebooks/yake/output2k")

In [16]:
queryKeyAuthorList1 = ['cloud computing', 'computer']
querySetAuthor1 = []
for queryKeyAuthor1 in queryKeyAuthorList1:
  queryKeyAuthor1 = queryKeyAuthor1.strip().lower()
  queryDictAuthor1 = ga1.BFSi(queryKeyAuthor1, 3)
  querySetAuthor1.extend([queryKeyAuthor1])
  querySetAuthor1.extend([item[0] for item in queryDictAuthor1])

queryKeyAuthorList2 = ['cloud computing', 'computer']
querySetAuthor2 = []
for queryKeyAuthor2 in queryKeyAuthorList2:
  queryKeyAuthor2 = queryKeyAuthor2.strip().lower()
  queryDictAuthor2 = ga2.BFSi(queryKeyAuthor2, 3)
  querySetAuthor2.extend([queryKeyAuthor2])
  querySetAuthor2.extend([item[0] for item in queryDictAuthor2])

queryKeyYakeList1 = ['cloud computing', 'computer']
querySetYake1 = []
for queryKeyYake1 in queryKeyYakeList1:
  queryKeyYake1 = queryKeyYake1.strip().lower()
  queryDictYake1 = gy1.BFSi(queryKeyYake1, 3)
  querySetYake1.extend([queryKeyYake1])
  querySetYake1.extend([item[0] for item in queryDictYake1])

queryKeyYakeList2 = ['cloud computing', 'computer']
querySetYake2 = []
for queryKeyYake2 in queryKeyYakeList2:
  queryKeyYake2 = queryKeyYake2.strip().lower()
  queryDictYake2 = gy2.BFSi(queryKeyYake2, 3)
  querySetYake2.extend([queryKeyYake2])
  querySetYake2.extend([item[0] for item in queryDictYake2])

queryKeyRakeList1 = ['cloud computing', 'computer']
querySetRake1 = []
for queryKeyRake1 in queryKeyRakeList1:
  queryKeyRake1 = queryKeyRake1.strip().lower()
  queryDictRake1 = gr1.BFSi(queryKeyRake1, 3)
  querySetRake1.extend([queryKeyRake1])
  querySetRake1.extend([item[0] for item in queryDictRake1])

queryKeyRakeList2 = ['cloud computing', 'computer']
querySetRake2 = []
for queryKeyRake2 in queryKeyRakeList2:
  queryKeyRake2 = queryKeyRake2.strip().lower()
  queryDictRake2 = gr2.BFSi(queryKeyRake2, 3)
  querySetRake2.extend([queryKeyRake2])
  querySetRake2.extend([item[0] for item in queryDictRake2])

In [18]:
print(querySetAuthor1)

print(querySetAuthor2)

print(querySetYake1)

print(querySetYake2)

print(querySetRake1)

print(querySetRake2)

['cloud computing', 'security', 'privacy', 'computer']
['cloud computing', 'hadoop', 'distributed computing', 'security', 'privacy', 'big data', 'optimization', 'computer']
['cloud computing', 'cloud', 'information technology', 'computing', 'computer', 'graphics']
['cloud computing', 'cloud', 'distributed computing', 'information technology', 'computing', 'computer', 'programming', 'graphics', 'computer programming', 'computer science']
['cloud computing', 'cloud', 'computing', 'computer', 'computer graphics']
['cloud computing', 'cloud', 'information technology', 'distributed computing', 'computing', 'computer', 'computer programming', 'computer science', 'graphics', 'programming']


In [19]:
resultAuthor1 = predict_author_labelled(querySetAuthor1, 3)

resultAuthor2 = predict_author_labelled(querySetAuthor2, 3)

resultYake1 = predict_yake(querySetYake1, 3)

resultYake2 = predict_yake(querySetYake2, 3)

resultRake1 = predict_rake(querySetRake1, 3)

resultRake2 = predict_rake(querySetRake2, 3)

In [20]:
print(resultAuthor1)

print(resultAuthor2)

print(resultYake1)

print(resultYake2)

print(resultRake1)

print(resultRake2)

{902: [3, 'distributed computing'], 326: [3, 'image processing']}
{902: [3, 'distributed computing'], 786: [3, 'distributed computing'], 326: [3, 'image processing']}
{911: [4, 'network security'], 902: [4, 'distributed computing'], 912: [3, 'distributed computing'], 909: [3, 'distributed computing'], 907: [3, 'distributed computing'], 905: [3, 'network security'], 895: [3, 'distributed computing']}
{911: [4, 'network security'], 902: [4, 'distributed computing'], 912: [3, 'distributed computing'], 909: [3, 'distributed computing'], 907: [3, 'distributed computing'], 905: [3, 'network security'], 900: [3, 'distributed computing'], 895: [3, 'distributed computing']}
{912: [3, 'distributed computing'], 911: [3, 'network security'], 909: [3, 'distributed computing'], 905: [3, 'network security'], 902: [3, 'distributed computing'], 895: [3, 'distributed computing']}
{911: [4, 'network security'], 902: [4, 'distributed computing'], 912: [3, 'distributed computing'], 909: [3, 'distributed co