In [1]:
import json
import re
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Top-level AI topic names (not referenced by the cells visible in this notebook).
AITopLevelTopics = [
    'Artificial intelligence',
    'Computer vision',
    'Data mining',
    'Data science',
    'Machine learning',
    'Natural language processing',
    'Pattern recognition',
    'Speech recognition',
]

# Read the thresholded paper list and keep a set of its entries.
# NOTE(review): presumably a JSON array of paper IDs - confirm against the file.
with open("./data/dblpPaperIDs2Thresholded.json", 'r') as idFile:
    paperList = json.load(idFile)
papersUnderConsideration = set(paperList)

In [None]:
# Load the pre-trained fastText binary model from the working directory.
# NOTE(review): the file name suggests the 300-d, 2M-word subword Common Crawl vectors - confirm.
import fasttext 
fasttextModel = fasttext.load_model('crawl-300d-2M-subword.bin')

In [None]:
import string
# Translation table for str.translate() that deletes every character listed in
# string.punctuation (the third argument of str.maketrans names characters to remove).
translator = str.maketrans('', '', string.punctuation) 

In [None]:
# Embed each paper's abstract with fastText and stream the result to a JSON-lines file.
# Every output line is {"id": ..., "embedding": [...]}; wrap the list in np.asarray when reading it back.
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile, \
        open('./data/dblpAbstract_2Thresholded_FT_Embeddings.json', 'w') as embeddingFile:
    for rawLine in tqdm(paperFile):
        paper = json.loads(rawLine)
        paperID = paper['id']
        # 'abstract' is joined item by item with single spaces; a missing abstract yields ''.
        abstractText = ' '.join(paper.get('abstract', []))
        # Flatten line breaks before the text is handed to get_sentence_vector.
        abstractText = abstractText.replace('\n', ' ').replace('\r', '')
        vector = fasttextModel.get_sentence_vector(abstractText)
        embeddingFile.write(json.dumps({'id': paperID, 'embedding': vector.tolist()}))
        embeddingFile.write('\n')

In [None]:
# Embed each paper's title with fastText and stream the result to a JSON-lines file.
# Every output line is {"id": ..., "embedding": [...]}; wrap the list in np.asarray when reading it back.
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile, \
        open('./data/dblpTitle_2Thresholded_FT_Embeddings.json', 'w') as embeddingFile:
    for rawLine in tqdm(paperFile):
        paper = json.loads(rawLine)
        paperID = paper['id']
        # A missing title is embedded as the empty string; line breaks are flattened first.
        titleText = paper.get('title', '').replace('\n', ' ').replace('\r', '')
        vector = fasttextModel.get_sentence_vector(titleText)
        embeddingFile.write(json.dumps({'id': paperID, 'embedding': vector.tolist()}))
        embeddingFile.write('\n')

## Universal Sentence Encoder (USE) Embeddings

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# Local module directory passed to hub.load; the #@param list records hub URLs offered as alternatives.
# NOTE(review): presumably a downloaded copy of one of those universal-sentence-encoder modules - confirm.
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
def embed(inputText):
    """Call the loaded hub model on ``inputText`` and return its output unchanged.

    The cells in this notebook pass a list of strings and call ``.numpy()``
    on the returned value.
    """
    return model(inputText)

In [None]:
# Embed each abstract with the hub model, one paper per call, and write JSON lines
# of the form {"id": ..., "embedding": [...]}.
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile, \
        open('./data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json', 'w') as embeddingFile:
    for rawLine in paperFile:
        paper = json.loads(rawLine)
        paperID = paper['id']
        # 'abstract' is joined item by item with single spaces; a missing abstract yields ''.
        abstractText = ' '.join(paper.get('abstract', []))
        abstractText = abstractText.replace('\n', ' ').replace('\r', '')
        # embed() receives a one-element batch, so take result [0] before converting.
        vector = embed([abstractText])[0].numpy()
        embeddingFile.write(json.dumps({'id': paperID, 'embedding': vector.tolist()}))
        embeddingFile.write('\n')

In [None]:
# Collect every paper's title (line breaks flattened) alongside its ID, in file order.
records, paperIDs = [], []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile:
    for rawLine in tqdm(paperFile):
        paper = json.loads(rawLine)
        paperID = paper['id']
        # A missing title is stored as the empty string.
        cleanTitle = paper.get('title', '').replace('\n', ' ').replace('\r', '')
        records.append(cleanTitle)
        paperIDs.append(paperID)
# Both lists are indexed in parallel by the cells that follow.
assert len(records) == len(paperIDs)

In [None]:
n = 100     # block size: number of records per batch

# `records` / `paperIDs` are rebound to lists of batches below, so re-running this cell
# used to batch the batches (and embed() would then receive nested lists). If they are
# already batched, flatten them first; this keeps the cell re-runnable and lets a
# changed `n` take effect.
if records and isinstance(records[0], list):
    records = [text for batch in records for text in batch]
    paperIDs = [pid for batch in paperIDs for pid in batch]

# Split both lists into consecutive slices of at most n items (the last may be shorter).
records = [records[i:i + n] for i in range(0, len(records), n)]
paperIDs = [paperIDs[i:i + n] for i in range(0, len(paperIDs), n)]

In [None]:
count = 0
# Embed the title batches and write one {"id": ..., "embedding": [...]} JSON object per line.
with open('./data/dblp_Title_2Thresholded_USE_Trans_Embeddings.json', 'w') as outfile:
    for batchIdx in tqdm(range(len(records))):
        idBatch = paperIDs[batchIdx]
        # One model call per batch; .numpy().tolist() gives one plain list per input text.
        vectorBatch = embed(records[batchIdx]).numpy().tolist()
        outfile.writelines(
            json.dumps({'id': pid, 'embedding': vec}) + '\n'
            for vec, pid in zip(vectorBatch, idBatch)
        )



In [None]:
# Build one text per paper of the form "<title>. <abstract>" plus the parallel list of IDs.
records, paperIDs = [], []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile:
    for rawLine in tqdm(paperFile):
        paper = json.loads(rawLine)
        paperID = paper['id']
        # 'abstract' is joined item by item with single spaces; a missing abstract yields ''.
        abstractText = ' '.join(paper.get('abstract', []))
        abstractText = abstractText.replace('\n', ' ').replace('\r', '')
        cleanTitle = paper.get('title', '').replace('\n', ' ').replace('\r', '')
        records.append(cleanTitle + '. ' + abstractText)
        paperIDs.append(paperID)
# Both lists are indexed in parallel by the cells that follow.
assert len(records) == len(paperIDs)

In [None]:
n = 100     # block size: number of records per batch

# `records` / `paperIDs` are rebound to lists of batches below, so re-running this cell
# used to batch the batches (and embed() would then receive nested lists). If they are
# already batched, flatten them first; this keeps the cell re-runnable and lets a
# changed `n` take effect.
if records and isinstance(records[0], list):
    records = [text for batch in records for text in batch]
    paperIDs = [pid for batch in paperIDs for pid in batch]

# Split both lists into consecutive slices of at most n items (the last may be shorter).
records = [records[i:i + n] for i in range(0, len(records), n)]
paperIDs = [paperIDs[i:i + n] for i in range(0, len(paperIDs), n)]

In [None]:
count = 0
# Embed the record batches and write one {"id": ..., "embedding": [...]} JSON object per line.
# NOTE(review): this is the same output path the unbatched, abstract-only USE cell writes;
# whichever of the two runs last determines the file's contents - confirm this is intended.
with open('./data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json', 'w') as outfile:
    for batchIdx in tqdm(range(len(records))):
        idBatch = paperIDs[batchIdx]
        # One model call per batch; .numpy().tolist() gives one plain list per input text.
        vectorBatch = embed(records[batchIdx]).numpy().tolist()
        outfile.writelines(
            json.dumps({'id': pid, 'embedding': vec}) + '\n'
            for vec, pid in zip(vectorBatch, idBatch)
        )



## TF-IDF Vectorizer

In [2]:
# Build the TF-IDF corpus: one "<title>. <abstract>" string per paper, plus the parallel ID list.
records, paperIDs = [], []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile:
    for rawLine in tqdm(paperFile):
        paper = json.loads(rawLine)
        paperID = paper['id']
        # 'abstract' is joined item by item with single spaces; a missing abstract yields ''.
        abstractText = ' '.join(paper.get('abstract', []))
        abstractText = abstractText.replace('\n', ' ').replace('\r', '')
        cleanTitle = paper.get('title', '').replace('\n', ' ').replace('\r', '')
        records.append(cleanTitle + '. ' + abstractText)
        paperIDs.append(paperID)
# Row i of any matrix built from `records` must correspond to paperIDs[i].
assert len(records) == len(paperIDs)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [21]:
## Porter-stem every whitespace-separated token of each record.
## Tokens keep any attached punctuation here; it is left for the Tf-Idf vectorizer to drop.
from nltk.stem import PorterStemmer

st = PorterStemmer()
stemmedRecords = []
for record in tqdm(records):
    stemmedTokens = map(st.stem, record.split())
    stemmedRecords.append(" ".join(stemmedTokens))


HBox(children=(FloatProgress(value=0.0, max=475839.0), HTML(value='')))




In [29]:
import nltk
# Download the WordNet corpus into the local nltk_data directory; the WordNetLemmatizer
# used in the next cell reads from it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [30]:
from nltk.stem import WordNetLemmatizer

# Lemmatize every whitespace-separated token of each record (default part of speech),
# then re-join the tokens with single spaces. Tokens are passed as-is, so any attached
# punctuation or capitalisation is still present when the lemmatizer sees them.
lemmatizer = WordNetLemmatizer()
lemRecords = []
for record in tqdm(records):
    lemmaTokens = map(lemmatizer.lemmatize, record.split())
    lemRecords.append(" ".join(lemmaTokens))

HBox(children=(FloatProgress(value=0.0, max=475839.0), HTML(value='')))




In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary limits shared by the TF-IDF vectorizers in this notebook.
max_features = 2000   # cap on vocabulary size (most frequent terms are kept)
max_df = 1.0          # as a proportion: no upper document-frequency cut-off
min_df = 3            # as a count: drop terms that occur in fewer than 3 documents

# Unigram TF-IDF: lower-cased, accent-stripped word tokens with English stop words removed.
v = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='replace',
    strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    stop_words='english',
    # A token is a letter/underscore followed by one or more letters, digits, '_' or '-'.
    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_-]+\b',
    ngram_range=(1, 1),
    max_features=max_features,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_df=max_df,
    min_df=min_df,
)

In [32]:
# Learn the vocabulary and IDF weights from the lemmatized records and return the
# TF-IDF document-term matrix (one row per record).
X = v.fit_transform(lemRecords)


In [36]:
# Downcast to float32 and convert to a dense matrix so rows can be dumped as JSON lists.
# The dense form stores every (document, feature) cell, so memory scales with
# number of records * number of features.
X = X.astype(np.float32).todense()

In [45]:
# Write the TF-IDF rows as JSON lines of the form {"id": ..., "embedding": [...]}.
with open('./data/dblpAbstract_2Thresholded_TfIdfUni_Embeddings.json', 'w') as outfile:
    for rowIdx, paperID in enumerate(paperIDs):
        # With X as the dense matrix built above, X[rowIdx] is a single-row matrix whose
        # tolist() is [[...]]; [0] unwraps it to the flat feature list.
        rowVector = X[rowIdx].tolist()[0]
        outfile.write(json.dumps({'id': paperID, 'embedding': rowVector}) + '\n')

In [46]:
# Same configuration as the unigram vectorizer except ngram_range=(1, 2), so the
# vocabulary may contain both single words and two-word sequences.
v = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='replace',
    strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    stop_words='english',
    # A token is a letter/underscore followed by one or more letters, digits, '_' or '-'.
    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_-]+\b',
    ngram_range=(1, 2),
    max_features=max_features,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_df=max_df,
    min_df=min_df,
)

# Fit on the lemmatized records, then downcast to float32 and densify in one step.
X = v.fit_transform(lemRecords).astype(np.float32).todense()

In [48]:
# Persist the TF-IDF vectors from the ngram_range=(1, 2) fit as JSON lines of the
# form {"id": ..., "embedding": [...]}.
#
# This cell previously reused the unigram cell's path
# (./data/dblpAbstract_2Thresholded_TfIdfUni_Embeddings.json), so the uni+bigram vectors
# silently overwrote the unigram file. They now go to their own "TfIdfBi" file.

# Row i of X must describe paperIDs[i]; fail loudly if the two have drifted apart.
assert len(X) == len(paperIDs), "X and paperIDs are out of sync"

with open('./data/dblpAbstract_2Thresholded_TfIdfBi_Embeddings.json', 'w') as outfile:
    for i in tqdm(range(len(paperIDs))):
        outDict = dict()
        paperID = paperIDs[i]
        # X[i] is a single-row matrix; tolist() gives [[...]] and [0] unwraps the row.
        embedding = X[i].tolist()[0]
        outDict['id'] = paperID
        outDict['embedding'] = embedding
        json.dump(outDict, outfile)
        outfile.write('\n')

HBox(children=(FloatProgress(value=0.0, max=475839.0), HTML(value='')))




In [None]:
# Convert X into nested Python lists (one inner list per document), assuming X is
# still the dense matrix produced by the cells above.
# NOTE(review): afterwards X is a plain list, so matrix-only operations no longer apply
# to it, and every value is duplicated as a Python object - confirm this cell is still needed.
X = X.tolist()

In [41]:
# Extract the first document's vector as a flat Python list.
# np.asarray(...).ravel() copes with both forms X can take at this point: the dense
# matrix (X[0] is a single-row 2-D matrix) and the nested lists left by `X = X.tolist()`
# (X[0] is already a flat list, which has no .tolist(), so X[0].tolist()[0] raised
# AttributeError in that case).
rec = np.asarray(X[0]).ravel().tolist()

In [42]:
# Sanity check: show the Python type of the extracted row.
type(rec)

list

In [47]:
 # Show the fitted vocabulary.
 # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
 # get_feature_names_out() when the installed version provides it and fall back otherwise.
 # .tolist() keeps the printed form a plain list of str rather than an ndarray repr.
 featureNames = (v.get_feature_names_out().tolist()
                 if hasattr(v, 'get_feature_names_out')
                 else v.get_feature_names())
 print(featureNames)

['ability', 'able', 'absolute', 'abstract', 'abstract paper', 'abstraction', 'access', 'according', 'account', 'accuracy', 'accurate', 'accurately', 'achieve', 'achieved', 'achieves', 'achieving', 'acoustic', 'acquired', 'acquisition', 'act', 'action', 'actions', 'activation', 'active', 'activities', 'activity', 'actual', 'adapt', 'adaptation', 'adapted', 'adaptive', 'adaptively', 'add', 'added', 'adding', 'addition', 'additional', 'additionally', 'address', 'address problem', 'addressed', 'adopt', 'adopted', 'advance', 'advanced', 'advantage', 'affect', 'affected', 'age', 'agent', 'agents', 'aggregate', 'aggregation', 'agreement', 'ai', 'aid', 'aim', 'aimed', 'al', 'algorithm', 'algorithm based', 'algorithm proposed', 'algorithmic', 'algorithms', 'alignment', 'allocation', 'allow', 'allowing', 'allows', 'alternative', 'ambiguity', 'analysis', 'analytical', 'analytics', 'analyze', 'analyzed', 'analyzing', 'angle', 'animation', 'annotated', 'annotation', 'anomaly', 'answer', 'answering'

In [35]:
# Spot-check the lemmatizer on a single lower-case plural noun.
lemmatizer.lemmatize('regions')

'region'

In [2]:
# Record the paper IDs in the order they appear in the source file, so embedding rows
# written elsewhere can be matched back to papers by position.
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile:
    paperIDs = [json.loads(rawLine)['id'] for rawLine in tqdm(paperFile)]

# Store the ordered ID list as a single JSON array.
with open("./data/orderedPaperIDs.json", 'w') as idFile:
    idFile.write(json.dumps(paperIDs))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


