In [10]:
import json
import numpy as np
from collections import defaultdict,Counter
from sklearn import preprocessing

In [11]:
def average(lis):
    """Return the arithmetic mean of the numbers in ``lis``.

    Args:
        lis: Any iterable of numbers. Sized containers (lists, tuples, ...)
            are used as-is; one-shot iterables such as generators are
            materialised first so they can be both summed and counted.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If ``lis`` contains no values.
    """
    # Generators have no len(); copy them into a list only when necessary.
    values = lis if hasattr(lis, '__len__') else list(lis)
    return sum(values) / len(values)

In [12]:
# Field-of-study names of interest, mapped to the coarse class they count towards.
fosToLabel = {
    'Data mining': 'Data',
    'Data science': 'Data',
    'Natural language processing': 'NLP',
    'Speech recognition': 'NLP',
    'Computer vision': 'CV',
}

# Parallel lists: IDList[i] is the paper id whose class is labelList[i].
IDList = []
labelList = []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in file:
        # The file is read as JSON lines: one paper record per line.
        data = json.loads(line)
        # Largest weight seen for each class that occurs in this paper.
        # Every class (including 'CV') is aggregated with max so that a
        # repeated field-of-study entry cannot overwrite a larger weight.
        labelWeights = defaultdict(int)
        for fos in data.get('fos', []):
            label = fosToLabel.get(fos['name'])
            if label is not None:
                labelWeights[label] = max(labelWeights[label], fos['w'])
        # Papers with none of the fields of interest are skipped entirely.
        if labelWeights:
            IDList.append(data['id'])
            # max() returns the first maximal item, i.e. on a tie the class
            # that appeared first in the paper's 'fos' list wins.
            bestLabel, _ = max(labelWeights.items(), key=lambda item: item[1])
            labelList.append(bestLabel)
assert len(labelList) == len(IDList)

In [13]:
# Tally how many papers were assigned to each class label; the bare
# expression on the last line displays the tally as the cell output.
counter = Counter(labelList)
counter

Counter({'CV': 142790, 'NLP': 57416, 'Data': 117215})

In [None]:
## Load embeddings as X
embeddingFileName = './data/dblpAbstract_2Thresholded_FT_Embeddings.json'
# Only the labelled papers are needed, so skip every other record instead of
# holding the whole embedding file in memory.
wantedIDs = set(IDList)
embeddingDict = dict()
with open(embeddingFileName, 'r') as file:
    for line in file:
        # One JSON record per line; 'id' and 'embedding' are the keys read here.
        data = json.loads(line)
        if data['id'] in wantedIDs:
            embeddingDict[data['id']] = data['embedding']

# Same order as IDList so that row i lines up with labelList[i].
# A labelled paper without an embedding raises KeyError here.
embeddings = [embeddingDict[paperID] for paperID in IDList]
del embeddingDict, wantedIDs

In [14]:
from gensim import models
# NOTE(review): outFileName is not used anywhere in this cell; it is kept
# only in case another cell relies on it.
outFileName = './models/node2vec_USE_2Citation_Embeddings_WL_' + str(8) + '_NN_' + str(4) + '.kv'
model = models.keyedvectors.KeyedVectors.load_word2vec_format('./models/node2vec_USE_2Citation_Embeddings.kv')
# Placeholder for papers that have no node embedding; its length follows the
# loaded model rather than a hard-coded dimension.
zeroVector = [0] * model.vector_size
# load_word2vec_format already returns a KeyedVectors object, so membership
# and lookup are done on it directly instead of through the deprecated
# `.wv` / `.vocab` attributes. Order follows IDList to stay aligned with labelList.
embeddings = [model[paperID] if paperID in model else zeroVector for paperID in IDList]
del model

  
  import sys


In [15]:
# Turn the list of per-paper embedding vectors into a single array X, then
# drop the list so only the array stays referenced.
X = np.asarray(embeddings)
del embeddings

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# X and the labels must describe the same papers, row for row.
assert X.shape[0] == len(labelList)
# Target number of samples to keep for each class.
samplingDict = {'CV': 30000, 'NLP': 30000, 'Data': 30000}
rus = RandomUnderSampler(sampling_strategy=samplingDict, random_state=0)
X, Y = rus.fit_resample(X, labelList)
# Keep the parallel lists in step with the resampled X. Otherwise the
# label-encoding cell would re-encode the full-length labelList and produce
# a Y that no longer matches X.
IDList = [IDList[i] for i in rus.sample_indices_]
labelList = list(Y)
assert X.shape[0] == len(labelList) == len(IDList)
counter = Counter(Y)
counter

In [16]:
# Encode the string class labels as integer codes in a single step; `le` is
# kept so the codes can be mapped back to label names later.
le = preprocessing.LabelEncoder()
Y = le.fit_transform(labelList)
# Show how many samples fall under each integer code.
counter = Counter(Y)
print(counter)

Counter({0: 142790, 1: 117215, 2: 57416})


In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier


# Display names, in the same order as `classifiers` below.
names = [
 "Random Forest", "Neural Net", "AdaBoost", "Linear SVC" ]

# Every estimator gets a fixed random_state so that repeated runs of the
# notebook produce the same scores.
classifiers = [
    RandomForestClassifier(verbose=True, n_jobs=-1, random_state=0),
    MLPClassifier(verbose=True, early_stopping=True, random_state=0),
    AdaBoostClassifier(random_state=0),
    # Bagged linear SVMs, wrapped one-vs-rest for the multi-class problem.
    OneVsRestClassifier(BaggingClassifier(LinearSVC(random_state=0), n_jobs=-1, random_state=0))]

# The two lists are consumed pairwise with zip(); a length mismatch would
# silently drop a classifier.
assert len(names) == len(classifiers)


In [18]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, KFold

# A fixed random_state makes the shuffled folds reproducible and, because
# split() then yields the same partition on every call, every classifier is
# scored on identical train/test folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
for name, clf in zip(names, classifiers):
    # Per-fold weighted metrics for the current classifier.
    precScores = []
    recallScores = []
    f1Scores = []
    for count, (train_index, test_index) in enumerate(kfold.split(X, Y), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        print('Fitting: ', count)
        clf.fit(X_train, y_train)
        print('count ', count)
        y_pred = clf.predict(X_test)
        # average='weighted' combines the per-class scores weighted by class support.
        prec, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precScores.append(prec)
        recallScores.append(recall)
        f1Scores.append(fscore)
    print('Name', name,'. Avg Precision: ', average(precScores), '. Avg Recall: ', average(recallScores), '. Avg F-1 Score: ', average(f1Scores) )


Fitting:  1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   35.9s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  1


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.3s finished


Fitting:  2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   37.8s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  2


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.3s finished


Fitting:  3


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   36.9s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  3


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.2s finished


Fitting:  4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   39.5s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  4


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.3s finished


Fitting:  5


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   38.2s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  5


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.3s finished


Name Random Forest . Avg Precision:  0.9054602580821862 . Avg Recall:  0.9067957039648548 . Avg F-1 Score:  0.9055497816508125
Fitting:  1
Iteration 1, loss = 0.38080415
Validation score: 0.899071
Iteration 2, loss = 0.29195660
Validation score: 0.906041
Iteration 3, loss = 0.27632414
Validation score: 0.905686
Iteration 4, loss = 0.26920996
Validation score: 0.909782
Iteration 5, loss = 0.26430561
Validation score: 0.908522
Iteration 6, loss = 0.26180344
Validation score: 0.905883
Iteration 7, loss = 0.25885140
Validation score: 0.911751
Iteration 8, loss = 0.25690439
Validation score: 0.905923
Iteration 9, loss = 0.25470445
Validation score: 0.911711
Iteration 10, loss = 0.25354121
Validation score: 0.912460
Iteration 11, loss = 0.25163821
Validation score: 0.912735
Iteration 12, loss = 0.25070637
Validation score: 0.913995
Iteration 13, loss = 0.24920658
Validation score: 0.908640
Iteration 14, loss = 0.24881840
Validation score: 0.908167
Iteration 15, loss = 0.24858338
Validation s

Iteration 18, loss = 0.24691243
Validation score: 0.912263
Iteration 19, loss = 0.24543926
Validation score: 0.910097
Iteration 20, loss = 0.24505414
Validation score: 0.912578
Iteration 21, loss = 0.24460059
Validation score: 0.912499
Iteration 22, loss = 0.24362876
Validation score: 0.911869
Iteration 23, loss = 0.24353138
Validation score: 0.913799
Iteration 24, loss = 0.24315979
Validation score: 0.913562
Iteration 25, loss = 0.24305361
Validation score: 0.912853
Iteration 26, loss = 0.24228454
Validation score: 0.912932
Iteration 27, loss = 0.24230337
Validation score: 0.909900
Iteration 28, loss = 0.24125927
Validation score: 0.910766
Iteration 29, loss = 0.24160167
Validation score: 0.913995
Iteration 30, loss = 0.24139705
Validation score: 0.912538
Iteration 31, loss = 0.24026356
Validation score: 0.914074
Iteration 32, loss = 0.24062367
Validation score: 0.913759
Iteration 33, loss = 0.24101429
Validation score: 0.913247
Iteration 34, loss = 0.23934614
Validation score: 0.9121

In [19]:
from gensim import models

# Load the embeddings stored in word2vec format at this path into a
# KeyedVectors object (rebinds the notebook-level name `model`).
model = models.keyedvectors.KeyedVectors.load_word2vec_format('./models/node2vec_Embeddings.kv')

In [22]:
# Number of keys in the loaded embeddings: one row of `vectors` per key.
# Avoids `.wv` (deprecated on KeyedVectors) and `.vocab` (removed in gensim 4).
len(model.vectors)

  """Entry point for launching an IPython kernel.


471633