In [7]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from nltk.tokenize import RegexpTokenizer
from nltk import sent_tokenize
import re
import ast
from tqdm import tqdm
import json
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
def get_paper_vocab(paper_tokenized_text):
    vocab = set()
    for sent in paper_tokenized_text:
        vocab = vocab.union(set(sent))
    return vocab

def common_words_num(vocab, new_model, base_model):
    words_num = 0
    for i in vocab:
        if (i in new_model) or (i in base_model):
            words_num += 1
    return words_num

def get_sentence_embeddings(sentence, new_model, base_model):
    word_embeddings = []
    for word in set(sentence):
        if word in new_model:
            word_emb = list(new_model[word])
            word_embeddings.append(word_emb)
        elif word in base_model:
            word_emb = list(base_model[word]) 
            word_embeddings.append(word_emb)
    return word_embeddings

def get_embeddings_for_paper(paper_text, new_model, base_model):
    sentence_embeddings = []
    for i, sentence in enumerate(paper_text):
        sentence_emb_matrix = np.array(get_sentence_embeddings(sentence, new_model, base_model))
        
        if sentence_emb_matrix.shape[0] == 0:
            sentence_embeddings.append(list(np.random.uniform(-1, 1, 300)))
        else:
            sentence_embeddings.append(list(np.mean(sentence_emb_matrix, axis = 0)))
    
    print((len(sentence_embeddings), len(sentence_embeddings[0])))
    return sentence_embeddings

### Prepairing Test Dataset

In [4]:
dataset = pd.read_csv('../data/papers_dataset.csv')
dataset.keywords = dataset.keywords.apply(ast.literal_eval)
dataset.sections = dataset.sections.apply(ast.literal_eval)

test_df = dataset[dataset.partition == 'test']
print(test_df.shape)

texts = []

for _, row in test_df.iterrows():
    text = row['abstract']
    for i in row['sections'].values():
        text += ' ' + i
    texts.append(text)
    
test_df['text'] = texts
test_df['sentences'] = test_df.text.apply(sent_tokenize)
test_df['sentences'] = test_df['sentences'].apply(lambda x: [sent for sent in x if len(sent.split()) > 3])

(36, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
sentences = test_df.sentences.values

tokenizer = RegexpTokenizer(r'\w+')
sentences_tokenized = [[re.sub(r'\[.*?\]', '', re.sub(r'\d+', '', w)).lower() for w in sent] for sent in sentences]
sentences_tokenized = [[tokenizer.tokenize(i) for i in sent] for sent in sentences_tokenized]
sentences_tokenized = [[[w for w in sentence if not w in stop_words] for sentence in paper] for paper in sentences_tokenized]

tok_test_vocab = set()
for sent in sentences_tokenized:
    for i in sent:
        tok_test_vocab = tok_test_vocab.union(set(i))
print(len(tok_test_vocab))

7004


In [16]:
test_df['sentences_tokenized'] = sentences_tokenized

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
test_df.head()

Unnamed: 0,paper_id,category,name,num_pages,num_formulas,num_figures,title,keywords,abstract,sections,num_sentences,sent_by_page,partition,text,sentences,sentences_tokenized
1,paper_1,Artificial Intelligence,A Model for Clustering Social Media Data for E...,4,0,4,A Model for Clustering Social Media Data for E...,"[Social Media, Twitter Application Programming...","Through Social media, people are able to write...",{'Introduction': 'Clustering is a descriptive ...,76,25.333333,test,"Through Social media, people are able to write...","[Through Social media, people are able to writ...","[[social, media, people, able, write, short, m..."
2,paper_2,Artificial Intelligence,An Intelligent System for Traffic Control in S...,8,5,9,An Intelligent System for Traffic Control in S...,"[Smart Cities, Traffic Congestion, Intelligent...",Current traffic light systems use a fixed time...,"{'Introduction': '', 'Background': 'Traffic co...",99,14.142857,test,Current traffic light systems use a fixed time...,[Current traffic light systems use a fixed tim...,"[[current, traffic, light, systems, use, fixed..."
3,paper_3,Artificial Intelligence,Architecture Trends of Adaptive Educational Hy...,14,0,12,Architecture Trends of Adaptive Educational Hy...,"[Adaptive Educational Hypermedia Systems, Arch...",The aim of this article is to present the gene...,{'Introduction': 'Adaptive Hypermedia Educatio...,203,15.615385,test,The aim of this article is to present the gene...,[The aim of this article is to present the gen...,"[[aim, article, present, general, architecture..."
21,paper_21,Computer Science and Technology,A New Powerful Scheme Based on Self Invertible...,5,1,2,A New Powerful Scheme Based on Self Invertible...,"[Minimum Distance, Minimum Weight, BCH Codes, ...","In this paper, we present the powerful scheme ...",{'Introduction': 'In telecommunication and sto...,60,15.0,test,"In this paper, we present the powerful scheme ...","[In this paper, we present the powerful scheme...","[[paper, present, powerful, scheme, zsismp, zi..."
31,paper_31,Computer Science and Technology,Design and Implementation of Intelligent Medic...,6,2,10,Design and Implementation of Intelligent Medic...,"[Smart Medical Care, ZigBee, Semantic Matching]",With the continuous improvement of human livin...,"{'Introduction': 'From 1990 to 2017, the morbi...",124,24.8,test,With the continuous improvement of human livin...,[With the continuous improvement of human livi...,"[[continuous, improvement, human, living, cond..."


### Reading Word2vec models

In [18]:
base_model = KeyedVectors.load_word2vec_format('../Fine-Tuning-Word2Vec-Embeddings/GoogleNews-vectors-negative300.bin', 
                                               binary=True)
new_model = KeyedVectors.load_word2vec_format("../Fine-Tuning-Word2Vec-Embeddings/new_word2vec.model")

INFO - 03:31:37: loading projection weights from ../Fine-Tuning-Word2Vec-Embeddings/GoogleNews-vectors-negative300.bin
INFO - 03:33:12: KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ../Fine-Tuning-Word2Vec-Embeddings/GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-05-10T03:33:12.358454', 'gensim': '4.0.1', 'python': '3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'load_word2vec_format'}
INFO - 03:33:13: loading projection weights from ../Fine-Tuning-Word2Vec-Embeddings/new_word2vec.model
INFO - 03:33:49: KeyedVectors lifecycle event {'msg': 'loaded (29222, 300) matrix of type float32 from ../Fine-Tuning-Word2Vec-Embeddings/new_word2vec.model', 'binary': False, 'encoding': 'utf8', 'datetime': '2021-05-10T03:33:49.386476', 'gensim': '4.0.1', 'python': '3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]', 'platform': '

In [19]:
for paper_text in test_df.sentences_tokenized.values:
    print(len(paper_text))
    get_embeddings_for_paper(paper_text, new_model, base_model)

    paper_vocab = get_paper_vocab(paper_text)
    print(common_words_num(paper_vocab, new_model, base_model)/len(paper_vocab)*100)
    print('-'*30)

76
(76, 300)
100.0
------------------------------
96
(96, 300)
98.00332778702163
------------------------------
202
(202, 300)
97.84172661870504
------------------------------
58
(58, 300)
95.94594594594594
------------------------------
118
(118, 300)
98.34710743801654
------------------------------
182
(182, 300)
97.279792746114
------------------------------
122
(122, 300)
99.79550102249489
------------------------------
108
(108, 300)
79.82062780269058
------------------------------
66
(66, 300)
98.36734693877551
------------------------------
99
(99, 300)
99.00332225913621
------------------------------
45
(45, 300)
96.06299212598425
------------------------------
159
(159, 300)
99.77876106194691
------------------------------
133
(133, 300)
97.31012658227847
------------------------------
118
(118, 300)
98.75346260387812
------------------------------
96
(96, 300)
99.11111111111111
------------------------------
194
(194, 300)
98.49699398797596
------------------------------
48
(

In [20]:
test_df['sentence_embeddings'] = test_df.sentences_tokenized.apply(lambda x: 
                                                                   get_embeddings_for_paper(x, new_model, base_model))

(76, 300)
(96, 300)
(202, 300)
(58, 300)
(118, 300)
(182, 300)
(122, 300)
(108, 300)
(66, 300)
(99, 300)
(45, 300)
(159, 300)
(133, 300)
(118, 300)
(96, 300)
(194, 300)
(48, 300)
(98, 300)
(57, 300)
(145, 300)
(133, 300)
(118, 300)
(110, 300)
(59, 300)
(76, 300)
(143, 300)
(87, 300)
(68, 300)
(71, 300)
(76, 300)
(86, 300)
(168, 300)
(163, 300)
(103, 300)
(175, 300)
(116, 300)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
test_df.head()

Unnamed: 0,paper_id,category,name,num_pages,num_formulas,num_figures,title,keywords,abstract,sections,num_sentences,sent_by_page,partition,text,sentences,sentences_tokenized,sentence_embeddings
1,paper_1,Artificial Intelligence,A Model for Clustering Social Media Data for E...,4,0,4,A Model for Clustering Social Media Data for E...,"[Social Media, Twitter Application Programming...","Through Social media, people are able to write...",{'Introduction': 'Clustering is a descriptive ...,76,25.333333,test,"Through Social media, people are able to write...","[Through Social media, people are able to writ...","[[social, media, people, able, write, short, m...","[[0.25343436, 0.012410111, -0.34438372, 0.3019..."
2,paper_2,Artificial Intelligence,An Intelligent System for Traffic Control in S...,8,5,9,An Intelligent System for Traffic Control in S...,"[Smart Cities, Traffic Congestion, Intelligent...",Current traffic light systems use a fixed time...,"{'Introduction': '', 'Background': 'Traffic co...",99,14.142857,test,Current traffic light systems use a fixed time...,[Current traffic light systems use a fixed tim...,"[[current, traffic, light, systems, use, fixed...","[[0.08013701, -0.043167926, 0.019797344, 0.273..."
3,paper_3,Artificial Intelligence,Architecture Trends of Adaptive Educational Hy...,14,0,12,Architecture Trends of Adaptive Educational Hy...,"[Adaptive Educational Hypermedia Systems, Arch...",The aim of this article is to present the gene...,{'Introduction': 'Adaptive Hypermedia Educatio...,203,15.615385,test,The aim of this article is to present the gene...,[The aim of this article is to present the gen...,"[[aim, article, present, general, architecture...","[[0.13080898, -0.11806064, -0.047516536, 0.097..."
21,paper_21,Computer Science and Technology,A New Powerful Scheme Based on Self Invertible...,5,1,2,A New Powerful Scheme Based on Self Invertible...,"[Minimum Distance, Minimum Weight, BCH Codes, ...","In this paper, we present the powerful scheme ...",{'Introduction': 'In telecommunication and sto...,60,15.0,test,"In this paper, we present the powerful scheme ...","[In this paper, we present the powerful scheme...","[[paper, present, powerful, scheme, zsismp, zi...","[[0.22547568, 0.05606357, -0.1207298, 0.160863..."
31,paper_31,Computer Science and Technology,Design and Implementation of Intelligent Medic...,6,2,10,Design and Implementation of Intelligent Medic...,"[Smart Medical Care, ZigBee, Semantic Matching]",With the continuous improvement of human livin...,"{'Introduction': 'From 1990 to 2017, the morbi...",124,24.8,test,With the continuous improvement of human livin...,[With the continuous improvement of human livi...,"[[continuous, improvement, human, living, cond...","[[-0.21172313, 0.35368863, -0.16834831, 0.2698..."


In [22]:
test_df.to_csv('./test_papers_word2vec.csv', index = False)