In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
from nltk import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm
import random
import json

In [37]:
dataset = pd.read_csv('../data/papers_dataset.csv')
dataset.keywords = dataset.keywords.apply(ast.literal_eval)
dataset.sections = dataset.sections.apply(ast.literal_eval)

with open('../data/categories.json', 'r') as f:
    categ_id = json.load(f)

test_df = dataset[dataset.partition == 'test']

sentences_list = []

for _, row in test_df.iterrows():
    all_sentences = []
    abstract_sents = sent_tokenize(row['abstract'])
    all_sentences += abstract_sents
    
    for section in row['sections'].values():
        section_sents = sent_tokenize(section)
        all_sentences += section_sents
        
    sentences_list.append(all_sentences)
    
test_df['sentences'] = sentences_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
data_parts = []
for i in tqdm(range(5)):
    data = pd.read_csv(f'../data/sentence_embeddings_{(i+1)*100}.csv')
    data_parts.append(data)
    
data_df = pd.concat(data_parts, axis = 0)
data_df.sent_tr_emb = data_df.sent_tr_emb.apply(lambda x: list(np.fromstring(x[1:-1],sep=' ').astype(float)))
data_df['categ_id'] = [categ_id[i] for i in data_df.category.values]

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.92s/it]


In [12]:
data_df.shape

(68777, 5)

In [13]:
data_df.head()

Unnamed: 0,title,category,sentence,categ_id,sent_tr_emb
0,A Comprehensive Review on Heart Disease Predic...,Artificial Intelligence,Heart disease is one of the major causes of li...,0,"[0.4460226, -0.845141947, 0.146280348, -1.1929..."
1,A Comprehensive Review on Heart Disease Predic...,Artificial Intelligence,The heart disease diagnosis and treatment are ...,0,"[-0.18601216, -0.037789, 0.09992175, 0.1720532..."
2,A Comprehensive Review on Heart Disease Predic...,Artificial Intelligence,"Inadequate preventive measures, lack of experi...",0,"[-0.231186911, -0.189537778, 0.0427042805, -0...."
3,A Comprehensive Review on Heart Disease Predic...,Artificial Intelligence,"Although, large proportion of heart diseases i...",0,"[0.16552189, -0.309478015, 0.113730304, -0.397..."
4,A Comprehensive Review on Heart Disease Predic...,Artificial Intelligence,"In today's digital world, several clinical dec...",0,"[0.10223487, -0.170204878, 0.340327203, -0.030..."


In [17]:
test_papers = set(test_df.title.values)
len(test_papers)

36

In [19]:
data_df = data_df[data_df.title.isin(test_papers)]
data_df.shape

(4173, 5)

In [22]:
len(data_df.title.value_counts())

36

In [24]:
type(data_df.sent_tr_emb.iloc[0])

list

In [25]:
sent_emb_dict = dict(zip(data_df.sentence.values, data_df.sent_tr_emb.values))

In [None]:
sentence_embeddings = []

def get_sent_transf_emb(paper_text, emb_df):
    sentence_embeddings = [sent_emb_dict[sentence] for sentence in paper_text]

In [38]:
test_df['sentence_embeddings'] = test_df.sentences.apply(lambda x: [sent_emb_dict[sentence] for sentence in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
test_df

Unnamed: 0,paper_id,category,name,num_pages,num_formulas,num_figures,title,keywords,abstract,sections,num_sentences,sent_by_page,partition,sentences,sentence_embeddings
1,paper_1,Artificial Intelligence,A Model for Clustering Social Media Data for E...,4,0,4,A Model for Clustering Social Media Data for E...,"[Social Media, Twitter Application Programming...","Through Social media, people are able to write...",{'Introduction': 'Clustering is a descriptive ...,76,25.333333,test,"[Through Social media, people are able to writ...","[[0.992317319, 0.0893145874, -0.340903342, -0...."
2,paper_2,Artificial Intelligence,An Intelligent System for Traffic Control in S...,8,5,9,An Intelligent System for Traffic Control in S...,"[Smart Cities, Traffic Congestion, Intelligent...",Current traffic light systems use a fixed time...,"{'Introduction': '', 'Background': 'Traffic co...",99,14.142857,test,[Current traffic light systems use a fixed tim...,"[[-0.375607759, -0.0790585652, 0.888247252, -0..."
3,paper_3,Artificial Intelligence,Architecture Trends of Adaptive Educational Hy...,14,0,12,Architecture Trends of Adaptive Educational Hy...,"[Adaptive Educational Hypermedia Systems, Arch...",The aim of this article is to present the gene...,{'Introduction': 'Adaptive Hypermedia Educatio...,203,15.615385,test,[The aim of this article is to present the gen...,"[[-0.564446807, 0.879449368, 0.720221996, -0.5..."
21,paper_21,Computer Science and Technology,A New Powerful Scheme Based on Self Invertible...,5,1,2,A New Powerful Scheme Based on Self Invertible...,"[Minimum Distance, Minimum Weight, BCH Codes, ...","In this paper, we present the powerful scheme ...",{'Introduction': 'In telecommunication and sto...,60,15.0,test,"[In this paper, we present the powerful scheme...","[[-0.848686039, 0.408086479, 0.431638807, 0.30..."
31,paper_31,Computer Science and Technology,Design and Implementation of Intelligent Medic...,6,2,10,Design and Implementation of Intelligent Medic...,"[Smart Medical Care, ZigBee, Semantic Matching]",With the continuous improvement of human livin...,"{'Introduction': 'From 1990 to 2017, the morbi...",124,24.8,test,[With the continuous improvement of human livi...,"[[0.154241785, 0.48725009, 0.497481078, -1.171..."
38,paper_38,Computer Science and Technology,Predicting Students’ First-Year Academic Perfo...,13,1,11,Predicting Students' First-Year Academic Perfo...,"[Ordinary Level, Unified Tertiary Matriculatio...",The study aimed to determine if any of the ent...,{'Introduction': 'Education is an essential is...,186,15.5,test,[The study aimed to determine if any of the en...,"[[0.0349662565, 0.540855408, 0.477420688, -0.3..."
57,paper_57,Control and Intelligent Systems,Ant Colony Optimization with Genetic Operation...,6,6,15,Ant colony optimization with genetic operations,"[Ant Colony Optimization, Genetic Operations, ...",This paper attempts to overcome stagnation pro...,{'Introduction': 'Wide range of problems like ...,131,26.2,test,[This paper attempts to overcome stagnation pr...,"[[-0.762145102, 0.363119364, 0.133871302, -0.7..."
78,paper_78,Control and Intelligent Systems,Fuzzy C-means Clustering Applied to the Classi...,5,8,2,Fuzzy C-means Clustering Applied to the Classi...,"[Fuzzy Set Theory, Numerical Classification, L...",Fuzzy C-means clustering is a soft technique a...,"{'Introduction': 'Quantitative methods, such a...",126,31.5,test,[Fuzzy C-means clustering is a soft technique ...,"[[-1.2003119, -0.327789366, 0.26096487, -0.069..."
96,paper_96,Control and Intelligent Systems,On the Warning System of Obstacle Avoidance of...,4,0,1,On the warning system of obstacle avoidance of...,"[Embedded System, Electronic Guide Dog, Avoidi...",With the rapid development of China's transpor...,"{'Introduction': 'During last summer, I went b...",68,22.666667,test,[With the rapid development of China's transpo...,"[[-0.356183827, -0.418222368, 0.153359815, -0...."
134,paper_134,Data Mining and Knowledge Discovery,Data Mining Technique Used in Order to Analysi...,6,0,7,Data Mining Technique Used in Order to Analysi...,"[Data Mining, Regression, Olive Oil, Authentic...","Data mining, also referred to as knowledge ext...",{'Introduction': 'New information and communic...,103,20.6,test,"[Data mining, also referred to as knowledge ex...","[[-0.0839086249, 0.508139968, 0.204696253, 0.5..."


In [42]:
test_df.to_csv('./test_papers_sent_transf.csv', index = False)