In [1]:
import numpy as np
import pandas as pd

target_name_dict = { 'astro-ph.GA' : 0,
                    'astro-ph.SR' : 1,
                    'astro-ph.IM' : 2,
                    'astro-ph.EP' : 3,
                    'astro-ph.HE' : 4,
                    'astro-ph.CO' : 5
                }

label2target = { v:k for k,v in target_name_dict.items()}

In [2]:
!pwd

/home/local/EC/andreas.merentitis/Sagemaker/Mygit/arxiv_explore/exploration/data_check


In [3]:
df = pd.HDFStore("../../data/2015astroph.h5", "r")
df['/df'].keys()
abstracts = df['/df']['abstract']
labels = np.array(df['/df']['categories'])
df.close()

In [4]:
j = np.random.randint(len(labels))
print(j, target_name_dict[labels[j][0]])
print(abstracts[j])

44745 0
Gravitationally lensed (GL) quasars are brighter than their unlensed
counterparts and produce images with distinctive morphological signatures. Past
searches and target selection algorithms, in particular the Sloan Quasar Lens
Search (SQLS), have relied on basic morphological criteria, which were applied
to samples of bright, spectroscopically confirmed quasars. The SQLS techniques
are not sufficient for searching into new surveys (e.g. DES, PS1, LSST),
because spectroscopic information is not readily available and the large data
volume requires higher purity in target/candidate selection. We carry out a
systematic exploration of machine learning techniques and demonstrate that a
two step strategy can be highly effective. In the first step we use
catalog-level information ($griz$+WISE magnitudes, second moments) to preselect
targets, using artificial neural networks. The accepted targets are then
inspected with pixel-by-pixel pattern recognition algorithms (Gradient-Boosted
Tre

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [6]:
# settings
maxlen = 150
max_words = 10000 # Top 10000 words
training_samples = 6000 #

In [7]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(abstracts)
sequences = tokenizer.texts_to_sequences(abstracts)
word_index = tokenizer.word_index

print("Found %s unique tokens" % len(word_index))

word_index_reverse = dict()

for k, v in word_index.items():
    word_index_reverse[v] = k

Found 78157 unique tokens


In [8]:
data = pad_sequences(sequences=sequences, maxlen=maxlen)
indices = np.arange(abstracts.shape[0])

np.random.seed(1234)

np.random.shuffle(indices)

data = data[indices]

labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]

x_test = data[training_samples:]
y_test = labels[training_samples:]

In [9]:
#https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
# Creating a reverse dictionary
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 
my_texts = list(map(sequence_to_text, data))

# alternative way
my_texts_2 = tokenizer.sequences_to_texts(sequences=sequences)

In [10]:
j = np.random.randint(len(x_train))
print(j, target_name_dict[y_train[j][0]])
print(my_texts[j])

1221 0
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'we', 'report', 'the', 'discovery', 'of', 'a', 'narrow', 'stellar', 'stream', 'crossing', 'the', 'of', 'sculptor', 'and', 'fornax', 'in', 'the', 'southern', 'celestial', 'hemisphere', 'the', 'portion', 'of', 'the', 'stream', 'detected', 'in', 'the', 'data', 'release', '1', 'photometry', 'of', 'the', 'atlas', 'survey', 'is', 'at', 'least', '12', 'degrees', 'long', 'while', 'its', 'width', 'is', 'approx', '0', '25', 'deg', 'the', 'color', 'magnitude', 'diagram', 'of', 'this', 'halo', 'sub', 'structure', 'is', 'consistent', 'with', 'a', 'metal', 'poor', 'fe', 'h', 'lesssim', '1', '4', 'stellar', 'population', 'located', 'at', 'a', 'heliocentric', 'distance', 'of', '20', 'pm', '2', 'kpc', 'there', 'are', 'three', 'globular', 'clusters', 'that', 'could', 'ten

In [11]:
y_train = np.asarray([item[0] for item in y_train.tolist()])
y_train

array(['astro-ph.SR', 'astro-ph.EP', 'astro-ph.EP', ..., 'astro-ph.CO',
       'astro-ph.GA', 'astro-ph.HE'], dtype='<U18')

In [12]:
y_test = np.asarray([item[0] for item in y_test.tolist()])
y_test

array(['astro-ph.SR', 'astro-ph.SR', 'astro-ph.CO', ..., 'astro-ph.HE',
       'astro-ph', 'gr-qc'], dtype='<U18')

In [13]:
unique_labels = np.unique(y_train)
unique_labels

array(['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA',
       'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.mtrl-sci',
       'cond-mat.quant-gas', 'cond-mat.stat-mech', 'cs.CV', 'cs.DB',
       'gr-qc', 'hep-ex', 'hep-ph', 'hep-th', 'math-ph', 'math.CA',
       'math.NA', 'math.OC', 'nucl-ex', 'nucl-th', 'physics.ao-ph',
       'physics.atom-ph', 'physics.chem-ph', 'physics.comp-ph',
       'physics.data-an', 'physics.ed-ph', 'physics.flu-dyn',
       'physics.gen-ph', 'physics.geo-ph', 'physics.hist-ph',
       'physics.ins-det', 'physics.med-ph', 'physics.optics',
       'physics.plasm-ph', 'physics.soc-ph', 'physics.space-ph',
       'quant-ph'], dtype='<U18')

In [14]:
selected_labels = ['astro-ph.GA', 'astro-ph.SR', 'astro-ph.IM', 'astro-ph.EP', 'astro-ph.HE', 'astro-ph.CO' ]

In [15]:
y_train_selected = np.asarray([item for item in y_train.tolist() if item in selected_labels])
y_train_selected

array(['astro-ph.SR', 'astro-ph.EP', 'astro-ph.EP', ..., 'astro-ph.CO',
       'astro-ph.GA', 'astro-ph.HE'], dtype='<U11')

In [16]:
jj = 0 
x_train_selected = []

for item in y_train.tolist(): 
    if item in selected_labels:
        x_train_selected.append(x_train[jj,:])
        jj = jj + 1
        
x_train_selected = np.asarray(x_train_selected)

In [17]:
y_test_selected = np.asarray([item for item in y_test.tolist() if item in selected_labels])
y_test_selected

array(['astro-ph.SR', 'astro-ph.SR', 'astro-ph.CO', ..., 'astro-ph.CO',
       'astro-ph.CO', 'astro-ph.HE'], dtype='<U11')

In [18]:
jj = 0 
x_test_selected = []

for item in y_test.tolist(): 
    if item in selected_labels:
        x_test_selected.append(x_test[jj,:])
        jj = jj + 1
        
x_test_selected = np.asarray(x_test_selected)

In [19]:
y_train_num = np.asarray([target_name_dict[x] for x in y_train_selected.tolist()])
y_test_num = np.asarray([target_name_dict[x] for x in y_test_selected.tolist()])

In [20]:
from keras.utils.np_utils import to_categorical
y_train_one_hot = to_categorical(y_train_num)
y_test_one_hot = to_categorical(y_test_num)

In [21]:
from keras.models import Sequential
import keras.layers as layers

In [22]:
model = Sequential()
embeddings_dim = 100
model.add(layers.Embedding(max_words, embeddings_dim, input_length=maxlen))
model.add(layers.Flatten())
#model.add(layers.Dense(64, activation='relu', input_shape=(maxlen,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(6, activation='softmax'))
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                480032    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 198       
Total params: 1,481,286
Trainable param

In [23]:
#model.layers[0].set_weights([embedding_matrix])
#model.layers[0].trainable= False

In [24]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train_selected, y_train_one_hot,
                   epochs=5,
                   batch_size=32,
                   validation_split=0.3)
#model.save_weights('pre_trained_glove_model_2.h5')


Train on 3608 samples, validate on 1547 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
results = model.evaluate(x_test_selected, y_test_one_hot)
class_prediction = model.predict_classes(x_test_selected)



In [26]:
results

[3.022110013562863, 0.23108041286468506]

In [27]:
jj = np.random.randint(len(x_test_selected))
print(x_test_selected[jj])
print(my_texts[training_samples+jj])
print("true class: ", target_name_dict[y_test_selected[jj]])
print("prediction: ", class_prediction[jj])

[   1 4740  182   48    3  603 4414    1  382  633   15 3972    3  603
 4414   12 2381 1354  220    3 2201 1354  220  424    7   45   10    4
  175  159 2143 8024   15   96 2747   11   78   41 1042    2    1   70
    3    1  159 2143 1048 6165 1108   41  557 1751  983   22  927  876
  619    1  159 1024    8  249   15  213   41  207    1  382  291  801
  143   21 4414   75   15 5096 8290    7   45   10    1  159 2143    8
 1460  165    1  291  801    8  213    3    1  159 1024 1206 4998   15
    4  850  291  801   11  285 8290    1  175  159 2143    3   67  291
  801   60    5    4 2535    3  876 1172 1256    9  432  775  631  437
   54  753   10 5577    4  467   20   22 1483 1990   11 1452   42   96
    1   38   14    1 4289 4210   12  434   11 7921]
['the', 'snow', 'surface', 'between', 'and', '14', '5m', 'the', 'average', 'temperatures', 'at', '2m', 'and', '14', '5m', 'are', '54', 'circ', 'c', 'and', '46', 'circ', 'c', 'respectively', 'we', 'find', 'that', 'a', 'strong', 'temperatur

In [28]:
abstract_testing = "this is a new extrasolar system"
seq_testing = tokenizer.texts_to_sequences([[ w for w in abstract_testing.split(' ')]])
data_testing = pad_sequences(sequences=seq_testing, maxlen=maxlen)
print(data_testing)
classes_testing = model.predict(data_testing)
print("prediction: ", classes_testing[0])
print("predicted category: ", label2target[np.argmax(classes_testing[0])])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0   17    8    4   85 2232  133]]
prediction:  [0.05442783 0.31576627 0.0355003  0.04861952 0.3085935  0.23709252]
predicted category:  astro-ph.SR


In [29]:
model.save('my_model.h5')