In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.layers import Embedding, Input, Dense, LSTM
from keras.callbacks import ModelCheckpoint

In [None]:
train_data = pd.read_csv('internship/TRAIN.csv')
train_data.head()

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,labels
0,ChemGAN challenge for drug discovery: can AI r...,Generating molecules with desired chemical p...,1,0,0,1,0,0,"Computer Science,Statistics"
1,Hybrid graphene tunneling photoconductor with ...,Hybrid graphene photoconductor/phototransist...,0,1,0,0,0,0,Physics
2,Temperature Dependence of Magnetic Excitations...,When an ordered spin system of a given dimen...,0,1,0,0,0,0,Physics
3,A Las Vegas algorithm to solve the elliptic cu...,"In this paper, we describe a new Las Vegas a...",1,0,1,0,0,0,"Computer Science,Mathematics"
4,Comparing simulations and test data of a radia...,The VIS instrument on board the Euclid missi...,0,1,0,0,0,0,Physics


In [None]:
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = sen.lower()
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [None]:
train_data['TITLE'] = train_data['TITLE'].apply(lambda x: preprocess_text(x))
train_data['ABSTRACT'] = train_data['ABSTRACT'].apply(lambda x: preprocess_text(x))

In [None]:
train_data.head()

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,labels
0,chemgan challenge for drug discovery can ai re...,generating molecules with desired chemical pr...,1,0,0,1,0,0,"Computer Science,Statistics"
1,hybrid graphene tunneling photoconductor with ...,hybrid graphene photoconductor phototransisto...,0,1,0,0,0,0,Physics
2,temperature dependence of magnetic excitations...,when an ordered spin system of given dimensio...,0,1,0,0,0,0,Physics
3,a las vegas algorithm to solve the elliptic cu...,in this paper we describe new las vegas algor...,1,0,1,0,0,0,"Computer Science,Mathematics"
4,comparing simulations and test data of radiati...,the vis instrument on board the euclid missio...,0,1,0,0,0,0,Physics


In [None]:
y = train_data[train_data.columns[2:]]
y.head()

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,labels
0,1,0,0,1,0,0,"Computer Science,Statistics"
1,0,1,0,0,0,0,Physics
2,0,1,0,0,0,0,Physics
3,1,0,1,0,0,0,"Computer Science,Mathematics"
4,0,1,0,0,0,0,Physics


In [None]:
train_data.drop(columns = train_data.columns[2:], inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.20, random_state=42)

In [None]:
# First output
y1_train = y_train[["Computer Science"]].values
y1_test =  y_test[["Computer Science"]].values

# Second output
y2_train = y_train[["Physics"]].values
y2_test =  y_test[["Physics"]].values

# Third output
y3_train = y_train[["Mathematics"]].values
y3_test =  y_test[["Mathematics"]].values

# Fourth output
y4_train = y_train[["Statistics"]].values
y4_test =  y_test[["Statistics"]].values

# Fifth output
y5_train = y_train[["Quantitative Biology"]].values
y5_test =  y_test[["Quantitative Biology"]].values

# Sixth output
y6_train = y_train[["Quantitative Finance"]].values
y6_test =  y_test[["Quantitative Finance"]].values

In [None]:
tokenizer_title = Tokenizer(num_words=50)
tokenizer_abs = Tokenizer(num_words=50)

tokenizer_title.fit_on_texts(X_train['TITLE'])
tokenizer_abs.fit_on_texts(X_train['ABSTRACT'])

X_train_title = tokenizer_title.texts_to_sequences(X_train['TITLE'])
X_test_title = tokenizer_title.texts_to_sequences(X_test['TITLE'])

X_train_abs = tokenizer_abs.texts_to_sequences(X_train['ABSTRACT'])
X_test_abs = tokenizer_abs.texts_to_sequences(X_test['ABSTRACT'])

title_vocab_size = len(tokenizer_title.word_index) + 1
asb_vocab_size = len(tokenizer_abs.word_index) + 1

max_features = 10000
maxlen = 100

X_train_title = pad_sequences(X_train_title, maxlen=maxlen)
X_test_title = pad_sequences(X_test_title, maxlen=maxlen)

X_train_abs = pad_sequences(X_train_abs, maxlen= maxlen)
X_test_abs = pad_sequences(X_test_abs, maxlen = maxlen)

In [None]:
X_train_abs.shape

(13600, 100)

In [None]:
embeddings_index = {}
with open('glove.6B.100d.txt', encoding = 'utf8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
num_tokens = title_vocab_size
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 2 words (0 misses)


In [None]:
input_1 = Input(shape=(maxlen,))
embedding_layer = Embedding(title_vocab_size, 100, weights=[embedding_matrix], trainable=False)(input_1)
LSTM_Layer1 = LSTM(128)(embedding_layer)

output1 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output2 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output3 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output4 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output5 = Dense(1, activation='sigmoid')(LSTM_Layer1)
output6 = Dense(1, activation='sigmoid')(LSTM_Layer1)

model = Model(inputs=input_1, outputs=[output1, output2, output3, output4, output5, output6])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
print(model.summary())

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 100)     1488400     input_4[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 128)          117248      embedding_3[0][0]                
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 1)            129         lstm_1[0][0]                     
_______________________________________________________________________________________

In [None]:
callbacks = [ModelCheckpoint('internship/model.h5', save_best_only=True, save_weights_only=False)]

In [None]:
history = model.fit(x=X_train_title, y=[y1_train, y2_train, y3_train, y4_train, y5_train, y6_train], batch_size=128, epochs=5, verbose=1, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
score = model.evaluate(x=X_test_title, y=[y1_test, y2_test, y3_test, y4_test, y5_test, y6_test], verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 2.567455291748047
Test Accuracy: 0.6599200367927551


In [None]:
# test data
test_data = pd.read_csv('internship/TEST.csv')

In [None]:
test_data['TITLE'] = test_data['TITLE'].apply(lambda x: preprocess_text(x))
test_data['ABSTRACT'] = test_data['ABSTRACT'].apply(lambda x: preprocess_text(x))

In [None]:
test_title = tokenizer_title.texts_to_sequences(test_data.TITLE)
test_title = pad_sequences(test_title, maxlen=maxlen)

In [None]:
prediction = model.predict(test_title)

In [None]:
pred_df = pd.DataFrame(data = prediction[0])
for i in range(1,6,1):
    pred_df = pd.concat([pred_df, pd.DataFrame(prediction[i])], axis=1)
pred_df.columns = y.columns[:-1]

In [None]:
pred_df

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,0.270186,0.430124,0.267226,0.191431,0.029509,0.013025
1,0.303598,0.302630,0.364836,0.180323,0.028336,0.012648
2,0.290876,0.367787,0.332572,0.160505,0.023617,0.012312
3,0.476332,0.214555,0.249154,0.320579,0.019058,0.013845
4,0.270186,0.430124,0.267226,0.191431,0.029509,0.013025
...,...,...,...,...,...,...
3967,0.213361,0.290097,0.488704,0.102248,0.027390,0.011009
3968,0.321736,0.380576,0.265619,0.223622,0.026535,0.013542
3969,0.231982,0.325794,0.408305,0.132549,0.032461,0.012412
3970,0.321736,0.380576,0.265619,0.223622,0.026535,0.013542


In [None]:
labels = y.columns[:-1]

pred_df['labels'] = [labels[np.argmax(pred_df.iloc[i])] for i in range(pred_df.shape[0])]

In [None]:
pred_df

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,labels
0,0.270186,0.430124,0.267226,0.191431,0.029509,0.013025,Physics
1,0.303598,0.302630,0.364836,0.180323,0.028336,0.012648,Mathematics
2,0.290876,0.367787,0.332572,0.160505,0.023617,0.012312,Physics
3,0.476332,0.214555,0.249154,0.320579,0.019058,0.013845,Computer Science
4,0.270186,0.430124,0.267226,0.191431,0.029509,0.013025,Physics
...,...,...,...,...,...,...,...
3967,0.213361,0.290097,0.488704,0.102248,0.027390,0.011009,Mathematics
3968,0.321736,0.380576,0.265619,0.223622,0.026535,0.013542,Physics
3969,0.231982,0.325794,0.408305,0.132549,0.032461,0.012412,Mathematics
3970,0.321736,0.380576,0.265619,0.223622,0.026535,0.013542,Physics


In [None]:
test_data = pd.read_csv('internship/TEST.csv')
new_df = pd.DataFrame()
new_df = pd.concat([test_data, pred_df['labels']], axis = 1)
new_df.to_csv('internship/submission.csv', index = False)