In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import textblob

In [2]:
print("Loading data...")
train = pd.read_csv("../input/innoplexusav/train.csv")
print("Train shape:", train.shape)
test = pd.read_csv("../input/innoplexusav/test.csv")
print("Test shape:", test.shape)

Loading data...
Train shape: (5279, 4)
Test shape: (2924, 3)


In [3]:
train.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer()
cv1.fit(train["text"])

cv2 = CountVectorizer()
cv2.fit(test["text"])

print("Train Set Vocabulary Size:", len(cv1.vocabulary_))
print("Test Set Vocabulary Size:", len(cv2.vocabulary_))
print("Number of Words that occur in both:", len(set(cv1.vocabulary_.keys()).intersection(set(cv2.vocabulary_.keys()))))

Train Set Vocabulary Size: 41015
Test Set Vocabulary Size: 31570
Number of Words that occur in both: 24625


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
columns = ['drug']

def encoder(df):
    for col in columns:
        label_encoder = LabelEncoder()
        label_encoder.fit(df[col])
        df[col] = label_encoder.transform(df[col])
    return df

In [7]:
train = encoder(train)
test = encoder(test)

In [8]:
def transform(df):
    df["word_count"] = df["text"].apply(lambda x: len(x.split()))
    df["has_upper"] = df["text"].apply(lambda x: x.lower() != x)
    df["sentence_end"] = df["text"].apply(lambda x: x.endswith("."))
    df["text"] = df["text"].apply(lambda x: x.lower())
    return df

train = transform(train)
test = transform(test)

dense_features = ["drug"]

#train.groupby("Sentiment")[dense_features].mean()

In [9]:
EMBEDDING_FILE = "../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt"
EMBEDDING_DIM = 100

all_words = set(cv1.vocabulary_.keys()).union(set(cv2.vocabulary_.keys()))

def get_embedding():
    embeddings_index = {}
    f = open(EMBEDDING_FILE)
    for line in f:
        values = line.split()
        word = values[0]
        if len(values) == EMBEDDING_DIM + 1 and word in all_words:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    f.close()
    return embeddings_index

embeddings_index = get_embedding()
print("Number of words that don't exist in GLOVE:", len(all_words - set(embeddings_index)))

Number of words that don't exist in GLOVE: 13381


In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 60

tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.append(train["text"].values, test["text"].values))
word_index = tokenizer.word_index

nb_words = len(word_index) + 1
embedding_matrix = np.random.rand(nb_words, EMBEDDING_DIM + 2)

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    sent = textblob.TextBlob(word).sentiment
    if embedding_vector is not None:
        embedding_matrix[i] = np.append(embedding_vector, [sent.polarity, sent.subjectivity])
    else:
        embedding_matrix[i, -2:] = [sent.polarity, sent.subjectivity]
        
seq = pad_sequences(tokenizer.texts_to_sequences(train["text"]), maxlen=MAX_SEQUENCE_LENGTH)
test_seq = pad_sequences(tokenizer.texts_to_sequences(test["text"]), maxlen=MAX_SEQUENCE_LENGTH)

Using TensorFlow backend.


In [11]:
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping

def build_model():
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM + 2,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    dropout = SpatialDropout1D(0.2)
    mask_layer = Masking()
    lstm_layer = LSTM(50)
    
    seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    dense_input = Input(shape=(len(dense_features),))
    
    dense_vector = BatchNormalization()(dense_input)
    
    phrase_vector = lstm_layer(mask_layer(dropout(embedding_layer(seq_input))))
    
    feature_vector = concatenate([phrase_vector, dense_vector])
    feature_vector = Dense(50, activation="relu")(feature_vector)
    feature_vector = Dense(20, activation="relu")(feature_vector)
    
    output = Dense(3, activation="softmax")(feature_vector)
    
    model = Model(inputs=[seq_input, dense_input], outputs=output)
    return model

In [12]:
train = train.reset_index()
test = test.reset_index()

In [13]:
NUM_FOLDS = 3
train["fold_id"] = train["index"].apply(lambda x: x%NUM_FOLDS)

In [14]:
enc = OneHotEncoder(sparse=False)
enc.fit(train["sentiment"].values.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

In [15]:
test_preds = np.zeros((test.shape[0], 3))

for i in range(NUM_FOLDS):
    print("FOLD", i+1)
    
    print("Splitting the data into train and validation...")
    train_seq, val_seq = seq[train["fold_id"] != i], seq[train["fold_id"] == i]
    train_dense, val_dense = train[train["fold_id"] != i][dense_features], train[train["fold_id"] == i][dense_features]
    y_train = enc.transform(train[train["fold_id"] != i]["sentiment"].values.reshape(-1, 1))
    y_val = enc.transform(train[train["fold_id"] == i]["sentiment"].values.reshape(-1, 1))
    
    print("Building the model...")
    model = build_model()
    model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["acc"])
    
    early_stopping = EarlyStopping(monitor="val_acc", patience=2, verbose=1)
    
    print("Training the model...")
    model.fit([train_seq, train_dense], y_train, validation_data=([val_seq, val_dense], y_val),
              epochs=15, batch_size=128, shuffle=True, callbacks=[early_stopping], verbose=1)
    
    print("Predicting...")
    test_preds += model.predict([test_seq, test[dense_features]], batch_size=128, verbose=1)
    print()
    
test_preds /= NUM_FOLDS

FOLD 1
Splitting the data into train and validation...
Building the model...
Training the model...
Train on 3519 samples, validate on 1760 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
Predicting...

FOLD 2
Splitting the data into train and validation...
Building the model...
Training the model...
Train on 3519 samples, validate on 1760 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
Predicting...

FOLD 3
Splitting the data into train and validation...
Building the model...
Training the model...
Train on 3520 samples, validate on 1759 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
Predicting...



In [16]:
test_preds

array([[0.07135761, 0.08300569, 0.84563673],
       [0.13283553, 0.25722692, 0.60993755],
       [0.04008221, 0.03586726, 0.92405053],
       ...,
       [0.17895204, 0.14232567, 0.6787223 ],
       [0.08008266, 0.06767414, 0.85224319],
       [0.10215181, 0.27115242, 0.62669577]])

In [17]:
test["sentiment"] = test_preds.argmax(axis=1)

In [18]:
train['sentiment'].value_counts()

2    3825
1     837
0     617
Name: sentiment, dtype: int64

In [19]:
test["sentiment"] = test["sentiment"].astype(int)
test[["unique_hash", "sentiment"]].to_csv("submission.csv", index=False)

In [20]:
print("Select the class with the highest probability as prediction...")
test["pred"] = test_preds.argmax(axis=1)

print("Use these predictions for the phrases which don't exist in train set...")
test.loc[test["sentiment"].isnull(), "sentiment"] = test.loc[test["sentiment"].isnull(), "pred"]

print("Make the submission ready...")
#test["sentiment"] = test["sentiment"].astype(int)
#test[["unique_hash", "sentiment"]].to_csv("submission.csv", index=False)

Select the class with the highest probability as prediction...
Use these predictions for the phrases which don't exist in train set...
Make the submission ready...
