In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [None]:
import nltk
nltk.data.path.append('/kaggle/working')
nltk.download("wordnet", download_dir='/kaggle/working')

import os
os.environ['NLTK_DATA'] = '/kaggle/working'

!mkdir /kaggle/working/corpora/wordnet
!unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora

In [14]:
# Locations of the competition's training and test CSV files.
train_file_path = "/kaggle/input/nlp-getting-started/train.csv"
test_file_path = "/kaggle/input/nlp-getting-started/test.csv"

# Load the training frame and show its columns.
train_data = pd.read_csv(filepath_or_buffer=train_file_path)
print(train_data.columns)

# Load the test frame and show its columns.
test_data = pd.read_csv(filepath_or_buffer=test_file_path)
print(test_data.columns)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [39]:
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Class responsible for handling the input data
class Dataset:
    """Holds the train/test frames and the text features derived from them.

    On construction the class:
      * takes the last column of ``train_data`` as ``target_data``;
      * concatenates the remaining training columns with ``test_data`` into
        ``combined_data`` (training rows first);
      * fits a ``Tokenizer`` on the combined ``text`` column and stores padded
        integer sequences, each ``max_sequence_length`` wide;
      * splits ``train_data`` by position into ``learn_data`` (the first
        ``learn_ratio`` share of the rows) and ``assess_data`` (the rest);
      * optionally builds lemmatized TF-IDF encodings (``compute_tfidf=True``).
    """

    # Raw frames and their positional learn/assess split
    train_data = None
    learn_data = None
    assess_data = None
    test_data = None
    combined_data = None

    # Last column of train_data
    target_data = None

    # TF-IDF encodings; these stay None unless compute_tfidf=True is passed
    combined_encoded_text = None
    train_encoded_text = None
    test_encoded_text = None
    learn_encoded_text = None
    assess_encoded_text = None

    # Padded token-index sequences
    combined_sequences = None
    train_sequences = None
    test_sequences = None

    vocab_size = None
    tokenizer = None
    # Width every token sequence is padded/truncated to
    max_sequence_length = 28

    # data is pandas DataFrame
    def __init__(self, train_data, test_data, learn_ratio, assess_ratio,
                 compute_tfidf=False):
        """Build the token sequences and the learn/assess split.

        Args:
            train_data: pandas DataFrame with a ``text`` column; its last
                column is used as the target.
            test_data: pandas DataFrame with a ``text`` column.
            learn_ratio: share of the training rows that goes to ``learn_data``.
            assess_ratio: share that goes to ``assess_data``; must complement
                ``learn_ratio`` to 1.
            compute_tfidf: when True, also fill the ``*_encoded_text``
                attributes with TF-IDF encodings. Defaults to False, which is
                what the constructor effectively did before (the TF-IDF code
                sat behind an unconditional ``return``).

        Raises:
            AssertionError: if the two ratios do not sum to 1.
        """
        # Compare with a tolerance: the ratios are floats, and an exact `== 1`
        # can reject ratio pairs whose sum is only off by rounding error.
        assert abs(learn_ratio + assess_ratio - 1) < 1e-9, \
            "The sum of learn_ratio and assess_ratio should be equal to 1"

        self.train_data = train_data
        self.test_data = test_data

        self.target_data = self.train_data.iloc[:, -1]
        self.combined_data = pd.concat([self.train_data.iloc[:, :-1], self.test_data])

        train_len = len(self.train_data)

        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.combined_data['text'])
        # presumably +1 because index 0 is kept for padding -- confirm against the Tokenizer docs
        self.vocab_size = len(self.tokenizer.word_index) + 1
        self.combined_sequences = pad_sequences(
            self.tokenizer.texts_to_sequences(self.combined_data['text']),
            maxlen=self.max_sequence_length,
            padding='post')
        # combined_data is "training rows, then test rows", so the per-split
        # sequences are slices (views) of the combined array; there is no need
        # to tokenise and pad the same texts a second time.
        self.train_sequences = self.combined_sequences[:train_len]
        self.test_sequences = self.combined_sequences[train_len:]

        learn_len = int(learn_ratio * train_len)
        self.learn_data = self.train_data.iloc[:learn_len]
        self.assess_data = self.train_data.iloc[learn_len:]

        if compute_tfidf:
            self._build_tfidf_encodings(train_len, learn_len)

    def _build_tfidf_encodings(self, train_len, learn_len):
        """Fill the ``*_encoded_text`` attributes with TF-IDF encodings.

        The vectorizer is fitted on the combined text; the result is then cut
        by position into train/test and, within train, into learn/assess.
        """
        encoded_documents = self.tfidf_encode_documents(self.combined_data['text'])
        self.combined_encoded_text = pd.DataFrame.sparse.from_spmatrix(encoded_documents)
        self.train_encoded_text = self.combined_encoded_text.iloc[:train_len]
        self.test_encoded_text = self.combined_encoded_text.iloc[train_len:]
        # Use the attribute names declared on the class (learn_encoded_text /
        # assess_encoded_text) so the declared slots are the ones that get filled.
        self.learn_encoded_text = self.train_encoded_text.iloc[:learn_len]
        self.assess_encoded_text = self.train_encoded_text.iloc[learn_len:]

    def lemmatize(self, text):
        """Lower-case ``text``, split it into ``\\w+`` tokens and lemmatize each one.

        Returns:
            list of lemmatized tokens, in their original order.
        """
        words = re.findall(r'\w+', text.lower())
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in words]

    def tfidf_encode_documents(self, documents):
        """Fit a TF-IDF vectorizer (tokenizing via ``lemmatize``) on ``documents``.

        Returns:
            the result of ``TfidfVectorizer.fit_transform(documents)``.
        """
        vectorizer = TfidfVectorizer(tokenizer = self.lemmatize)
        return vectorizer.fit_transform(documents)
        

In [40]:
# Build the dataset: 0.8 of the training rows for learning, 0.2 for assessment.
dataset = Dataset(train_data, test_data, 0.8, 0.2)

n_learn = len(dataset.learn_data)
n_assess = len(dataset.assess_data)
n_total = len(dataset.train_data)

# The two partitions together must cover the training data exactly.
assert n_learn + n_assess == n_total, \
"The size of the learn_data and assess_data should total to the size of data"

print("Learn dataset len={}".format(n_learn))
print("Assess dataset len={}".format(n_assess))
print("Total dataset len={}".format(n_total))

Learn dataset len=6090
Assess dataset len=1523
Total dataset len=7613


In [None]:
# Neural Network model: one small dense hidden layer on top of the TF-IDF features

import tensorflow as tf
from tensorflow import keras

# list_physical_devices is available directly on tf.config; the
# tf.config.experimental spelling is the older alias of the same call.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# The dataset may not carry TF-IDF features (train_encoded_text is None when the
# constructor did not build them). Build them here in that case, so the cell
# does not fail on `None.shape`.
train_features = dataset.train_encoded_text
if train_features is None:
    tfidf_matrix = dataset.tfidf_encode_documents(dataset.combined_data['text'])
    # combined_data holds the training rows first, followed by the test rows
    train_features = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix).iloc[:len(dataset.train_data)]

input_size = train_features.shape[1]
hidden_layer_size = 10

# Two softmax outputs (one per class) paired with sparse categorical
# cross-entropy, so the integer targets can be used as they are.
model = keras.models.Sequential()
model.add(keras.layers.Dense(hidden_layer_size, activation='swish', input_shape=(input_size,)))
model.add(keras.layers.Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(train_features, dataset.target_data, epochs = 50)

In [1]:
# Download the word embeddings
!curl -o /kaggle/working/glove.twitter.27B.zip https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1449M  100 1449M    0     0  5204k      0  0:04:45  0:04:44  0:00:01 5118k0:03:55 4845k  0  0:04:39  0:01:40  0:02:59 5238k04:42  0:02:47  0:01:55 5131k:04:43  0:03:23  0:01:20 5116k04:45  0:04:45 --:--:-- 5126k


In [2]:
# Unzip the downloaded word embeddings
!unzip /kaggle/working/glove.twitter.27B.zip -d /kaggle/working
!ls -alt /kaggle/working

Archive:  /kaggle/working/glove.twitter.27B.zip
  inflating: /kaggle/working/glove.twitter.27B.25d.txt  
  inflating: /kaggle/working/glove.twitter.27B.50d.txt  
  inflating: /kaggle/working/glove.twitter.27B.100d.txt  
  inflating: /kaggle/working/glove.twitter.27B.200d.txt  
total 5242480
drwxr-xr-x 3 root root       4096 Jun 21 20:45 .
-rw-r--r-- 1 root root 1520408563 Jun 21 20:34 glove.twitter.27B.zip
drwxr-xr-x 2 root root       4096 Jun 21 20:29 .virtual_documents
drwxr-xr-x 5 root root       4096 Jun 21 20:29 ..
---------- 1 root root        263 Jun 21 20:29 __notebook_source__.ipynb
-rw-rw-r-- 1 root root 2057590469 Aug 14  2014 glove.twitter.27B.200d.txt
-rw-rw-r-- 1 root root 1021669379 Aug 14  2014 glove.twitter.27B.100d.txt
-rw-rw-r-- 1 root root  510887943 Aug 14  2014 glove.twitter.27B.50d.txt
-r--r--r-- 1 root root  257699726 Aug 14  2014 glove.twitter.27B.25d.txt


In [20]:
# Print the first and the last line of glove.twitter.27B.100d.txt to inspect
# the file's line format before parsing it.
!head -n 1 /kaggle/working/glove.twitter.27B.100d.txt
!tail -n 1 /kaggle/working/glove.twitter.27B.100d.txt

<user> 0.63006 0.65177 0.25545 0.018593 0.043094 0.047194 0.23218 0.11613 0.17371 0.40487 0.022524 -0.076731 -2.2911 0.094127 0.43293 0.041801 0.063175 -0.64486 -0.43657 0.024114 -0.082989 0.21686 -0.13462 -0.22336 0.39436 -2.1724 -0.39544 0.16536 0.39438 -0.35182 -0.14996 0.10502 -0.45937 0.27729 0.8924 -0.042313 -0.009345 0.55017 0.095521 0.070504 -1.1781 0.013723 0.17742 0.74142 0.17716 0.038468 -0.31684 0.08941 0.20557 -0.34328 -0.64303 -0.878 -0.16293 -0.055925 0.33898 0.60664 -0.2774 0.33626 0.21603 -0.11051 0.0058673 -0.64757 -0.068222 -0.77414 0.13911 -0.15851 -0.61885 -0.10192 -0.47 0.19787 0.42175 -0.18458 0.080581 -0.22545 -0.065129 -0.15328 0.087726 -0.18817 -0.08371 0.21779 0.97899 0.1092 0.022705 -0.078234 0.15595 0.083105 -0.6824 0.57469 -0.19942 0.50566 -0.18277 0.37721 -0.12514 -0.42821 -0.81075 -0.39326 -0.17386 0.55096 0.64706 -0.6093
ﾟﾟﾟｵﾔｽﾐｰ -0.028777 -0.72607 -0.8277 0.34967 0.84427 0.55021 0.42523 -0.69503 0.35228 -1.2415 -0.15464 0.077556 0.94197 -0.59194 0.2861

In [21]:
import numpy as np
import collections

def load_glove_model(file_path, size, verbose = 1):
    """Load GloVe vectors from a text file into a word -> vector mapping.

    Every usable line holds a word followed by exactly `size` space-separated
    floats. Blank lines and lines with a different number of values are
    skipped, so every stored vector has length `size`.

    Args:
        file_path: path of the GloVe text file (read as UTF-8).
        size: expected number of values per word.
        verbose: when >= 1, print a completion message and, if any lines were
            skipped, how many.

    Returns:
        collections.defaultdict mapping each word to a numpy array of length
        `size`. Looking up an unknown word with `[]` yields (and, as
        defaultdict does, stores) a fresh all-zero vector of that length.

    Raises:
        ValueError: if a value on a correctly sized line is not a valid float.
    """
    model = collections.defaultdict(lambda: np.zeros(size))
    skipped = 0
    # The file contains non-ASCII words, so do not depend on the platform's
    # default text encoding.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            tokens = line.rstrip('\n').split(' ')
            values = tokens[1:]
            # A row of the wrong length would later fail to fit into a
            # (vocab_size, size) embedding matrix, so drop it here.
            if len(values) != size:
                skipped += 1
                continue
            model[tokens[0]] = np.array([float(value) for value in values])
    if verbose >= 1:
        print("Words loaded!")
        if skipped:
            print("Skipped {} line(s) that did not have {} values".format(skipped, size))
    return model

In [27]:
# Read the GloVe embeddings into a word -> vector mapping
import numpy as np

embedding_size = 100
# Derive the file name from embedding_size so the two cannot drift apart
# (the archive ships 25d, 50d, 100d and 200d files).
glove_file_path = "/kaggle/working/glove.twitter.27B.{}d.txt".format(embedding_size)
glove_embeddings = load_glove_model(glove_file_path, embedding_size)

Words loaded!


In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
import numpy as np

print(dataset.vocab_size)

# Row i of the matrix is the GloVe vector of the word whose tokenizer index is i.
# Rows of words without a usable vector stay all-zero.
embedding_matrix = np.zeros((dataset.vocab_size, embedding_size))
for word, i in dataset.tokenizer.word_index.items():
    # .get() rather than []: glove_embeddings is a defaultdict, and indexing it
    # would insert a new zero vector for every word it does not know.
    vector = glove_embeddings.get(word)
    # The length check keeps a malformed GloVe row from aborting the loop with
    # a shape mismatch.
    if vector is not None and len(vector) == embedding_size:
        embedding_matrix[i] = vector

model = Sequential()
# trainable=False keeps the pre-trained vectors fixed during training
embedding_layer = Embedding(
    dataset.vocab_size,
    embedding_size,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=dataset.max_sequence_length,
    trainable=False)
model.add(embedding_layer)
model.add(LSTM(100))
# Two softmax outputs with sparse categorical cross-entropy, so the integer
# targets can be used as they are.
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(dataset.train_sequences, dataset.target_data, epochs = 50)

29320
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f702c1d7430>

In [44]:
# Predict the two class scores for every test row and pick a class:
# 0 only when its score is strictly greater than the score of class 1, else 1.
output = model.predict(dataset.test_sequences)
class0_wins = output[:, 0] > output[:, 1]
answer = (~class0_wins).astype(int).tolist()

# Write the submission file with the two columns `id` and `target`.
predictions = pd.DataFrame({'id': test_data['id'], 'target': answer})
predictions.to_csv("submission.csv", index = False)

