In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
nltk.data.path.append('/kaggle/working')
nltk.download("all", download_dir='/kaggle/working')
nltk.download("wordnet", download_dir='/kaggle/working')

import os
os.environ['NLTK_DATA'] = '/kaggle/working'

!mkdir /kaggle/working/corpora/wordnet
!unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora

In [None]:
train_file_path = "/kaggle/input/nlp-getting-started/train.csv"
train_data = pd.read_csv(train_file_path)

print(train_data.columns)

test_file_path = "/kaggle/input/nlp-getting-started/test.csv"
test_data = pd.read_csv(test_file_path)

print(test_data.columns)

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Class responsible for handling the input data
class Dataset:
    train_data = None
    learn_data = None
    assess_data = None
    test_data = None
    combined_data = None
    
    target_data = None
    
    combined_encoded_text = None
    train_encoded_text = None
    test_encoded_text = None
    learn_encoded_text = None
    assess_encoded_text = None
        
    # data is pandas DataFrame
    def __init__(self, train_data, test_data, learn_ratio, assess_ratio):
        assert learn_ratio + assess_ratio == 1, \
        "The sum of learn_ratio and assess_ratio should be equal to 1"
        
        self.train_data = train_data
        self.test_data = test_data
        
        self.target_data = self.train_data.iloc[:, -1]
        self.combined_data = pd.concat([self.train_data.iloc[:, :-1], self.test_data])
        
        encoded_documents = self.encode_documents(self.combined_data['text'])
        self.combined_encoded_text = pd.DataFrame.sparse.from_spmatrix(encoded_documents)
        self.train_encoded_text = self.combined_encoded_text.iloc[:len(self.train_data)]
        self.test_encoded_text = self.combined_encoded_text.iloc[len(self.train_data):]
        
        learn_len = int(learn_ratio * len(self.train_data))
        self.learn_data  = self.train_data.iloc[:learn_len]
        self.assess_data = self.train_data.iloc[learn_len:]
        self.learn_data_encoded_text = self.train_encoded_text[:learn_len]
        self.assess_data_encoded_text = self.train_encoded_text[learn_len:]
        
    def tokenize(self, text):
        words = re.findall(r'\w+', text.lower())
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in words]
    
    def encode_documents(self, documents):
        vectorizer = TfidfVectorizer(tokenizer = self.tokenize)
        return vectorizer.fit_transform(documents)
        

In [None]:
dataset = Dataset(train_data, test_data, 0.8, 0.2)

assert len(dataset.learn_data) + len(dataset.assess_data) == len(dataset.train_data), \
"The size of the learn_data and assess_data should total to the size of data"

print("Learn dataset len={}".format(len(dataset.learn_data)))
print("Assess dataset len={}".format(len(dataset.assess_data)))
print("Total dataset len={}".format(len(dataset.train_data)))

In [None]:
# Neural Network model

import tensorflow as tf
from tensorflow import keras

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

input_size = dataset.train_encoded_text.shape[1]
hidden_layer_size = 10

model = keras.models.Sequential()
model.add(keras.layers.Dense(hidden_layer_size, activation='swish', input_shape=(input_size,)))
model.add(keras.layers.Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(dataset.train_encoded_text, dataset.target_data, epochs = 50)

In [1]:
# Download the word embeddings
!curl -o /kaggle/working/glove.twitter.27B.zip https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1449M  100 1449M    0     0  5204k      0  0:04:45  0:04:44  0:00:01 5118k0:03:55 4845k  0  0:04:39  0:01:40  0:02:59 5238k04:42  0:02:47  0:01:55 5131k:04:43  0:03:23  0:01:20 5116k04:45  0:04:45 --:--:-- 5126k


In [12]:
# Unzip the downloaded word embeddings
!unzip /kaggle/working/glove.twitter.27B.zip -d /kaggle/working
!ls -alt /kaggle/working

Archive:  /kaggle/working/glove.twitter.27B.zip
  inflating: /kaggle/working/glove.twitter.27B.25d.txt  
  inflating: /kaggle/working/glove.twitter.27B.50d.txt  
  inflating: /kaggle/working/glove.twitter.27B.100d.txt  
  inflating: /kaggle/working/glove.twitter.27B.200d.txt  
total 5242480
drwxr-xr-x 3 root root       4096 Jun 21 20:15 .
-rw-r--r-- 1 root root 1520408563 Jun 21 20:12 glove.twitter.27B.zip
drwxr-xr-x 2 root root       4096 Jun 21 19:59 .virtual_documents
drwxr-xr-x 5 root root       4096 Jun 21 19:59 ..
---------- 1 root root        263 Jun 21 19:59 __notebook_source__.ipynb
-rw-rw-r-- 1 root root 2057590469 Aug 14  2014 glove.twitter.27B.200d.txt
-rw-rw-r-- 1 root root 1021669379 Aug 14  2014 glove.twitter.27B.100d.txt
-rw-rw-r-- 1 root root  510887943 Aug 14  2014 glove.twitter.27B.50d.txt
-r--r--r-- 1 root root  257699726 Aug 14  2014 glove.twitter.27B.25d.txt


In [15]:
!head -n 1 /kaggle/working/glove.twitter.27B.100d.txt
!tail -n 1 /kaggle/working/glove.twitter.27B.100d.txt

<user> 0.63006 0.65177 0.25545 0.018593 0.043094 0.047194 0.23218 0.11613 0.17371 0.40487 0.022524 -0.076731 -2.2911 0.094127 0.43293 0.041801 0.063175 -0.64486 -0.43657 0.024114 -0.082989 0.21686 -0.13462 -0.22336 0.39436 -2.1724 -0.39544 0.16536 0.39438 -0.35182 -0.14996 0.10502 -0.45937 0.27729 0.8924 -0.042313 -0.009345 0.55017 0.095521 0.070504 -1.1781 0.013723 0.17742 0.74142 0.17716 0.038468 -0.31684 0.08941 0.20557 -0.34328 -0.64303 -0.878 -0.16293 -0.055925 0.33898 0.60664 -0.2774 0.33626 0.21603 -0.11051 0.0058673 -0.64757 -0.068222 -0.77414 0.13911 -0.15851 -0.61885 -0.10192 -0.47 0.19787 0.42175 -0.18458 0.080581 -0.22545 -0.065129 -0.15328 0.087726 -0.18817 -0.08371 0.21779 0.97899 0.1092 0.022705 -0.078234 0.15595 0.083105 -0.6824 0.57469 -0.19942 0.50566 -0.18277 0.37721 -0.12514 -0.42821 -0.81075 -0.39326 -0.17386 0.55096 0.64706 -0.6093
ﾟﾟﾟｵﾔｽﾐｰ -0.028777 -0.72607 -0.8277 0.34967 0.84427 0.55021 0.42523 -0.69503 0.35228 -1.2415 -0.15464 0.077556 0.94197 -0.59194 0.2861

In [None]:
import numpy as np

def load_glove_model(file_path, size):
    model = defaultdict(lambda: np.array([0.0 for _ in range(size)]))
    with open(file_path) as f:
        for line in f:
            tokens = line.split(' ')
            word = tokens[0]
            embeddings = np.array([float(value) for value in tokens[1:]])
            model[word] = embeddings
    return model

In [17]:
# Read the GloVe with pandas
import numpy as np

glove_file_path = "/kaggle/working/glove.twitter.27B.100d.txt"
glove_embeddings = pd.read_csv(glove_file_path)

print(glove_embeddings)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 5, saw 2


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
import numpy as np

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [None]:
# np.array([0 for i in range(len(test_data))])

output = model.predict(dataset.test_encoded_text)
answer = [0 if row[0] > row[1] else 1 for row in output]

predictions = pd.DataFrame({
    'id': test_data['id'],
    'target': answer
})

predictions.to_csv("submission.csv", index = False)