In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [None]:
# Load the train and test CSVs with the same reader settings.
df_train, df_test = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (
        "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv",
        "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv",
    )
)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Row order (test rows first, then train rows) and the per-file index labels are
# kept exactly as before.
df = pd.concat([df_test, df_train])

In [None]:
# Display the combined frame for a quick visual check.
df

In [None]:
# Ordinal encoding of the five sentiment labels, from most negative (0) to most positive (4).
sentiment = {
    label: code
    for code, label in enumerate(
        ("Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive")
    )
}

In [None]:
# Replace the textual sentiment labels with their integer codes.
df["Sentiment"] = df["Sentiment"].map(sentiment)

In [None]:
# Class balance of the encoded sentiment labels.
# The Series is passed as `x=` explicitly: recent seaborn releases take `data` as the
# only positional argument, so a positional Series is no longer read as the x variable.
sns.countplot(x=df.Sentiment)

In [None]:
# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
# Shared English resources for the text-cleaning helper: Snowball stemmer and stop-word list.
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")


In [None]:

def preprocessing(text):
    """Clean a raw tweet and return its stemmed, stop-word-free tokens as one string.

    Mentions (``@user``), tokens starting with ``http`` and whitespace-preceded
    tokens that begin with a non-alphanumeric character (e.g. hashtags) are
    replaced by a space. The remaining whitespace-separated tokens are dropped
    when they are stop words and otherwise stemmed with the module-level
    ``stemmer``.

    Args:
        text: The raw tweet text.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    # Raw string: "\S" and "\s" are invalid escape sequences in a normal literal.
    re_text = r"@\S+|http+\S+|\s[^A-Za-z0-9]\S+"
    text = re.sub(re_text, ' ', text).strip()
    # Compare case-insensitively: the stop-word list is lowercase, so capitalised
    # stop words ("The", "And") would otherwise slip through the filter.
    return ' '.join(
        stemmer.stem(token)
        for token in text.split()
        if token.lower() not in stop_words
    )

In [None]:
%%time
# Clean every tweet in place with the preprocessing helper.
df["OriginalTweet"] = df["OriginalTweet"].apply(preprocessing)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for evaluation. The seed is fixed so the split, and
# every number derived from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.OriginalTweet, df.Sentiment, test_size=0.1, random_state=42
)

# **W2V**

In [None]:
import gensim


# Word2Vec hyper-parameters
W2V_MIN_COUNT = 10  # passed as `min_count` when the model is created
W2V_WINDOW = 7      # passed as `window` when the model is created
W2V_SIZE = 300      # passed as `vector_size`; also the embedding matrix width
W2V_EPOCH = 32      # number of epochs given to `train`


In [None]:
# Token lists for Word2Vec. split() without an argument returns [] for an empty
# text, whereas split(' ') returns [''] and would add an empty-string "word".
documents = [txt.split() for txt in X_train]

In [None]:
# Create the Word2Vec model without a corpus; vocabulary building and training
# happen in the following cells.
w2v_model = gensim.models.word2vec.Word2Vec(
    workers=8,
    min_count=W2V_MIN_COUNT,
    window=W2V_WINDOW,
    vector_size=W2V_SIZE,
)

In [None]:
# Scan the training token lists to build the model's vocabulary.
w2v_model.build_vocab(corpus_iterable=documents)

In [None]:
# Report how many distinct tokens the Word2Vec vocabulary holds.
words = w2v_model.wv.key_to_index  # token -> index mapping
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
%%time
# Train the embeddings on the training token lists.
w2v_model.train(
    corpus_iterable=documents,
    total_examples=len(documents),
    epochs=W2V_EPOCH,
)

In [None]:
%%time
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the Keras tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# One extra slot on top of the word index; the embedding matrix is sized with this value.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

In [None]:
%%time
# Convert both splits to integer sequences and pad them to 300 tokens.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=300)
    for split in (X_train, X_test)
)

In [None]:
# Turn both label vectors into column arrays of shape (n_samples, 1).
y_train, y_test = (
    np.array(labels).reshape(-1, 1)
    for labels in (y_train, y_test)
)

In [None]:
# Build the embedding lookup table: row i holds the Word2Vec vector of the token
# whose tokenizer index is i.
embedding_matrix = np.zeros(shape=(vocab_size, W2V_SIZE))
for token, row in tokenizer.word_index.items():
    if token not in w2v_model.wv:
        continue  # tokens unknown to Word2Vec keep an all-zero row
    embedding_matrix[row] = w2v_model.wv[token]
print(embedding_matrix.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
# Frozen embedding layer initialised from the Word2Vec matrix.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [None]:
# Embedding -> dropout -> LSTM -> single sigmoid unit.
# NOTE(review): the output is one sigmoid unit while Sentiment is encoded as five
# classes (0-4) — confirm whether a 5-way softmax head was intended.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# "val_accuracy" is not a metric that can be compiled: validation metrics are
# derived automatically by prefixing "val_" to each compiled metric. Compiling
# "accuracy" yields both the "accuracy" and "val_accuracy" history keys that the
# callbacks and the plotting cell read.
# NOTE(review): binary_crossentropy assumes 0/1 targets while Sentiment is mapped
# to 0-4 — confirm the intended objective.
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
# Both callbacks watch validation accuracy: one lowers the learning rate on a
# plateau, the other stops training when it no longer improves.
callbacks = [
    ReduceLROnPlateau(monitor='val_accuracy', patience=5, cooldown=0),
    EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5),
]

In [None]:
%%time
# Train for up to 20 epochs, holding out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=512,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
%%time
# Evaluate on the held-out split; `score` is indexed as [loss, accuracy].
score = model.evaluate(X_test, y_test, batch_size=300)
print()
for caption, value in (("ACCURACY:", score[1]), ("LOSS:", score[0])):
    print(caption, value)

In [None]:
import matplotlib.pyplot as plt


# Pull the per-epoch curves out of the training history.
training_log = history.history
acc, val_acc = training_log['accuracy'], training_log['val_accuracy']
loss, val_loss = training_log['loss'], training_log['val_loss']

epochs = range(len(acc))

# One figure per metric: training curve in blue, validation curve in red.
panels = (
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
)
for panel_no, (train_curve, val_curve, train_label, val_label, title) in enumerate(panels):
    if panel_no:
        plt.figure()  # every panel after the first gets its own figure
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
def decode_sentiment(score, include_neutral=True, *, negative_threshold=0.4, positive_threshold=0.7):
    """Translate a numeric model score into a sentiment label.

    Args:
        score: Value to classify; it is compared numerically with the thresholds.
        include_neutral: When true, scores strictly between the two thresholds
            are labelled "NEUTRAL". When false, the score is split at 0.5 into
            "NEGATIVE" / "POSITIVE" only and the thresholds are ignored.
        negative_threshold: Scores at or below this value are "NEGATIVE" in
            three-way mode. Defaults to the previously hard-coded 0.4.
        positive_threshold: Scores at or above this value are "POSITIVE" in
            three-way mode. Defaults to the previously hard-coded 0.7.

    Returns:
        One of "NEGATIVE", "NEUTRAL" or "POSITIVE".
    """
    if not include_neutral:
        return "NEGATIVE" if score < 0.5 else "POSITIVE"
    # The negative check comes first, matching the original if/elif order.
    if score <= negative_threshold:
        return "NEGATIVE"
    if score >= positive_threshold:
        return "POSITIVE"
    return "NEUTRAL"

In [None]:
import time
def predict(text, include_neutral=True):
    """Score a single text with the trained model and decode the result.

    Args:
        text: The text to classify.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        A dict with the decoded "label", the numeric "score" as a Python float
        and the "elapsed_time" of the call in seconds.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start_at = time.perf_counter()
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # Predict. The output is flattened and its first element taken explicitly,
    # because calling float() on a 1-element array is deprecated in recent NumPy.
    score = float(np.ravel(model.predict(x_test))[0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
       "elapsed_time": time.perf_counter()-start_at}

In [None]:

# Try the prediction helper on a short sample sentence.
predict("i don't know what i'm doing")

In [25]:
# Each frame is loaded from the file its name refers to; the two paths were
# previously swapped, so df_train held the test set and df_test the train set.
df_train = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv", engine = 'python')
df_test = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv", engine = 'python')
# Test rows first, then train rows: the same row order the combined frame had
# before the paths were corrected. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
df = pd.concat([df_test, df_train])

In [26]:
# Display the tweet text column of the reloaded frame.
df.OriginalTweet

0        TRENDING: New Yorkers encounter empty supermar...
1        When I couldn't find hand sanitizer at Fred Me...
2        Find out how you can protect yourself and love...
3        #Panic buying hits #NewYork City as anxious sh...
4        #toiletpaper #dunnypaper #coronavirus #coronav...
                               ...                        
41152    Airline pilots offering to stock supermarket s...
41153    Response to complaint not provided citing COVI...
41154    You know its getting tough when @KameronWilds...
41155    Is it wrong that the smell of hand sanitizer i...
41156    @TartiiCat Well new/used Rift S are going for ...
Name: OriginalTweet, Length: 44955, dtype: object

In [27]:
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
# Shared English resources for the text-cleaning helper: Snowball stemmer and stop-word list.
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

def preprocessing(text):
    """Clean a tweet into lowercase, stemmed, stop-word-free tokens.

    Mentions (``@user``), tokens starting with ``http``, whitespace-preceded
    tokens that begin with a non-alphanumeric character, and any remaining
    character that is neither alphanumeric nor whitespace are replaced by a
    space. The text is then lowercased and split on whitespace; tokens found in
    the module-level ``stop_words`` are dropped and the rest are stemmed with
    the module-level ``stemmer``.

    Args:
        text: The raw tweet text.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    # Raw string: "\S" and "\s" are invalid escape sequences in a normal literal.
    # The pattern itself is unchanged.
    re_text = r"@\S+|http+\S+|\s[^A-Za-z0-9]\S+|[^A-Za-z0-9\s]"
    text = re.sub(re_text, ' ', text).strip()
    text = text.lower()
    return ' '.join(
        stemmer.stem(token)
        for token in text.split()
        if token not in stop_words
    )

In [28]:
# Quick check of the cleaning helper on a short sample string.
preprocessing("Hello, World!")

'hello world'

In [31]:
# Clean every tweet in place with the preprocessing helper.
df["OriginalTweet"] = df["OriginalTweet"].apply(preprocessing)

# Tokenizer

In [43]:
# Cleaned tweets as a NumPy array of strings.
texts = df["OriginalTweet"].to_numpy()

In [68]:
%%time
from nltk.tokenize import word_tokenize
# from keras.preprocessing.sequence import pad_sequences


def tokenize(texts):
    """Tokenize every text with ``word_tokenize``.

    Args:
        texts: Iterable of strings.

    Returns:
        A list holding one ``word_tokenize`` result per input text, in input order.
    """
    return [word_tokenize(text) for text in texts]

# Tokenize the whole cleaned corpus.
tokenized_texts = tokenize(texts=texts)

# vocab_size = len(tokenizer.word_index) + 1
# print("Total words", vocab_size)

CPU times: user 9.28 s, sys: 44.5 ms, total: 9.33 s
Wall time: 9.33 s


# Padding

In [78]:
def pad(texts, pad_token="p"):
    """Left-pad every token sequence to the length of the longest one.

    Args:
        texts: Iterable of token sequences (lists, tuples, ...). It may be a
            one-shot iterable such as a generator.
        pad_token: Filler token placed in front of shorter sequences. Defaults
            to "p", the value that was previously hard-coded.

    Returns:
        A new list of lists, all of equal length. An empty input gives ``[]``.
    """
    # Materialise once: the data is traversed twice (longest length, then padding),
    # which would silently yield nothing on the second pass of a generator.
    sequences = [list(text) for text in texts]
    maxlen = max(map(len, sequences), default=0)
    return [[pad_token] * (maxlen - len(seq)) + seq for seq in sequences]

In [79]:
# Bring every tokenized tweet to the same length.
padded_texts = pad(texts=tokenized_texts)

In [81]:
import gensim



In [92]:
%%time
# The corpus is passed to the constructor, so the vocabulary is built and the
# model trained as part of this call.
w2v_model = gensim.models.word2vec.Word2Vec(
    padded_texts,
    vector_size=100,
    window=10,
    min_count=1,
    sg=1,
    hs=1,
    workers=10,
)

CPU times: user 2min 21s, sys: 308 ms, total: 2min 22s
Wall time: 38.6 s


In [93]:
# build_vocab() is intentionally not called here: the Word2Vec constructor was
# given the corpus, so it has already built the vocabulary and trained the model.
# Calling build_vocab() again would re-scan the corpus and re-run weight
# initialisation on the trained model.
# NOTE(review): how much of the trained state is kept depends on the gensim version — confirm before reinstating the call.

In [94]:
# Report how many distinct tokens the Word2Vec vocabulary holds.
words = w2v_model.wv.key_to_index  # token -> index mapping
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 25185


In [22]:
%%time
# Needs `model`, `X_test` and `y_test` from the training cells; the NameError
# recorded below this cell shows they were not defined in that session.
# Labels and scores are flattened so each entry is a plain scalar rather than a
# length-1 array (the training section reshapes y_test to a column).
y_test_1d = np.ravel(y_test).tolist()
scores = model.predict(X_test, verbose=1, batch_size=8000)
y_pred_1d = [decode_sentiment(score, include_neutral=False) for score in np.ravel(scores)]

NameError: name 'y_test' is not defined