<a href="https://www.kaggle.com/code/barborakudlov/nlp-disastertweets-lstm-acc-0-98-val-acc-0-97?scriptVersionId=119887235" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import the Necessary Libraries

In [None]:
#For modelling NLP
import tensorflow as tf 
import tensorflow_hub as hub

In [None]:
#For cleaning
import spacy 
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import regex as re 
import string 

#For visualizing
import matplotlib.pyplot as plt 
import seaborn as sns 
# Do not truncate long cell contents, so whole tweets are visible when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

#For modelling NLP
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.models import Sequential

## Import the Data

In [None]:
# Read the competition's training tweets and test tweets.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Keep the test ids in a separate variable.
ids = test['id']

In [None]:
# Report the (rows, columns) shape of each raw frame.
print('shape of training set: ', train.shape)
print('shape of testing set: ', test.shape)

# Data Analysis - Missing values

In [None]:
# Stack the train and test frames so missing values can be inspected in one place.
df_concat = pd.concat([train, test], axis=0).reset_index(drop=True)

# Per-column number of nulls and the percentage of rows that are null.
null_counts = df_concat.isnull().sum()
null_percent = (null_counts / len(df_concat)) * 100

nulls = pd.DataFrame(
    np.column_stack([null_counts, null_percent]),
    columns=['# of nulls', '% of nulls'],
    index=df_concat.columns,
)

nulls

In [None]:
# Replace missing keywords/locations with explicit placeholder tokens in every frame.
# One frame-level `fillna` with a per-column mapping is used instead of
# `df[col].fillna(..., inplace=True)`: the latter is chained in-place assignment on a
# column selection, which pandas 2.x warns about and which does not update `df`
# when Copy-on-Write is enabled.
for df in [train, test, df_concat]:
    df.fillna({'keyword': 'no_keyword', 'location': 'no_location'}, inplace=True)

In [None]:
# Number of tweets (non-null `text` values) per location, most frequent first.
df_concat.groupby('location')['text'].count().sort_values(ascending=False)

# Visualizing the data keywords

- Keywords have a strong influence on whether a tweet reports a disaster or a non-disaster event.

In [None]:
# Split the training tweets by class and keep, for each class, the 20 keywords with the most tweets.
disaster_rows = train[train['target'] == 1]
non_disaster_rows = train[train['target'] == 0]

count_dis_keywords = (
    disaster_rows.groupby('keyword').count().sort_values(by='target', ascending=False).iloc[:20]
)
count_non_dis_keywords = (
    non_disaster_rows.groupby('keyword').count().sort_values(by='target', ascending=False).iloc[:20]
)

sns.set(style="white")

fig, axs = plt.subplots(1, 2, figsize=(25, 8))

# One horizontal bar chart per class: disasters on the left (red), non-disasters on the right (green).
panels = [
    (count_dis_keywords, axs[0], 'Reds_r', 'dis'),
    (count_non_dis_keywords, axs[1], 'Greens_d', 'non_dis'),
]

for counts, ax, palette, label in panels:
    sns.barplot(x=counts['target'],
                y=counts.index,
                ax=ax,
                palette=palette,
                label=label)
    ax.set_title('Number of tweets per keyword', fontsize=15)
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.set_yticklabels(labels=ax.get_yticklabels(), fontsize=15)

- The red plot on the left shows that many disaster tweets do not have any keyword (they appear under the `no_keyword` placeholder).
- The green plot on the right shows that the data is not clean yet (for example `body%20bags`), so the next step is to clean the text as well as we can.

In [None]:
# Drop the columns that are not used as model input (`location`, `keyword`, `id`).
# `errors='ignore'` keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of raising a KeyError.
for df in [train, test, df_concat]:
    df.drop(columns=['location', 'keyword', 'id'], inplace=True, errors='ignore')

# NLP Pipeline- Normalization(cleaning)

In [None]:
# Cleaning can use two libraries, NLTK and spaCy; in the end the one that gives the best result is kept.
nlp = spacy.load('en_core_web_sm')
# `sp` is the name the cleaning function uses for lemmatization. It points at the same
# pipeline as `nlp`: loading the identical model a second time only costs extra time and memory.
sp = nlp

# Resources needed by NLTK's stopword list and by `word_tokenize`.
nltk.download('stopwords')
nltk.download('punkt')

# Stopword collections of both libraries.
spacy_st = nlp.Defaults.stop_words
nltk_st = stopwords.words('english')

In [None]:
def clean1(tweet, http=True, punc=True, lem=True, stop_w=True):
    """Normalize one tweet and return the cleaned string.

    The enabled steps run in this order: link removal, stopword removal,
    lemmatization, punctuation removal, whitespace collapsing.

    Args:
        tweet: Raw tweet text.
        http: When truthy, delete `http(s)://t.co/...` short links.
        punc: When truthy, delete every character in `string.punctuation`.
        lem: When truthy, replace each token with its lemma using the
            module-level spaCy pipeline `sp`.
        stop_w: `'nltk'` or `'spacy'` selects the stopword collection
            (`nltk_st` / `spacy_st`) used to filter the NLTK-tokenized tweet.
            Any other value, including the default `True`, skips stopword
            removal.

    Returns:
        The cleaned tweet. Runs of whitespace are collapsed to a single
        space; leading/trailing whitespace is not stripped.
    """
    # Remove t.co short links. The dot is escaped so only the literal host "t.co" matches.
    if http:
        tweet = re.sub(r'https?://t\.co/[A-Za-z0-9]*', '', tweet)

    # Choose at most one stopword collection. spaCy's list is larger than NLTK's,
    # so it removes more of the text.
    if stop_w == 'nltk':
        stop_words = nltk_st
    elif stop_w == 'spacy':
        stop_words = spacy_st
    else:
        stop_words = None

    if stop_words is not None:
        tweet = ' '.join(
            word for word in word_tokenize(tweet) if word.lower() not in stop_words
        )

    # Lemmatization with spaCy.
    if lem:
        tweet = ' '.join(token.lemma_ for token in sp(tweet))

    # Punctuation removal: the characters are deleted, not replaced by a space.
    if punc:
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Collapse the extra whitespace left behind by the steps above.
    tweet = re.sub(r'\s+', ' ', tweet)

    return tweet

def clean2(text):
    """Lower-case a text, expand common English contractions and strip non-word characters.

    Args:
        text: Input string.

    Returns:
        The lower-cased text with contractions expanded, every non-word
        character replaced by a space, whitespace runs collapsed to one space
        and leading/trailing spaces removed.
    """
    # Order matters: a specific rule must run before a more general rule that
    # would consume part of its match ("'scuse" before "'s", "won't"/"can't"
    # before "n't").
    contractions = (
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"won't", "will not "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )

    text = text.lower()
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Replace every remaining non-word character with a space, then tidy up the whitespace.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [None]:
# Clean every tweet in two passes: clean1 removes t.co links and NLTK stopwords,
# then clean2 lower-cases the text, expands contractions and replaces every
# non-word character with a space.
# clean1 is called with punc=False on purpose: removing punctuation there deletes the
# apostrophes before clean2 runs, so none of clean2's contraction rules
# (e.g. "n't" -> " not ") could ever match. clean2's `\W` substitution removes the
# remaining punctuation afterwards.
df_concat['cleaned_text'] = (
    df_concat['text']
    .apply(clean1, lem=False, stop_w='nltk', http=True, punc=False)
    .apply(clean2)
)

# df_concat['cleaned_text'] = [nlp(text) for text in df_concat['cleaned_text']]

In [None]:
# Split the cleaned frame back into its train and test parts. `df_concat` was built as
# train rows followed by test rows, so the first `train.shape[0]` rows are the training tweets.
# `drop` returns new frames here; calling `drop(..., inplace=True)` on slices of `df_concat`
# raises SettingWithCopyWarning because the slices may be views of the parent frame.
n_train_rows = train.shape[0]

cleaned_train = df_concat.iloc[:n_train_rows].drop(columns=['text'])

cleaned_test = df_concat.iloc[n_train_rows:].drop(columns=['text'])

In [None]:
# Display the cleaned training frame.
cleaned_train

In [None]:
# Cleaned text of all rows (train + test), used to fit the vectorizer.
X_total = df_concat['cleaned_text']

# Training features and labels.
X_train, y_train = cleaned_train['cleaned_text'], cleaned_train['target']

# Test features.
X_test = cleaned_test['cleaned_text']

In [None]:
# TODO: ask Standa (open question left by the author).
# Binary bag-of-n-grams (1- to 3-grams) with scikit-learn's English stopword list.
vectorizers = CountVectorizer(binary=True,
                              ngram_range=(1, 3),
                              stop_words='english')

X_total_vectors = vectorizers.fit_transform(X_total)

print(vectorizers.get_feature_names_out())

# Print only the sparse matrix summary (shape, dtype, number of stored values).
# `toarray()` would allocate a dense rows x n-grams array just for printing, which can
# exhaust memory for a 1-3-gram vocabulary.
print(repr(X_total_vectors))

# Word Embeddings

## Support Vector Machine (SVM)

In [None]:
# TODO: ask Standa (open question left by the author).
from sklearn import svm  # Support Vector Machine (SVM) classifier used to classify the text data

# Encode the training tweets with the already fitted n-gram vectorizer.
X_train_vectors = vectorizers.transform(X_train)

# Linear-kernel SVM trained on the n-gram vectors.
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(X_train_vectors, y_train)

In [None]:
# Encode the test tweets with the same fitted vectorizer.
X_test_vectors = vectorizers.transform(X_test)

print(vectorizers.get_feature_names_out())

# Print only the sparse matrix summary; `toarray()` would build a dense
# rows x n-grams array just for printing, which can exhaust memory.
print(repr(X_test_vectors))

In [None]:
# sub_sample = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# sub_sample

In [None]:
# sub_sample['target'] = clf_svm.predict(X_test_vectors)

# sub_sample['target'] = sub_sample['target'].astype('int')

# sub_sample.to_csv('submission.csv', index=False)

In [None]:
# submission = pd.read_csv('/kaggle/working/submission.csv')

# submission

## Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import TextVectorization 

In [None]:
# Cleaned text of all rows (train + test), used to build the vocabulary.
X_total = df_concat['cleaned_text']

# Training text and labels; the labels are taken as a NumPy array.
X_train, y_train = cleaned_train['cleaned_text'], cleaned_train['target'].values

# Test text.
X_test = cleaned_test['cleaned_text']

In [None]:
# Number of rows in the combined, training and test text series.
len(X_total), len(X_train), len(X_test)

In [None]:
# Text-to-integer vectorizer that feeds the embedding layer.
# TODO: ask Standa (open question left by the author).
MAX_FEATURES = 20000  # upper bound on the vocabulary size

vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=200,
)

In [None]:
# Learn the vocabulary from all cleaned tweets (train and test rows together).
vectorizer.adapt(X_total.values)

# Display the learned vocabulary.
vectorizer.get_vocabulary()

In [None]:
# Turn the training tweets into integer token sequences.
vectorizerd_text = vectorizer(X_train.values)

# Input pipeline: cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorizerd_text, y_train))
dataset = dataset.cache()
# Shuffle once, in a fixed order. By default `shuffle` draws a new order every time the
# dataset is iterated, so splits cut from it with take()/skip() would contain different
# samples on every epoch and validation/test samples would leak into training.
# `reshuffle_each_iteration=False` keeps the order identical across iterations and the
# seed makes it reproducible between runs.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(32)
dataset = dataset.prefetch(8)

In [None]:
# Pull a single batch out of the pipeline to inspect the feature and label shapes.
batch_iterator = dataset.as_numpy_iterator()
batch_X, batch_y = next(batch_iterator)

batch_X.shape, batch_y.shape

In [None]:
# Split the batched dataset into roughly 70% train / 20% validation / remainder test.
# The split sizes are computed once and the test split starts exactly where validation
# ends. Truncating each fraction separately (e.g. skipping int(n * .9) batches) can leave
# batches that belong to no split.
# NOTE: `train` and `test` are rebound here from the DataFrames loaded at the top of the
# notebook to tf.data datasets; earlier cells that use the DataFrames cannot be re-run
# after this cell without reloading the data.
# NOTE(review): the three splits are only disjoint if `dataset` yields its elements in
# the same order on every iteration.
n_batches = len(dataset)
n_train_batches = int(n_batches * .7)
n_val_batches = int(n_batches * .2)

train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches).take(n_val_batches)
test = dataset.skip(n_train_batches + n_val_batches)

In [None]:
# Number of batches in each split.
len(train), len(val), len(test)

In [None]:
# Binary classifier: token embedding -> bidirectional LSTM -> stack of dense layers
# -> a single sigmoid output unit.
model = Sequential([
    Embedding(MAX_FEATURES + 1, 64),
    Bidirectional(LSTM(64, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='Adam', loss='BinaryCrossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train for 30 epochs, validating on `val` after each epoch.
# `train` is already a batched tf.data.Dataset, so `batch_size` is not passed: Keras
# documents that it must not be specified for dataset inputs, because the dataset
# itself produces the batches.
hist = model.fit(train, epochs=30, validation_data=val)

In [None]:
# Plot the per-epoch training/validation metrics recorded by `fit`.
# The figure size is passed to `DataFrame.plot` directly: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` only produces an extra empty figure and has no
# effect on the plot.
history_ax = pd.DataFrame(hist.history).plot(figsize=(8, 5))
history_ax.set_xlabel('epoch')
history_ax.set_title('Training history')

plt.show()

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics accumulated over the held-out `test` batches.
pre = Precision()
# Named `rec`, not `re`: binding `re` here would shadow the regex module that the text
# cleaning functions call, so they would fail if re-run after this cell.
rec = Recall()
# BinaryAccuracy is the metric for 0/1 labels. CategoricalAccuracy compares argmax
# positions of one-hot vectors and does not measure per-sample agreement on binary labels.
acc = tf.keras.metrics.BinaryAccuracy()


def output(num):
    """Map a predicted probability to a hard label: 0 when `num <= 0.5`, otherwise 1."""
    if num <= 0.5:
        return 0
    else:
        return 1


for batch in test.as_numpy_iterator():

    X_true, y_true = batch

    yhat = model.predict(X_true, verbose=0)

    # Same threshold rule as `output`, applied to the whole batch at once.
    y_batchout = np.where(yhat <= 0.5, 0, 1).reshape(-1, 1)
    # Column vector, so labels and predictions have matching shapes.
    y_true = np.asarray(y_true).reshape(-1, 1)

    pre.update_state(y_true, y_batchout)
    rec.update_state(y_true, y_batchout)
    acc.update_state(y_true, y_batchout)

print(f'Precision: {pre.result().numpy()}, Recall: {rec.result().numpy()}, Accuracy: {acc.result().numpy()}')

In [None]:
# Convert the cleaned test tweets to integer token sequences with the same vectorizer.
vectorizerd_test_text = vectorizer(X_test.values)

# testset = tf.data.Dataset.from_tensor_slices((vectorizerd_test_text))
# testset = dataset.cache()
# testset = dataset.shuffle(160000)
# testset = dataset.batch(8) 
# testset = dataset.prefetch(8)

In [None]:
# Predict all test tweets in one batched call instead of one `model.predict` call per
# tweet, which repeats the per-call overhead for each row.
batch_preds = model.predict(vectorizerd_test_text, batch_size=32, verbose=0)

# Keep the structure the per-tweet loop produced: a list with one (1, 1) array per tweet.
preds = [row[np.newaxis, :] for row in batch_preds]

In [None]:
# preds

In [None]:
# Turn every predicted probability into a hard 0/1 label with `output`.
final_preds = [output(pred) for pred in preds]

len(final_preds)

In [None]:
# final_preds

In [None]:
# Load the competition's sample submission file; its `target` column is overwritten below.
sub_sample = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

sub_sample

In [None]:
# Store the predicted labels as integers in the `target` column.
sub_sample['target'] = final_preds
sub_sample = sub_sample.astype({'target': 'int'})

# Write the submission file without the index column.
sub_sample.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back and display it as a check.
submission = pd.read_csv('/kaggle/working/submission.csv')

submission

# End