# Kaggle-project : Natural Language Processing with Disaster Tweets

### 1. Importing 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
#import torch
from transformers import TFBertForSequenceClassification
#from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint,
# with lower-casing of the input text enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

### 2. Loading the data 

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_csv = "/kaggle/input/natural-language-processing-with-disaster-tweets/kaggle nlp/train.csv"
test_csv = "/kaggle/input/natural-language-processing-with-disaster-tweets/kaggle nlp/test.csv"
submission_csv = "/kaggle/input/natural-language-processing-with-disaster-tweets/kaggle nlp/sample_submission.csv"

# Read the three files into DataFrames and report the size of the training set.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
submission = pd.read_csv(submission_csv)
print("Train shape : ", train.shape)

In [None]:
# Print the test-set shape first: a notebook cell only renders the value of its
# last expression, so train.head() has to come last to be displayed.
print(test.shape)
train.head()

### 3. Data Cleaning

In [None]:
# Report, for every column of the training data, how many values are missing.
null_counts = train.isnull().sum()
for col, is_null in null_counts.items():
    print(col + ":" + str(is_null))

In [None]:
import html
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Rows that repeat an already-seen tweet text are dropped; the first occurrence stays.
new_train = train.drop_duplicates(subset='text', keep='first')
new_train.head()

In [None]:
!pip install demoji

In [None]:
import demoji
import emoji

# Compare the de-duplicated frame with the raw training data.
print(new_train.shape)
print(train.shape)

# Matches "http"/"https" followed by any run of non-whitespace characters.
LINK_PATTERN = r'https*\S+'

# Replace every link with a single space and store the result back in the
# frame. Substituting on a loop variable alone would leave the DataFrame
# untouched, so the cleaned column is assigned explicitly.
new_train = new_train.assign(
    text=new_train['text'].str.replace(LINK_PATTERN, ' ', regex=True)
)

# Clean the test tweets the same way so that prediction sees text that was
# preprocessed exactly like the training/validation text.
test = test.assign(
    text=test['text'].str.replace(LINK_PATTERN, ' ', regex=True)
)

# Other per-row cleaning variants that were tried and are kept disabled:
#   row = re.sub(r"(@[A-Za-z0–9_]+)|[^\w\s]|#|http\S+", "", row) # removing links and hash characters
#   row = emoji.demojize(row, delimiters=("", ""))


### 4. Splitting the data

In [None]:
# Tweets and targets are taken from the cleaned frame `new_train`
# (swap in `train` on both lines to use the raw data instead).
train_sentences = new_train['text'].values
labels = new_train['target'].values
len(labels)

In [None]:
# Hold out 10% of the tweets for validation; random_state pins the shuffle.
train_input, val_input, train_label, val_label = train_test_split(
    train_sentences,
    labels,
    test_size=0.1,
    random_state=38,
)

### 5. BERT Tokens, paddings and masks

In [None]:
def do_encode(sentences, maximum):
    """Tokenize sentences for BERT and return padded ids and attention masks.

    For every sentence this:
      1. splits the sentence into tokens,
      2. adds the '[CLS]' and '[SEP]' special tokens,
      3. maps the tokens to ids,
      4. pads (or truncates) the ids to exactly `maximum` entries,
      5. creates the matching attention mask.

    Args:
        sentences: Iterable of strings to encode.
        maximum: Number of ids in every encoded row, special tokens included.

    Returns:
        A tuple `(ids, attention_masks)` of tensors with one row per sentence.
    """
    ids = []
    attention_masks = []
    for sentence in sentences:
        encoded = tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=maximum,
            # 'max_length' padding is the current spelling of the deprecated
            # pad_to_max_length=True.
            padding='max_length',
            # Without truncation a sentence longer than `maximum` tokens keeps
            # its full length, the rows become ragged and the tensor
            # conversion below fails.
            truncation=True,
            return_attention_mask=True,
        )
        ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    ids = tf.convert_to_tensor(ids)
    attention_masks = tf.convert_to_tensor(attention_masks)
    return ids, attention_masks
    

In [None]:
# Length of the longest tweet in each split, measured with len() on the text
# (i.e. in characters); the value is passed to do_encode as the padding length.
train_max = max(map(len, train_input))
val_max = max(map(len, val_input))
print(train_max)
print(val_max)

In [None]:
# Turn the training tweets into token ids and attention masks, then show both shapes.
train_ids, train_masks = do_encode(train_input, train_max)
for encoded_part in (train_ids, train_masks):
    print(encoded_part.shape)

In [None]:
# Turn the validation tweets into token ids and attention masks, then show both shapes.
val_ids, val_masks = do_encode(val_input, val_max)
for encoded_part in (val_ids, val_masks):
    print(encoded_part.shape)

### 6. Training the model

In [None]:
# Wrap both label arrays in TensorFlow tensors.
train_label, val_label = (
    tf.convert_to_tensor(train_label),
    tf.convert_to_tensor(val_label),
)

In [None]:
# Pretrained 'bert-base-uncased' sequence-classification model, configured for two labels.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
#Code for setup from here: https://medium.com/@yashvardhanvs/classification-using-pre-trained-bert-model-transfer-learning-2d50f404ed4c
output_dir = "./result"
model_save = "./model/"

# Optional callbacks:
#  - ModelCheckpoint writes the weights to `model_save` whenever val_loss
#    reaches a new minimum (weights only, best epoch only).
#  - TensorBoard writes training logs to `output_dir`.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_save,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=output_dir)
callbacks = [checkpoint_cb, tensorboard_cb]

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
bert_model.summary()

# The model outputs raw logits and the labels are integer class ids.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

#compiling the model - configures the model for training
bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])


In [None]:
# Train for a fixed number of passes over the data (4 epochs) in batches of 32,
# evaluating on the validation split after each epoch.
history = bert_model.fit(
    [train_ids, train_masks],
    train_label,
    batch_size=32,
    epochs=4,
    validation_data=([val_ids, val_masks], val_label),
    callbacks=callbacks,
)

### 7. Results

In [None]:
from sklearn.metrics import classification_report

# Predict on the validation split and take, per row, the index of the largest
# logit as the predicted class before printing the per-class report.
val_prediction = bert_model.predict([val_ids, val_masks])
val_logits = val_prediction.logits
val_pred = np.argmax(val_logits, axis=1)
print(classification_report(val_label, val_pred))

In [None]:
# Show which metrics were recorded, then the per-epoch validation loss and accuracy.
print(history.history.keys())
for recorded in ('val_loss', 'val_accuracy'):
    print(history.history[recorded])

In [None]:
# Encode the test tweets, padding to the length of the longest test tweet.
test_sentences = test['text'].values
test_max = max(map(len, test_sentences))
test_ids, test_masks = do_encode(test_sentences, test_max)

In [None]:
# Run the model on the encoded test tweets; the predicted label of each row is
# the index of its largest logit (an integer class id, not a probability).
prediction = bert_model.predict([test_ids, test_masks])
test_logits = prediction.logits
predicted_labels = np.argmax(test_logits, axis=1)

In [None]:
# Training vs. validation accuracy per epoch, saved to accuracy.pdf and shown.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('Accuracy with only removing links')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.savefig('accuracy.pdf')
plt.show()

In [None]:
# Training vs. validation loss per epoch, saved to loss.pdf and shown.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Loss with only removing links')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.savefig('loss.pdf')
plt.show()

In [None]:
# Show the raw object returned by bert_model.predict for the test set.
print(prediction)

In [None]:
# Pair the ids from the sample submission with the predicted labels,
# write them to the submission CSV (without the index) and preview the result.
submit = pd.DataFrame(
    {
        'id': submission['id'],
        'target': predicted_labels,
    }
)
submit.to_csv('submission_with_removing_links.csv', index=False)
submit.head()

### References:

* https://medium.com/@yashvardhanvs/classification-using-pre-trained-bert-model-transfer-learning-2d50f404ed4c 
* https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=6J-FYdx6nFE
* https://www.kaggle.com/code/gazu468/all-about-bert-you-need-to-know/notebook
* https://www.youtube.com/watch?v=zJW57aCBCTk
* https://www.youtube.com/watch?v=x66kkDnbzi4
* https://www.youtube.com/watch?v=Hnvb9b7a_Ps