In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Download the spaCy pipeline that a later cell loads with spacy.load('en_core_web_lg').
! python -m spacy download en_core_web_lg

**Load Data**

In [3]:
import matplotlib.pyplot as plt 
import seaborn as sns 

import re 
import collections

import nltk 
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 

import spacy 
import emoji 

import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression , Lasso , Huber , RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import tensorflow as tf 

In [4]:
# Shared spaCy pipeline; clean_punc_url_stopw_single calls it on every tweet.
nlp = spacy.load('en_core_web_lg')

In [5]:
# Read the train and test CSVs in one pass; the order of the paths fixes
# which frame is which.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/nlp-getting-started/train.csv',
                     '../input/nlp-getting-started/test.csv')
)

In [6]:
# Preview the first rows of the raw training frame.
train_data.head()

Methods For Cleaning Tweets

For this notebook, I will follow the sequence of operations below for cleaning tweets and removing duplicates.

1. Clean URLs/hyperlinks from tweets and look for duplicates.
2. Separate tweets based on category and do some exploratory analysis.
3. Write methods for specific cases, e.g. removing URLs, removing emojis, etc.
4. Clean both the train and test tweets and look for duplicates in the training data again.

In [7]:
def remove_urls(text):
    """Strip http:// and https:// links from a tweet.

    Every span matched by the URL pattern is replaced with the empty
    string; text that contains no link is returned unchanged.
    """
    url_pattern = re.compile(
        r"(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*"
    )
    return url_pattern.sub("", text)

def clean_punc_url_stopw_single(text):
    """Lemmatize a tweet with spaCy while discarding noise tokens.

    The tweet is run through the module-level ``nlp`` pipeline. Tokens that
    look like a URL or an e-mail address, are punctuation, or are stop words
    are skipped; the lemmas of the remaining tokens are joined with single
    spaces and the result is stripped of surrounding whitespace.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.like_url or token.like_email or token.is_punct or token.is_stop)
    ]
    return " ".join(kept_lemmas).strip()

def clean_tags_mentions_single(txt):
    """Using Regex to remove hashtags/mentions

    Removes @mentions, #hashtags and digit runs, expands "'ve" to " have ",
    drops the HTML entity "&amp;" and collapses all whitespace (including
    line breaks) to single spaces.

    Parameters:
        txt: the tweet text.

    Returns:
        The cleaned tweet with words separated by exactly one space and no
        leading/trailing whitespace.
    """
    patt_mention = r"[@]\w+"
    patt_tags = r"[#]\w+"
    clean_str = re.sub(patt_mention, "", txt)
    clean_str = re.sub(patt_tags, "", clean_str)
    clean_str = re.sub(r'[0-9]+', '', clean_str)
    clean_str = re.sub("'ve", " have ", clean_str)
    clean_str = re.sub("&amp;", "", clean_str)
    # A line break separates words: turn it into a space instead of deleting
    # it, otherwise "fire\nnow" would be glued together as "firenow".
    clean_str = re.sub("\n", " ", clean_str)
    return " ".join(clean_str.split())



def remove_emojis_single(text):
    """Remove emojis from tweet.

    Parameters:
        text: the tweet text.

    Returns:
        The text with every emoji replaced by the empty string.
    """
    # emoji 2.0 removed get_emoji_regexp(); replace_emoji() is its supported
    # replacement. Fall back to the regexp only on older releases that lack it.
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(u'' , text)


def remove_accents(string):
    """Fold accented letters (á, é, í, ...) to their unaccented base letters.

    The text is NFKD-normalized and every combining character produced by
    the decomposition is dropped.
    """
    decomposed = unicodedata.normalize('NFKD' , string)
    base_chars = filter(lambda ch: not unicodedata.combining(ch), decomposed)
    return u"".join(base_chars)

def remove_single_chars(text):
    """Drop whitespace-separated words that are only one character long."""
    kept_words = []
    for word in text.split():
        if len(word) > 1:
            kept_words.append(word)
    return ' '.join(kept_words)

def remove_non_ascii(text):
    """Delete every run of characters outside the ASCII range 0x00-0x7F."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def remove_remaining_punct(text):
    """Replace leftover punctuation with spaces and collapse whitespace.

    Each run of the listed punctuation characters becomes one space; the
    result is then re-joined so words are separated by exactly one space.
    """
    spaced = re.sub(r"[!@#$%^&*()_\-=+}{\[\]|\\/<>,.?~`';]+", " ", text)
    return " ".join(spaced.split())

def clean_tweets_efficient(all_tweets):
    ''' Combine individual methods

    Use all the cleaning functions to remove emojis, punctuations
    hyperlinks, stopwords, tags, mentions and non-english words/characters

    Parameters:
        all_tweets: iterable of tweet strings.

    Returns:
        A list with one cleaned string per input tweet, in the same order.
    '''
    clean_tweets = list()
    for tweet in all_tweets:
        # Fold accents and strip emojis BEFORE deleting the remaining non-ASCII
        # characters. In the opposite order the accented letters and emojis are
        # already gone, so both steps do nothing and "café" becomes "caf"
        # instead of "cafe".
        non_accents = remove_accents(tweet)
        clean_level_1 = remove_non_ascii(remove_emojis_single(non_accents))
        clean_level_2 = clean_tags_mentions_single(clean_level_1)
        clean_level_3 = clean_punc_url_stopw_single(clean_level_2)
        clean_level_4 = remove_single_chars(clean_level_3)
        clean_tweets.append(clean_level_4)

    return clean_tweets


def get_wrd_count(text_lst):
    """Count word occurrences across a list of texts (used for EDA).

    Every text is split into ``\\w+`` tokens and all tokens are tallied in a
    single ``collections.Counter``, which is returned.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return collections.Counter(
        word
        for txt in text_lst
        for word in word_tokenizer.tokenize(txt)
    )

In [8]:
# URL-free copy of the training tweets, used only to detect duplicates.
clean_urls_train = train_data['text'].apply(remove_urls)

In [9]:
# Original rows whose URL-free text is flagged as a duplicate.
is_duplicate_after_url_removal = clean_urls_train.duplicated()
train_data_duplicates = train_data[is_duplicate_after_url_removal]

In [10]:
# Report how many training rows were flagged as duplicates once URLs are removed.
print("total duplicates found after removing urls : {}".format(train_data_duplicates.shape[0]))

In [11]:
# Strip URLs: the training text is overwritten in place, while the test frame
# keeps its raw text and gets the result in a new 'clean_text' column.
train_data['text'] = train_data['text'].apply(remove_urls)
test_data['clean_text'] = test_data['text'].apply(remove_urls)

In [12]:
# Drop training rows whose URL-free text duplicates another row.
train_data = train_data.drop_duplicates(subset=['text'])

In [13]:
# Separate the training tweets by target value; the test set has no target.
is_target_0 = train_data['target'] == 0
is_target_1 = train_data['target'] == 1
tweets_0 = train_data[is_target_0]
tweets_1 = train_data[is_target_1]
tweets_test = test_data['clean_text']

In [14]:
# Plain Python lists of tweet strings, the input type the cleaning helper iterates over.
tweets_0_text_lst, tweets_1_text_lst, tweets_test_text_lst = (
    series.tolist()
    for series in (tweets_0['text'], tweets_1['text'], tweets_test)
)

In [15]:
# Run the full cleaning pipeline over each group (target 0, target 1, test).
clean_tweets_0, clean_tweets_1, clean_test_tweets = (
    clean_tweets_efficient(tweet_batch)
    for tweet_batch in (tweets_0_text_lst, tweets_1_text_lst, tweets_test_text_lst)
)

In [16]:
# Combine the cleaned training tweets (target 0 first, then target 1) and
# build the label list in the same order.
all_cleaned_tweets = [*clean_tweets_0, *clean_tweets_1]
all_cleaned_tweets_with_test = [*clean_tweets_0, *clean_tweets_1, *clean_test_tweets]  # useful while modeling
all_labels = [0 for _ in clean_tweets_0] + [1 for _ in clean_tweets_1]

In [17]:
# Two-column frame pairing each cleaned training tweet with its label.
df_clean_tweets = pd.DataFrame(data={"text" :all_cleaned_tweets , "label":all_labels})

In [18]:
# Preview the cleaned training frame.
df_clean_tweets.head()

In [19]:
df_clean_tweets[df_clean_tweets.duplicated(subset='text')]
# Display the rows flagged as duplicates on the fully cleaned text.

In [20]:
# De-duplicate on the cleaned text.
# NOTE(review): rows with label 0 come first in df_clean_tweets, so a text that
# appears under both labels presumably survives with label 0 - confirm this is intended.
df_clean_tweets_dedep = df_clean_tweets.drop_duplicates(subset=['text'])

In [21]:
# Remaining training data: overall shape, then the shape of each label subset.
print(df_clean_tweets_dedep.shape)
for label_value in (0, 1):
    print(df_clean_tweets_dedep[df_clean_tweets_dedep['label']==label_value].shape)

**EDA**

In [22]:
# Bar chart of how many cleaned tweets carry each label.
target_ax = sns.countplot(data=df_clean_tweets_dedep, x='label')
target_ax.set_title("Target Distribution");

In [23]:
# Per-label views of the de-duplicated, cleaned tweets for the EDA below.
is_label_0 = df_clean_tweets_dedep['label'] == 0
is_label_1 = df_clean_tweets_dedep['label'] == 1
tweets_0_cleaned = df_clean_tweets_dedep[is_label_0]
tweets_1_cleaned = df_clean_tweets_dedep[is_label_1]

In [24]:
# Character length of every cleaned tweet, per label.
tweet_0_len = tweets_0_cleaned['text'].apply(len)
tweet_1_len = tweets_1_cleaned['text'].apply(len)

# One histogram per label, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
length_panels = (
    (tweet_0_len, "Tweet Length distribution for Non Disaster Tweets"),
    (tweet_1_len, "Tweet Length distribution for Disaster Tweets"),
)
for axis, (lengths, title) in zip(ax, length_panels):
    sns.histplot(lengths, ax=axis)
    axis.set_title(title)
plt.tight_layout();

In [25]:
#Common words for each category

tweet0_wrd = get_wrd_count(tweets_0_cleaned['text'].tolist())
tweet1_wrd = get_wrd_count(tweets_1_cleaned['text'].tolist())

tweet0_wrd_cnt_sorted = tweet0_wrd.most_common(n=10)
tweet1_wrd_cnt_sorted = tweet1_wrd.most_common(n=10)

# Split each (word, count) ranking into parallel label / bar-height lists.
l0 = [word for word, _ in tweet0_wrd_cnt_sorted]
h0 = [count for _, count in tweet0_wrd_cnt_sorted]
l1 = [word for word, _ in tweet1_wrd_cnt_sorted]
h1 = [count for _, count in tweet1_wrd_cnt_sorted]

# Same bar-chart recipe for both panels; only the data and the title differ.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
word_panels = (
    (l0, h0, "Most common words for non disaster tweets"),
    (l1, h1, "Most common words for disaster tweets"),
)
for axis, (labels, heights, title) in zip(ax, word_panels):
    positions = list(range(len(labels)))
    sns.barplot(x=positions, y=heights, ax=axis)
    axis.set_ylim(top=300)
    axis.set_xticks(ticks=positions)
    axis.set_xticklabels(labels)
    axis.set_xlabel('Words')
    axis.set_ylabel('Count')
    axis.set_title(title)

> **Modeling**

**Baseline with TF-IDF and Logistic Classifier**

In [26]:
# Modeling inputs taken from the de-duplicated frame: training texts, the same
# texts followed by the cleaned test tweets, and the training labels.
all_cleaned_text = df_clean_tweets_dedep['text'].tolist()
all_cleaned_text_with_test = [*df_clean_tweets_dedep['text'].tolist(), *clean_test_tweets]
all_targets = df_clean_tweets_dedep['label'].tolist()

In [27]:
# TF-IDF over 2- to 4-grams, with document-frequency bounds min_df=10 and max_df=1000.
# NOTE(review): the vectorizer is fit on all training texts before they are split
# into train/validation/test below, so held-out texts influence the fitted
# vocabulary - confirm this is acceptable for a baseline.
tfidf = TfidfVectorizer(ngram_range=(2,4) , max_df=1000 , min_df=10)
tfidf.fit(all_cleaned_text)

In [28]:
# Vectorize the training texts with the fitted TF-IDF model.
tfidf_feats = tfidf.transform(all_cleaned_text)

In [29]:
# 60% train, then the remaining 40% split evenly into validation and test.
# random_state pins the shuffling so the splits - and every score reported
# from them - are identical on each Restart & Run All.
trainx , testvalx , trainy , testvaly = train_test_split(tfidf_feats , all_targets , test_size=0.4 , random_state=42)
valx , testx , valy , testy = train_test_split(testvalx , testvaly , test_size=0.5 , random_state=42)

In [30]:
# Baseline classifier: logistic regression (C=10) fit on the TF-IDF training split.
lr = LogisticRegression(C=10)
lr.fit(trainx , trainy)

In [31]:
# Score the baseline on the validation and test splits.
print(f"Validation Set Score: {lr.score(valx, valy)}")
print(f"Test Set Score: {lr.score(testx, testy)}")

In [32]:
# Classification report for the baseline on the test split.
print(classification_report(testy, lr.predict(testx)))


**RNN With GloVe Embeddings**

In [33]:
# Download and extract GloVe embeddings.
# -nc (wget) and -n (unzip) skip work that is already done, so re-running the
# cell neither downloads the archive again nor stops at unzip's interactive
# overwrite prompt.

!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -qn glove.6B.zip

In [34]:
# load pretrained Glove embeddings: one "<word> <100 floats>" entry per line
dict_w2v = {}
# Decode explicitly as UTF-8 so reading the vocabulary does not depend on the
# platform's default encoding.
with open('./glove.6B.100d.txt' , "r" , encoding="utf-8") as file :
    for line in file :
        tokens = line.split()
        if not tokens:
            # Skip blank lines instead of failing on tokens[0].
            continue
        word = tokens[0]
        vector = np.array(tokens[1:] , dtype = np.float32)
        # Keep only entries that parsed into exactly 100 components.
        if vector.shape[0] == 100:
            dict_w2v[word] = vector
        else:
            print("there was an issue with " + word)
#let's check the vocab size 
print("Dictionary Size : " , len(dict_w2v))

In [35]:
# Keras tokenizer with character filtering disabled (filters="").
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")

# Fit the tokenizer on the training AND test texts so that test-set words
# also receive an index and can be used at prediction time.

tokenizer.fit_on_texts(all_cleaned_text_with_test)

In [36]:
# Number of rows in the embedding matrix: one more than the number of indexed words
# (presumably because index 0 is left for padding - TODO confirm).
NUM_WORDS = len(tokenizer.word_index) + 1
# Size of the softmax output layer in create_model.
NUM_CLS = 2
# Sequence length used by pad_sequences and by the model input.
MAX_LEN = 25
print("Total Words in tokenizer : {}".format(NUM_WORDS))

Load Embeddings

In [37]:
embedding_dim = 100
# Row i holds the GloVe vector of the word with tokenizer index i; rows of
# words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((NUM_WORDS , embedding_dim))

unk_cnt = 0
unk_set = set()
for word, tkn_id in tokenizer.word_index.items():
    embedding_vector = dict_w2v.get(word)
    if embedding_vector is None:
        # Word has no pretrained vector: remember it and leave its row at zero.
        unk_cnt += 1
        unk_set.add(word)
        continue
    embedding_matrix[tkn_id] = embedding_vector

# Print how many weren't found
print("Total unknown words: ", unk_cnt)

Convert strings to indices and pad so all sequences are of same length



In [38]:
# Map each training text to its list of token indices, then pad/truncate every
# list to MAX_LEN so the result is one rectangular array.
all_sequences = tokenizer.texts_to_sequences(all_cleaned_text)
all_padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(all_sequences, maxlen=MAX_LEN)

In [39]:
# 80% train, then the remaining 20% split evenly into validation and test.
# random_state pins the shuffling so the splits - and the metrics computed on
# them - are identical on each Restart & Run All.
train_x , valtest_x , train_y , valtest_y = train_test_split(all_padded_sequences , np.asarray(all_targets , dtype=np.float32) , test_size= 0.2 , random_state=42)
val_x , test_x , val_y , test_y = train_test_split(valtest_x , valtest_y , test_size=0.5 , random_state=42)

Trim each set so that its number of rows is a multiple of the batch size (32), because the model below is built with a fixed batch size.

In [40]:
print(train_x.shape , val_x.shape , test_x.shape , train_y.shape , val_y.shape , test_y.shape)


def _trim_to_full_batches(features, labels, batch=32):
    """Cut both arrays down to the largest multiple of ``batch`` rows."""
    kept_features = features[:(features.shape[0] // batch) * batch, :]
    kept_labels = labels[:(labels.shape[0] // batch) * batch]
    return kept_features, kept_labels


# The model is built with a fixed batch size of 32, so drop the leftover rows
# that would not fill a whole batch.
train_x , train_y = _trim_to_full_batches(train_x, train_y)
val_x , val_y = _trim_to_full_batches(val_x, val_y)
test_x , test_y = _trim_to_full_batches(test_x, test_y)


Custom layer used in our model: it averages the hidden states computed over the entire sequence.



In [41]:
class MergeHiddenStates(tf.keras.layers.Layer):
    """Average a sequence of hidden states into one vector per example.

    The layer has no weights; it returns the mean of its input over axis 1.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, dtype, trainable, ...) so
        # the layer can be rebuilt from its config when a saved model is
        # reloaded; calling MergeHiddenStates() with no arguments still works.
        super(MergeHiddenStates , self).__init__(**kwargs)

    def call(self , inputs):
        """Return the mean of ``inputs`` along axis 1."""
        # presumably inputs is (batch, timesteps, features): in create_model it
        # receives the return_sequences=True output of the bidirectional GRU.
        return tf.reduce_mean(inputs , axis = 1)
    

Our RNN Model:

* Pass the input sequence through two embedding layers: one frozen and one trainable. (In the code below, both are initialized from the GloVe embedding matrix.)
* We only train the second embedding and keep the first (GloVe) embedding constant.
* We concatenate the two embeddings and pass the result to an RNN (GRU/LSTM).
* We take the entire sequence output from the RNN (hidden states/outputs).
* We pass this entire sequence to our custom layer, which averages the hidden states.
* Then we simply pass the averaged vector to a Dense layer for prediction.


In [43]:
def create_model(input_shape = (MAX_LEN ,) , vocabsize = NUM_WORDS , emb_dim = 100 , rnn_units = 128 , batch_size = 32):
    """Build the (uncompiled) two-embedding bidirectional-GRU classifier.

    The integer token ids are looked up in two embedding layers, both
    initialized from the module-level ``embedding_matrix``: one frozen and one
    trainable. Their outputs are concatenated, passed through a bidirectional
    GRU that returns the whole sequence, averaged by ``MergeHiddenStates``,
    and fed through dropout, a 128-unit ReLU layer, dropout again and a
    ``NUM_CLS``-way softmax.

    Parameters:
        input_shape: shape of one input sequence (without the batch axis).
        vocabsize: number of rows of each embedding layer.
        emb_dim: width of each embedding layer.
        rnn_units: units of the GRU in each direction.
        batch_size: fixed batch size baked into the model input.

    Returns:
        A ``tf.keras.Model`` mapping token ids to class probabilities.
    """
    layers = tf.keras.layers

    def build_embedding(trainable):
        # Both embeddings share every setting except whether they are updated
        # during training.
        return layers.Embedding(vocabsize ,
                                emb_dim,
                                mask_zero=False,
                                batch_input_shape=(batch_size , input_shape[0]),
                                weights = [embedding_matrix] , trainable=trainable)

    token_ids = layers.Input(shape=input_shape , batch_size=batch_size , dtype=tf.int32)
    emb_fixed = build_embedding(trainable=False)
    emb_train = build_embedding(trainable=True)
    rnn_unit = layers.Bidirectional(layers.GRU(rnn_units , dropout = 0.2 , return_sequences=True))

    fixed_vectors = emb_fixed(token_ids)
    trained_vectors = emb_train(token_ids)
    merged_vectors = layers.Concatenate()([fixed_vectors , trained_vectors])
    whole_sequence_output = rnn_unit(merged_vectors)

    features = MergeHiddenStates()(whole_sequence_output)
    features = layers.Dropout(0.3)(features)
    features = layers.Dense(128 , activation='relu')(features)
    features = layers.Dropout(0.3)(features)
    preds = layers.Dense(NUM_CLS , activation='softmax')(features)

    return tf.keras.Model(inputs = token_ids , outputs = preds)

In [44]:
model = create_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [45]:
# The model ends in a softmax (probabilities, not logits) and the targets are
# integer class ids, hence sparse categorical cross-entropy with from_logits=False.
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(loss=loss_obj , optimizer = 'adam' , metrics=['accuracy'])

**Let's Train Our Model**

In [46]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
history = model.fit(train_x , train_y , batch_size=32 , epochs=10 , validation_data=(val_x , val_y) , shuffle=True)

In [47]:
# Call fit again on the same model object for 5 further epochs.
history_1 = model.fit(train_x , train_y , batch_size=32 , epochs=5 , validation_data=(val_x , val_y) , shuffle=True)

In [48]:
# Predicted class = index of the larger of the two softmax outputs per row.
test_preds = np.argmax(model.predict(test_x , batch_size=32),axis=1)

In [50]:
# Classification report for the RNN on the held-out test split.
class_report = classification_report(test_y , test_preds)
print(class_report)

**Make predictions for Test set**

We basically have to repeat the steps we took for the RNN model.

This time we will use the entire training set to train our model and use it for final predictions.

In [51]:
# Re-encode and pad the complete training text for the final model.
all_sequences_train = tokenizer.texts_to_sequences(all_cleaned_text)
all_padded_sequences_train = tf.keras.preprocessing.sequence.pad_sequences(all_sequences_train , maxlen=MAX_LEN)
# Labels as a NumPy array (this rebinds all_targets, which was a list before).
all_targets = np.asarray(df_clean_tweets_dedep['label'].tolist())

In [52]:
# Keep only as many rows as fill whole batches of 32 (the model's fixed batch size).
n_feature_rows = (all_padded_sequences_train.shape[0] // 32) * 32
n_label_rows = (all_targets.shape[0] // 32) * 32
train_x = all_padded_sequences_train[:n_feature_rows , :]
train_y = all_targets[:n_label_rows]

In [53]:
# Encode and pad the cleaned test tweets exactly like the training text.
all_sequences_test = tokenizer.texts_to_sequences(clean_test_tweets)
all_padded_sequences_test = tf.keras.preprocessing.sequence.pad_sequences(all_sequences_test , maxlen=MAX_LEN)

In [55]:
# The model has a fixed batch size of 32, so the number of test rows must be a
# multiple of 32. Repeat the last row as many times as needed to get there,
# instead of assuming that exactly one row is missing; when one row is missing
# the result is identical to appending the last row once.
n_pad_rows = (-all_padded_sequences_test.shape[0]) % 32
padding_rows = np.repeat(all_padded_sequences_test[-1:, :], n_pad_rows, axis=0)
all_padded_sequences_test_f = np.concatenate([all_padded_sequences_test , padding_rows],axis = 0)
print(all_padded_sequences_test_f.shape)

Create model for prediction and compile



In [59]:
# Fresh, untrained model with the same architecture, compiled the same way as
# the model evaluated above.
prediction_model = create_model()
test_loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
prediction_model.compile(loss=test_loss_obj, optimizer='adam', metrics=['accuracy'])

In [60]:
# Train on the full (batch-trimmed) training set; no validation data is passed here.
history = prediction_model.fit(train_x , train_y , batch_size=32 , epochs=10 , shuffle=True)

Make predictions



In [63]:
# Predict on the batch-padded test sequences, then keep one prediction per real
# test tweet. Slicing to the unpadded length drops however many padding rows
# were appended, rather than assuming it was exactly one.
test_predictions = prediction_model.predict(all_padded_sequences_test_f , batch_size=32)
test_predictions_classes = np.argmax(test_predictions, axis=-1)[:len(all_padded_sequences_test)]

Save to file for submission



In [65]:
# Pair each test id with its predicted class for the submission file.
test_ids = test_data['id']
pred_df = pd.DataFrame(data={"id":test_ids, "target":test_predictions_classes})

In [67]:
# Write the submission file without the DataFrame index column.
pred_df.to_csv("submission.csv", index=False)
