## 28-May 2022

In [1]:
import pandas as pd
import numpy as np
import regex as re
import string
import alphabet_detector
import tensorflow as tf
import sklearn
import nltk.corpus
import seaborn as sn
import matplotlib.pylab as plt
import cleantext

from cleantext import clean
from tensorflow import keras
from nltk.corpus import stopwords
from alphabet_detector import AlphabetDetector
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [30]:
print(tf.__version__)
tf.get_logger().setLevel('ERROR')

2.6.0


In [3]:
Categories = ['Disaster', 'NoDisaster']

In [4]:
pd.set_option('display.max_colwidth', None) ## Helps in reading the entire text
df = pd.read_csv('train.csv')
df.drop(columns=['id', 'keyword', 'location'], inplace=True)

In [5]:
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,"13,000 people receive #wildfires evacuation orders in California",1
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [6]:
df.isnull().sum()
# No NaN values in our text and target columns

text      0
target    0
dtype: int64

### Cleaning Functions Defenition 

In [7]:
words = set(nltk.corpus.words.words())
stop = stopwords.words('english')
def remove_symbols(text):
    pattern = r'[' + string.punctuation + ']'
    return re.sub(pattern, '', text)

def lower_case(text):
    return text.lower()

def remove_extra_spaces(text):
    text = re.sub(' +', ' ', text)
    if text[-1] == ' ':
        text = text[:-1]
    if text[0] == ' ':
        text = text[1:]
    return text

def remove_numbers(text):
    return re.sub('[0-9]', '',text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def remove_non_ASCII(text):
    return re.sub(r'[^\x00-\x7f]',r' ',text)

def clean_text(text):
    #text = re.sub('\\n', '', text)
    text = remove_numbers(text)
    text = remove_symbols(text)
    text = lower_case(text)
    text = remove_non_ASCII(text)
    text = remove_extra_spaces(text)
    text = remove_links(text)
    return text

In [8]:
# Apply the cleaning function defined above to our text
df.text=df.text.apply(lambda x:clean_text(x))

# Removing stopwords
df.text = df.text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Get rid of text with zero words
df = df[df['text'].str.split().str.len().ge(1)]

In [9]:
df.head()

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officers evacuation shelter place orders expected,1
3,people receive wildfires evacuation orders california,1
4,got sent photo ruby alaska smoke wildfires pours school,1


In [10]:
## Information about the dataset
df_length_train = pd.DataFrame(columns=['Train Sentences Length'])
length_list_train = []
for i in range (len(df.text)):
    length_list_train.append((len(df['text'][i].split()))) 
df_length_train['Train Sentences Length'] = length_list_train
df_length_train.describe(percentiles=[.25, .5, .75, .90, .95, .99])

Unnamed: 0,Train Sentences Length
count,7613.0
mean,9.114804
std,3.502564
min,1.0
25%,6.0
50%,9.0
75%,12.0
90%,14.0
95%,15.0
99%,17.0


 Since maximum is 23, we will set our sequence length below to be 23

In [11]:
## Shuffle dataset & reset index
df=df.sample(frac=1)
df.reset_index(inplace=True, drop=True)

In [12]:
# Taking 20% of the train data for validation
train = df
n = int(len(train)*0.20)
val = train[:n]
train = train[n:]

In [13]:
print(train.shape)
print(val.shape)

(6091, 2)
(1522, 2)


### Loading Distilbert tokenizer

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [15]:
# Taking 15% 
n = int(len(train)*0.15)
val = train[:n]
train = train[n:]

In [16]:
print(train.shape)
print(val.shape)

(5178, 2)
(913, 2)


In [17]:
SEQ_LEN = 23  # we will cut/pad our sequences to a length of 23 tokens
BATCH_SIZE = 16  # we will use batches of 16

## Train Data

In [18]:
def tokenize(sentence):
    tokens = tokenizer.encode_plus(sentence, max_length=SEQ_LEN,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_attention_mask=True,
                                   return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

# initialize two arrays for input tensors
Xids = np.zeros((len(train), SEQ_LEN))
Xmask = np.zeros((len(train), SEQ_LEN))

for i, sentence in enumerate(train['text']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)

In [19]:
arr = train['target'].values  # take sentiment column in df as array
labels = np.zeros((arr.size, arr.max()+1))  # initialize empty (all zero) label array
labels[np.arange(arr.size), arr] = 1  # add ones in indices where we have a value

In [20]:
# load arrays into tensorflow dataset
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# create a mapping function that we use to restructure our dataset
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

# using map method to apply map_func to dataset
dataset = dataset.map(map_func)

# shuffle data and batch it
dataset = dataset.shuffle(100000).batch(BATCH_SIZE)

In [21]:
train = dataset
del dataset ## To preserve memory

## Validation Data

In [22]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

def tokenize(sentence):
    tokens = tokenizer.encode_plus(sentence, max_length=SEQ_LEN,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_attention_mask=True,
                                   return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

# initialize two arrays for input tensors
Xids = np.zeros((len(val), SEQ_LEN))
Xmask = np.zeros((len(val), SEQ_LEN))

for i, sentence in enumerate(val['text']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)

In [23]:
arr = val['target'].values  # take sentiment column in df as array
labels = np.zeros((arr.size, arr.max()+1))  # initialize empty (all zero) label array
labels[np.arange(arr.size), arr] = 1  # add ones in indices where we have a value

In [24]:
# load arrays into tensorflow dataset
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# create a mapping function that we use to restructure our dataset
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

# using map method to apply map_func to dataset
dataset = dataset.map(map_func)

# shuffle data and batch it
dataset = dataset.shuffle(100000).batch(BATCH_SIZE)

In [25]:
val = dataset
del dataset 

## Model Creation

In [None]:
from transformers import TFAutoModel
bert = TFAutoModel.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

embeddings = bert(input_ids, attention_mask=mask)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, dropout=0.4, recurrent_dropout=0.4, activation='relu'))(embeddings)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.5)(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.3)(X)
y = tf.keras.layers.Dense(2, activation='sigmoid', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [36]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 23)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 23)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_2 (TFDisti TFBaseModelOutput(la 66362880    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 32)           100480      tf_distil_bert_model_2[0][0

In [28]:
optimizer = tf.keras.optimizers.Adam(1e-5)
loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
# acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

## Start Training

In [29]:
model.fit(train, validation_data=val, epochs=1)



<keras.callbacks.History at 0x1ebb66a6910>

In [31]:
model.fit(train, validation_data=val, epochs=1)



<keras.callbacks.History at 0x1ebb797e5e0>

In [None]:
model.fit(train, validation_data=val, epochs=1)



<keras.callbacks.History at 0x1ed282ce640>

In [43]:
def Predict_Disaster(text):
    text = tokenize(text)
    prediction = model.predict(text)
    return np.argmax(prediction)

## Test Set

In [70]:
test = pd.read_csv('test.csv')
test.drop(columns=['keyword', 'location'], inplace=True)

In [71]:
test.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, stay safe everyone."
2,3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [72]:
# Apply the cleaning function defined above to our text
test.text=test.text.apply(lambda x:clean_text(x))

# Removing stopwords
test.text = test.text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [73]:
submission = pd.DataFrame(columns=['id', 'target'])

In [74]:
submission.id = test.id

In [75]:
len(submission)

3263

In [None]:
for index, text in enumerate(test.text):
    submission.target[index] = Predict_Disaster(text)

In [77]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [78]:
submission.to_csv('DistBertWithCleanedDS.csv', index=False)