# Import Libraries

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import string, nltk
import re
from nltk.tokenize import word_tokenize
# Download the NLTK resources used by the cleaning helpers below.
# `punct` only holds the return value of the download call.
punct = nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# English stop words as a set, for fast membership tests in remove_stop_words().
stop_words = set(stopwords.words("english"))
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Shared lemmatizer / stemmer instances used by lemmatize_data().
lemma=WordNetLemmatizer()
stemm = PorterStemmer()
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance used by wordTokenize().
tokenize = WordPunctTokenizer()

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Data Reading

In [None]:
# Read Dataset
# NOTE(review): absolute path; presumably a Colab runtime where the CSV was uploaded to /content — confirm.
read_data = pd.read_csv("/content/train.csv")
# Print (rows, columns) as a quick sanity check on the load.
print(read_data.shape)

(7613, 5)


In [None]:
# Discard the columns that are not fed to the model; the remaining columns are kept as-is.
data = read_data.drop(columns=['id', 'keyword', 'location'])
data.shape

(7613, 2)

In [None]:
# Show the per-column missing-value counts. A bare expression is only rendered
# when it is the last line of a cell, so print it explicitly.
print(data.isnull().sum())
# Remove rows that are exact duplicates across all remaining columns.
data = data.drop_duplicates()

In [None]:
# Model input: the tweet text as a Series.
train_data = data['text']
# Labels: everything except the text column, kept as a DataFrame.
label_data = data.drop(columns=['text'])
print(train_data.shape)
print(label_data.shape)

(7521,)
(7521, 1)


# Clean Data

In [None]:
def lower(text):
  """Return ``text`` lower-cased as a ``str``.

  The value is converted with ``str()`` before lower-casing, so non-string
  input (for example a float NaN coming from an empty CSV cell) no longer
  raises ``AttributeError``.
  """
  return str(text).lower()

def remove_numbers(text):
  """Return ``text`` with every run of digits removed."""
  # Raw string: '\d' in a normal string literal is an invalid escape sequence
  # and triggers a warning on current Python versions.
  return re.sub(r'\d+', '', text)

def remove_html_tags(text):
  """Return ``text`` without square-bracketed spans and without HTML tags.

  The original pattern only removed ``[...]`` spans, so markup such as
  ``<b>`` was left for the punctuation step, which kept the tag name as a
  word. Both forms are stripped here.
  """
  # Non-greedy so that "[a] x [b]" loses both spans but keeps " x ".
  text = re.sub(r'\[.*?\]', '', text)
  # A tag must start with a letter (optionally after "/"), so "<3" or "a < b" survive.
  return re.sub(r'</?[A-Za-z][^<>]*>', '', text)

def remove_url(text):
  """Return ``text`` with http(s):// links and www.-prefixed links removed."""
  # Raw string: '\S' and '\.' in a normal string literal are invalid escape
  # sequences and trigger a warning on current Python versions.
  return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_punctuations(text):
  """Return ``text`` with every character of ``string.punctuation`` deleted."""
  # Character class built from the escaped punctuation set.
  punctuation_class = re.compile('[%s]' % re.escape(string.punctuation))
  return punctuation_class.sub('', text)

def remove_stop_words(text):
  """Join the items of ``text`` with single spaces, leaving out stop words.

  ``text`` is iterated item by item, so it is expected to be a sequence of
  tokens. Each token is lower-cased only for the membership test against
  the module-level ``stop_words`` set; kept tokens are emitted unchanged.
  """
  kept = []
  for token in text:
    if token.lower() in stop_words:
      continue
    kept.append(token)
  return ' '.join(kept)

def lemmatize_data(text):
  """Stem, then lemmatize, every space-separated token of ``text``.

  Tokens are split on single spaces, so consecutive spaces produce empty
  tokens that are passed through both steps like any other token.
  """
  # Pass 1: stem each token with the shared stemmer instance.
  stemmed = ' '.join(map(stemm.stem, text.split(' ')))
  # Pass 2: lemmatize each stemmed token with the shared lemmatizer instance.
  return ' '.join(map(lemma.lemmatize, stemmed.split(' ')))

def wordTokenize(text):
  """Return the tokens produced by the module-level ``tokenize`` object for ``text``."""
  return tokenize.tokenize(text)

In [None]:
def preprocess(text):
  """Run ``text`` through the full cleaning pipeline and return the result.

  The steps run in a fixed order. Tokenization turns the string into a token
  sequence, ``remove_stop_words`` joins it back into a string, and
  ``lemmatize_data`` works on that string.
  """
  pipeline = (
    lower,
    remove_numbers,
    remove_html_tags,
    remove_url,
    remove_punctuations,
    wordTokenize,
    remove_stop_words,
    lemmatize_data,
  )
  cleaned = text
  for step in pipeline:
    cleaned = step(cleaned)
  return cleaned

# Clean every training text with preprocess().
# NOTE: this rebinds `train_data`, so re-running the cell feeds already-cleaned text through the pipeline again.
train_data = train_data.apply(preprocess)
train_data

0                deed reason earthquak may allah forgiv u
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7604    worldnew fallen powerlin glink tram updat fire...
7605    flip side im walmart bomb everyon evacu stay t...
7606    suicid bomber kill saudi secur site mosqu reut...
7608       two giant crane hold bridg collaps nearbi home
7612    latest home raze northern california wildfir a...
Name: text, Length: 7521, dtype: object

# Long Short-Term Memory

Regarding the embedding concept: I think learned word embeddings are better suited to deep learning models such as the LSTM below, whereas TF-IDF features are typically used with classical machine learning algorithms.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Tokenize the texts. `num_words` caps the vocabulary used when encoding to the most frequent words.
max_features=3000
tokenizer=Tokenizer(num_words=max_features,split=' ')
# Build the word index from the cleaned training texts.
tokenizer.fit_on_texts(train_data.values)
# Encode each text as a list of integer word indices.
X = tokenizer.texts_to_sequences(train_data.values)
X = pad_sequences(X)      # no maxlen given: pad every sequence to the length of the longest one

In [None]:
# Spot-check the fitted tokenizer by decoding one hand-written index sequence back into words.
tokenizer.sequences_to_texts([[ 713,  154,   56, 1434,   14]])

['interest god car amaz peopl']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, label_data, test_size = 0.3, random_state =0)

In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.models import Model, Sequential

# The embedding table only needs one row per index the tokenizer can emit.
# Take the size from the tokenizer instead of re-declaring a separate value
# (the previous hard-coded 30000 was 10x the tokenizer's 3000-word cap).
max_features = tokenizer.num_words
embed_dim = 32

lstm_model = Sequential()
# input_length is the padded sequence length of the training matrix.
lstm_model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1]))
# No input_shape here: the LSTM is not the first layer, so its input shape
# comes from the embedding output (sequence length x embed_dim).
# NOTE(review): activation='relu' is kept from the original; the Keras default
# is 'tanh', which is also required for the cuDNN fast path — confirm the choice.
lstm_model.add(LSTM(units=60, activation='relu', return_sequences=False))
lstm_model.add(Dropout(0.2))
# Single sigmoid unit: probability of the positive class.
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(loss = 'binary_crossentropy', optimizer='adam' , metrics = ['accuracy'])

In [None]:
# Train for 10 epochs, reporting metrics on the held-out split after each epoch.
# NOTE(review): batch_size=1 performs one weight update per sample, which makes every epoch slow — confirm this is intentional.
lstm_model.fit(X_train, y_train, epochs = 10, batch_size=1, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa7cf1ca8e0>

In [None]:
# Hard predictions for the held-out split: the model outputs are rounded to the nearest integer.
y_pred = lstm_model.predict(X_test).round()
# Final evaluation of the model; index 1 of the result is read as the accuracy.
scores = lstm_model.evaluate(X_test, y_test, verbose=0)
holdout_accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % holdout_accuracy_pct)

Accuracy: 76.81%


In [None]:
from sklearn import metrics
# Accuracy on the training split as a whole-number percentage, for comparison with the held-out accuracy.
train_pred = lstm_model.predict(X_train).round()
train_accuracy = round(metrics.accuracy_score(y_train, train_pred) * 100)
train_accuracy



95

In [None]:
# Held-out metrics rounded to three decimals; precision and recall use the 'weighted' average.
accuracy = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred, average='weighted'), 3)
recall = round(recall_score(y_test, y_pred, average='weighted'), 3)

# One print call with newline separators produces the same lines as separate prints.
print(
  f'Accuracy of the model: {np.round(accuracy*100,2)}%',
  f'Precision Score of the model: {np.round(precision*100,2)}%',
  f'Recall Score of the model: {np.round(recall*100,2)}%',
  '-'*50,
  classification_report(y_test,y_pred),
  sep='\n',
)

Accuracy of the model: 76.8%
Precision Score of the model: 76.7%
Recall Score of the model: 76.8%
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       856
           1       0.75      0.70      0.72       649

    accuracy                           0.77      1505
   macro avg       0.76      0.76      0.76      1505
weighted avg       0.77      0.77      0.77      1505



# Evaluating the model

In [None]:
# Load the test CSV.
# NOTE(review): relative path here, while the training file is read from the absolute path "/content/train.csv"; both only resolve if the working directory is /content — confirm.
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Test frame without the columns that are not fed to the model.
data = test_data.drop(columns=['keyword', 'location'])
# Ids as a one-column DataFrame, displayed as the cell output.
# NOTE: the name `id` shadows the Python builtin; it is kept because a later cell reads it.
id = data[['id']]
id

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
...,...
3258,10861
3259,10865
3260,10868
3261,10874


In [None]:
# Show the per-column missing-value counts. A bare expression is only rendered
# when it is the last line of a cell, so print it explicitly.
print(data.isnull().sum())
# NOTE(review): the de-duplicated `data` is not read by the later cells visible here
# (they use `test_data`); confirm whether this step is still needed.
data = data.drop_duplicates()

In [None]:
# Clean the test texts with the same preprocess() pipeline used for training.
# NOTE: this rebinds `test_data` from a DataFrame to a Series, so the cell cannot be re-run without reloading the CSV.
test_data = test_data['text'].apply(preprocess)
test_data

0                                happen terribl car crash
1           heard earthquak differ citi stay safe everyon
2       forest fire spot pond gee flee across street c...
3                          apocalyps light spokan wildfir
4                      typhoon soudelor kill china taiwan
                              ...                        
3258    earthquak safeti lo angel  ûò safeti fasten xrwn
3259    storm ri wors last hurrican cityampoth hardest...
3260                            green line derail chicago
3261                  meg issu hazard weather outlook hwo
3262      cityofcalgari activ municip emerg plan yycstorm
Name: text, Length: 3263, dtype: object

In [None]:
# Encode the cleaned test texts with the tokenizer fitted on the training texts.
test_token = tokenizer.texts_to_sequences(test_data.values)
# Pad/truncate to the same sequence length the model was built and trained with.
# The previous maxlen=1 truncated every text to a single token, so predictions
# were made from one word per tweet.
test_token = pad_sequences(test_token, maxlen=X_train.shape[1])

In [None]:
# Raw model outputs for the padded test sequences (rounded to class labels in the next cell).
final_pred = lstm_model.predict(test_token)



In [None]:
# Turn the model outputs into integer labels by rounding to the nearest integer.
rounded_pred = np.round(final_pred)
final_pred = rounded_pred.astype(int)
final_pred

array([[1],
       [0],
       [1],
       ...,
       [1],
       [1],
       [0]])

In [None]:
id = pd.DataFrame(id, columns=['id'])
final_pred = pd.DataFrame(final_pred, columns=['target'])
# Pair ids and predictions by position. The previous join='inner' aligned on
# index labels and would silently drop rows if the two indexes ever differed.
assert len(id) == len(final_pred), "ids and predictions must have the same number of rows"
submission = pd.concat(
  [id.reset_index(drop=True), final_pred.reset_index(drop=True)],
  axis=1,
)
submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
submission.to_csv('submission.csv', index=False)