<a href="https://colab.research.google.com/github/Eltonomwega/ML_Notebooks/blob/main/Group_ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Twitter disaster prediction

This note book aims to predict whether a tweet is a about a real world disaster or not based of off the words used by in the tweet.

The dataset can be found found on kaggle -> https://www.kaggle.com/c/nlp-getting-started/data



In [61]:
import spacy
import pandas as pd
import numpy as np
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.models import load_model
import string

In [62]:
data = pd.read_csv('/content/drive/MyDrive/machine learning/train.csv')

train_data, val_data, test_data = \
              np.split(data.sample(frac=1, random_state=42), 
                       [int(.6*len(data)), int(.8*len(data))])

In [63]:
train_data

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0
...,...,...,...,...,...
484,700,attacked,"Los Angeles, CA",@envw98 @NickCoCoFree @JulieDiCaro @jdabe80 I ...,0
2211,3167,deluge,Melbourne-ish,Despite the deluge of #FantasticFour notices o...,0
4710,6697,landslide,,@Morning_Joe @Reince @PressSec Joe ur so smart...,0
2555,3663,destroy,New York City,Watch These Super Strong Magnets Destroy Every...,1


In [64]:
train_data.isna().any()

id          False
keyword      True
location     True
text        False
target      False
dtype: bool

In [65]:
## get the balance of the dataset 1 is true for a disaster while 0 is false
train_data['target'].value_counts()

0    2609
1    1958
Name: target, dtype: int64

In [66]:
## remove punctuation from the tweets
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [67]:
train_data['text']= train_data['text'].apply(lambda x:remove_punctuation(x))

In [68]:
## convert to all lower case 
train_data['text']= train_data['text'].apply(lambda x: x.lower())

In [69]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,so you have a new weapon that can cause unimag...,1
2227,3185,deluge,,the famping things i do for gishwhes just got ...,0
5448,7769,police,UK,dt georgegalloway rt galloway4mayor ûïthe col...,1
132,191,aftershock,,aftershock back to school kick off was great i...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma children of addicts deve...,0


### Tokenizing the dataset

In [70]:
x_tr,y_tr = train_data['text'].values,pd.get_dummies(train_data['target'].values).values
x_val,y_val=val_data['text'].values,pd.get_dummies(val_data['target'].values).values
x_test,y_test=test_data['text'].values,pd.get_dummies(test_data['target'].values).values

In [71]:
x_tr

array(['so you have a new weapon that can cause unimaginable destruction',
       'the famping things i do for gishwhes just got soaked in a deluge going for pads and tampons thx mishacollins ',
       'dt georgegalloway rt galloway4mayor \x89ûïthe col police can catch a pickpocket in liverpool stree httptcovxin1goq4q',
       ...,
       'morningjoe reince presssec joe ur so smart u should run 4 president ur perfect the american people love assholes ud win by a landslide',
       'watch these super strong magnets destroy everyday objects httptcobtus5jejuy httptcozrtfxluk6r',
       'im on top of the hill and i can see a fire in the woods'],
      dtype=object)

In [75]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_tr))

In [76]:
x_tr_seq=tokenizer.texts_to_sequences(x_tr)

In [77]:
x_val_seq=tokenizer.texts_to_sequences(x_val)
x_test_seq=tokenizer.texts_to_sequences(x_test)

In [78]:
SEQ_LEN = 100
x_tr_seq=pad_sequences(x_tr_seq,maxlen=SEQ_LEN)
x_val_seq=pad_sequences(x_val_seq,maxlen=SEQ_LEN)
x_test_seq=pad_sequences(x_test_seq,maxlen=SEQ_LEN)

### Training Model

In [80]:
model=Sequential()
model.add(Embedding(len(tokenizer.word_index)+1,300,input_length=SEQ_LEN,trainable=True))
model.add(LSTM(128,return_sequences=True,dropout=0.2))
model.add(GlobalMaxPool1D())
model.add(Dense(64,activation='relu'))
model.add(Dense(2,activation='softmax'))

In [81]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])
es=EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=3)
mc=ModelCheckpoint('best_model.h5',monitor='val_acc',mode='max',save_best_only=True,verbose=1)

In [82]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          4721400   
_________________________________________________________________
lstm (LSTM)                  (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 4,949,434
Trainable params: 4,949,434
Non-trainable params: 0
_________________________________________________________________
None


In [84]:
history=model.fit(x_tr_seq,y_tr,batch_size=len(train_data),epochs=10,validation_data=(x_val_seq,y_val),verbose=1,callbacks=[es,mc],validation_steps=1)

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.56730, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.56730
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.56730
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.56730
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.56730
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.56730
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.56730
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.56730
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.56730
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.56730


In [85]:
history2=model.fit(x_tr_seq,y_tr,batch_size=100,epochs=10,validation_data=(x_val_seq,y_val),verbose=1,callbacks=[es,mc],validation_steps=1)

Epoch 1/10

Epoch 00001: val_acc improved from 0.56730 to 0.82000, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.82000
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.82000
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.82000
Epoch 00004: early stopping


### Testing model accuracy

In [86]:
model=load_model('best_model.h5')

In [87]:
_,val_acc=model.evaluate(x_val_seq,y_val,batch_size=100)
print(val_acc)

0.7925148010253906
