In [1]:
import tensorflow as tf

In [4]:
# NOTE(review): this only constructs a GPUOptions message and displays it; in the
# code visible here the object is never assigned or passed to a session/config,
# so it presumably does not limit GPU memory — confirm and wire it up if intended.
tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.5)

per_process_gpu_memory_fraction: 0.5

In [5]:
# List the GPU devices TensorFlow reports on this machine.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the training CSV; the path is relative to the notebook's working directory.
# (The doubled separator in the previous path was a typo.)
data = pd.read_csv('data/train.csv')

In [8]:
# Display the frame as loaded from the CSV.
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [9]:
# Columns to discard: only the tweet text and its label are used further down.
cols_of_interest = ["id", "keyword", "location"]
# Reassign instead of mutating in place, and ignore columns that are already gone,
# so this cell can be re-run without raising KeyError.
data = data.drop(columns=cols_of_interest, errors="ignore")

In [10]:
# Shuffle all rows with a fixed seed; the original index labels are preserved.
data_suffel = data.sample(
    frac=1,           # sample 100% of the rows, i.e. a pure reordering
    random_state=42,  # fixed seed so the shuffled order is reproducible
)
data_suffel

Unnamed: 0,text,target
2644,So you have a new weapon that can cause un-ima...,1
2227,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,Aftershock back to school kick off was great. ...,0
6845,in response to trauma Children of Addicts deve...,0
...,...,...
5226,@Eganator2000 There aren't many Obliteration s...,0
5390,just had a panic attack bc I don't have enough...,0
860,Omron HEM-712C Automatic Blood Pressure Monito...,0
7603,Officials say a quarantine is in place at an A...,1


In [11]:
# Count missing values in each remaining column.
data.isnull().sum()

text      0
target    0
dtype: int64

In [12]:
# Inspect how many rows fall into each value of the label column.
data["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# USING RNN LSTM

In [14]:
# Hold out 20% of the shuffled tweets; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_suffel["text"],
    data_suffel["target"],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Peek at the training labels.
y_train

2710    0
3250    1
78      1
1621    1
2528    0
       ..
7090    1
629     0
6464    1
6265    1
3723    0
Name: target, Length: 6090, dtype: int64

In [16]:
# Peek at the training texts.
X_train

2710    Detonation fashionable mountaineering electron...
3250    Men escape car engulfed in flames in Parley's ...
78      I-77 Mile Marker 31 to 40 South Mooresville  I...
1621    #Greece's tax revenues collapse as debt crisis...
2528    Be not afraid of sudden fear neither of the de...
                              ...                        
7090    @nytimes \nDue to upheaval created by the west...
629     70 won 70...&amp; some think possibility of my...
6464    Near them on the sand half sunk a shattered vi...
6265    kesabaran membuahkan hasil indah pada saat tep...
3723    @ScottDPierce @billharris_tv @HarrisGle @Beeze...
Name: text, Length: 6090, dtype: object

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# Vocabulary cap and fixed sequence length shared by the vectorizer and the embedding layer.
max_vocab_length = 20000  # upper bound on the number of distinct tokens kept
max_length = 30  # number of token ids the vectorizer emits per tweet

# Maps raw strings to fixed-length sequences of integer token ids.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
    # NOTE(review): the TextVectorization docs describe pad_to_max_tokens as applying
    # to the multi_hot/count/tf_idf output modes; with output_mode="int" it looks
    # like a no-op — confirm before removing.
    pad_to_max_tokens=True,
)

In [19]:
# Build the vectorizer's vocabulary from the training texts only.
text_vectorizer.adapt(X_train)

In [20]:
# Look up the tweet whose index *label* is 2 (a label lookup, not the third row of the split).
X_train.loc[2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [21]:
# Vectorize the tweet with index label 2 to inspect its token ids and zero padding.
text_vectorizer(X_train.loc[2])

<tf.Tensor: shape=(30,), dtype=int64, numpy=
array([  41, 1842, 2027,    5, 1824,    4,  591,   22,  120, 9518,   18,
       1655,   38,  526,  220,   53, 1824,    4,  591, 1160,   22, 1300,
          0,    0,    0,    0,    0,    0,    0,    0], dtype=int64)>

In [22]:
# Number of entries in the vocabulary held by the vectorizer.
len(text_vectorizer.get_vocabulary())

19328

In [23]:
# Vectorize the whole training split in one call.
text_vectorizer(X_train)

<tf.Tensor: shape=(6090, 30), dtype=int64, numpy=
array([[  572,  3567,  3340, ...,     0,     0,     0],
       [  595,  1047,   111, ...,     0,     0,     0],
       [ 2254,  1665,  3373, ...,     0,     0,     0],
       ...,
       [  228,    85,    11, ...,     0,     0,     0],
       [10764, 10079, 15285, ...,     0,     0,     0],
       [ 8064, 18050, 15295, ...,     0,     0,     0]], dtype=int64)>

In [24]:
# Display the raw training texts again.
X_train

2710    Detonation fashionable mountaineering electron...
3250    Men escape car engulfed in flames in Parley's ...
78      I-77 Mile Marker 31 to 40 South Mooresville  I...
1621    #Greece's tax revenues collapse as debt crisis...
2528    Be not afraid of sudden fear neither of the de...
                              ...                        
7090    @nytimes \nDue to upheaval created by the west...
629     70 won 70...&amp; some think possibility of my...
6464    Near them on the sand half sunk a shattered vi...
6265    kesabaran membuahkan hasil indah pada saat tep...
3723    @ScottDPierce @billharris_tv @HarrisGle @Beeze...
Name: text, Length: 6090, dtype: object

In [25]:
from tensorflow.keras import layers

In [26]:
# Fix TensorFlow's global seed before creating any layers.
tf.random.set_seed(42)

# Lookup table turning each token id into a 64-dimensional vector.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=64,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding",
)

In [27]:
def model_DISASTER():
    """Build the uncompiled LSTM tweet classifier.

    Data flow: raw string -> ``text_vectorizer`` -> ``embedding`` -> LSTM(64)
    -> Dense(1, sigmoid).

    The vectorizer and embedding layers are the notebook-level objects, so
    every model returned by this function shares those same layer instances.

    Returns:
        tf.keras.Model: takes a string input of shape ``(1,)`` per example and
        outputs a single sigmoid activation per example.
    """
    raw_text = layers.Input(shape=(1,), dtype="string")
    token_ids = text_vectorizer(raw_text)
    token_vectors = embedding(token_ids)
    sequence_summary = layers.LSTM(64)(token_vectors)
    probability = layers.Dense(1, activation="sigmoid")(sequence_summary)
    return tf.keras.Model(inputs=raw_text, outputs=probability)
    
    

In [28]:
# Instantiate the LSTM classifier.
Model = model_DISASTER()

In [29]:
# Binary cross-entropy pairs with the model's single sigmoid output unit.
Model.compile(
    optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [30]:
# Print the layer-by-layer architecture and parameter counts.
Model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 30)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 30, 64)            1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,313,089
Trainable params: 1,313,089
Non-trainable params: 0
___________________________________________________

In [31]:
# Number of training epochs passed to Model.fit below.
epochs = 2

In [32]:
# Train the LSTM classifier; the held-out split is passed as validation data.
Model_history = Model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/2
Epoch 2/2


# USING CLASSICAL ML MODELS (TF-IDF + LINEAR SVM)

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline


# Baseline: TF-IDF features fed into a linear SVM, both with default settings.
model_with_ML = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),  # raw text -> TF-IDF feature vectors
        ("clf", LinearSVC()),          # linear classifier on those features
    ]
)

# Fit on the same training split as the LSTM model; the fitted pipeline is displayed.
model_with_ML.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [127]:
# Score the fitted pipeline on the held-out split.
model_with_ML.score(X_test, y_test)

0.7984241628365069