In [1]:
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
def hasForeignChars(token: str):
    for letter in set(token):
        if ord(letter) not in range(65, 65+26) and ord(letter) not in range(97, 97+26):
            return True
    return False

In [3]:
data = pd.read_csv('./data/train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
texts = data['text']

In [5]:
texts = texts.str.lower()

In [6]:
from nltk.corpus import stopwords

In [7]:
restr = set(stopwords.words())
updatedTexts = []

In [8]:
for text in texts:
    split = text.split()
    new_wordlist = []
    for s in split:
        if '#' in s:
            new_wordlist.append(s.strip('#'))
        if '.' in s or ',' in s:
            new_wordlist.append(s.strip('.').strip(','))
        
        if s in restr or hasForeignChars(s) or len(s) == 0:
            continue
        else:new_wordlist.append(s)
    updatedTexts.append(new_wordlist)

In [9]:
updatedTexts

[['deeds', 'reason', 'earthquake', 'allah', 'forgive'],
 ['forest', 'fire', 'ronge', 'sask', 'canada'],
 ['residents',
  'asked',
  'notified',
  'officers',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000', 'receive', 'wildfires', 'evacuation', 'orders', 'california'],
 ['photo', 'ruby', 'alaska', 'smoke', 'wildfires', 'pours', 'school'],
 ['rockyfire',
  'update',
  'california',
  'hwy',
  'closed',
  'directions',
  'due',
  'lake',
  'county',
  'fire',
  'cafire',
  'wildfires'],
 ['flood',
  'disaster',
  'heavy',
  'rain',
  'flash',
  'flooding',
  'streets',
  'manitou',
  'colorado',
  'springs',
  'areas'],
 ['top', 'hill', 'fire', 'woods'],
 ['emergency', 'evacuation', 'happening', 'building', 'street'],
 ['afraid', 'tornado', 'coming', 'area'],
 ['died', 'heat', 'wave'],
 ['haha',
  'south',
  'tampa',
  'flooded',
  'wait',
  'live',
  'south',
  'tampa',
  'gonna',
  'gonna',
  'fvck',
  'flooding'],
 ['raining',
  'flooding',
  'florida',
 

In [10]:
from gensim.models import Word2Vec

In [11]:
model = Word2Vec(updatedTexts, vector_size=200, window=2)
model.train(updatedTexts, total_examples=len(updatedTexts), epochs=100)
model.save('./w2v.model')

In [12]:
savedModel = Word2Vec.load('./w2v.model')

In [13]:
keys = set(savedModel.wv.index_to_key)

In [14]:
final_embeddings = []

for sentence in updatedTexts:
    sum = np.zeros(200)
    for token in sentence:
        if token in keys: 
            sum+=savedModel.wv[token]
        else:
            print(token)
    final_embeddings.append(sum)

deeds
forgive
ronge
sask
notified
13,000
receive
ruby
alaska
pours
rockyfire
directions
cafire
manitou
woods
tampa
flooded
tampa
fvck
raining
tampabay
tampa
count
bago
we
bago
fruits
ridiculous
skiing
looooool
way...i
cooool
wholesale
http://t.co/lhyxeohy6c
http://t.co/yao1e0xngw
africanbaze:
http://t.co/2nndbgwyei
http://t.co/qqsmshaj3n
mufc
hype
acquisitions
epl
inec
http://t.co/3imaomknna
barbados
bridgetown
elizabeth
superintende
http://t.co/wdueaj8q4j
http://t.co/roi2nsmejj
http://t.co/3tj8zjin21
http://t.co/yduixefipe
http://t.co/lxtjc87kls
nsfw
visiting
cfc
ancop
tita
vida
soooo
pumped
preaching
http://t.co/o9qknbfofx
stats
http://t.co/tiyulif5c6
http://t.co/vl5tbr3wbr
tracklist
http://t.co/roi2nsmejj
http://t.co/3tj8zjin21
http://t.co/yduixefipe
http://t.co/lxtjc87kls
nsfw
retainers
wear
least
brighton
http://t.co/gwnrhmso8k
jail
niece
http://t.co/ev1ahoucza
elizabeth
superintendent
lanford
salmon
http://t.co/vplr5hka2u
http://t.co/sxhw2tnnlf
deliberately
http://t.co/pcxarbh9an

In [15]:
len(final_embeddings)

7613

In [16]:
X = np.array(final_embeddings)
X = X/X.sum()
y = data['target'].to_numpy()

In [17]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=y)

In [18]:
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Input((xtrain.shape[1],)),
        tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(negative_slope=0.01)),
        tf.keras.layers.Dense(64, activation=tf.keras.layers.LeakyReLU(negative_slope=0.01)),
        tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(negative_slope=0.01)),
        tf.keras.layers.Dense(16, activation=tf.keras.layers.LeakyReLU(negative_slope=0.01)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ]
)

In [19]:
model.compile(optimizer='adam', loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy'])

In [20]:
model.fit(xtrain, ytrain, epochs=20, batch_size=10)

Epoch 1/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5526 - loss: 0.6869
Epoch 2/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5636 - loss: 0.6850
Epoch 3/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5687 - loss: 0.6819
Epoch 4/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6767 - loss: 0.6153
Epoch 5/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7235 - loss: 0.5667
Epoch 6/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7331 - loss: 0.5373
Epoch 7/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7504 - loss: 0.5214
Epoch 8/20
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7534 - loss: 0.5187
Epoch 9/20
[1m533/533[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x15c186c47a0>

In [21]:
model.evaluate(xtest, ytest)

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7691 - loss: 0.4944 


[0.5026125311851501, 0.7648861408233643]

(200,)