In [62]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()
from gensim.models.doc2vec import TaggedDocument



from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow import keras

In [63]:
main_data = pd.read_csv('/Users/andrewsimon/Desktop/Dow_dat.csv')
main_data

Unnamed: 0,Level,Report
0,2,"On April 10th at 12:30 PM, there was an incide..."
1,3,"On April 8th at 9:00 PM, a fire broke out in t..."
2,4,"On April 6th at 2:45 AM, there was an incident..."
3,1,"On April 5th at 7:15 PM, there was an unplanne..."
4,5,"On April 2nd at 11:30 AM, there was a minor in..."
...,...,...
295,3,"At 9:45 AM, a worker reported a small leak of ..."
296,5,"At 2:30 PM, a worker reported a small spill of..."
297,4,"At 6:15 AM, a worker reported an unusual odor ..."
298,2,"At 11:00 AM, a worker reported a small fire in..."


In [64]:
def tokenize(report):
    try:
        tokens = tokenizer.tokenize(report)
        return tokens
    except:
        return "NC"

In [65]:
def postprocess(data):
    data['tokens'] = data['Report'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

In [66]:

# data = postprocess(main_data)

In [67]:
main_data['tokens'] = main_data['Report'].apply(lambda x: gensim.utils.simple_preprocess(x))
main_data.head()

Unnamed: 0,Level,Report,tokens
0,2,"On April 10th at 12:30 PM, there was an incide...","[on, april, th, at, pm, there, was, an, incide..."
1,3,"On April 8th at 9:00 PM, a fire broke out in t...","[on, april, th, at, pm, fire, broke, out, in, ..."
2,4,"On April 6th at 2:45 AM, there was an incident...","[on, april, th, at, am, there, was, an, incide..."
3,1,"On April 5th at 7:15 PM, there was an unplanne...","[on, april, th, at, pm, there, was, an, unplan..."
4,5,"On April 2nd at 11:30 AM, there was a minor in...","[on, april, nd, at, am, there, was, minor, inc..."


In [68]:
main_data

Unnamed: 0,Level,Report,tokens
0,2,"On April 10th at 12:30 PM, there was an incide...","[on, april, th, at, pm, there, was, an, incide..."
1,3,"On April 8th at 9:00 PM, a fire broke out in t...","[on, april, th, at, pm, fire, broke, out, in, ..."
2,4,"On April 6th at 2:45 AM, there was an incident...","[on, april, th, at, am, there, was, an, incide..."
3,1,"On April 5th at 7:15 PM, there was an unplanne...","[on, april, th, at, pm, there, was, an, unplan..."
4,5,"On April 2nd at 11:30 AM, there was a minor in...","[on, april, nd, at, am, there, was, minor, inc..."
...,...,...,...
295,3,"At 9:45 AM, a worker reported a small leak of ...","[at, am, worker, reported, small, leak, of, ch..."
296,5,"At 2:30 PM, a worker reported a small spill of...","[at, pm, worker, reported, small, spill, of, c..."
297,4,"At 6:15 AM, a worker reported an unusual odor ...","[at, am, worker, reported, an, unusual, odor, ..."
298,2,"At 11:00 AM, a worker reported a small fire in...","[at, am, worker, reported, small, fire, in, th..."


In [69]:
X_train, X_test, y_train, y_test = train_test_split(np.array(main_data.tokens),
                                                    np.array(main_data.Level), test_size=0.2)

In [70]:
def labelizeReports(reports, label_type):
    labelized = []
    for i,v in tqdm(enumerate(reports)):
        label = '%s_%s'%(label_type,i)
        labelized.append(TaggedDocument(v, [label]))
    return labelized

X_train = labelizeReports(X_train, 'TRAIN')
X_test = labelizeReports(X_test, 'TEST')


240it [00:00, 1142602.68it/s]
60it [00:00, 299593.14it/s]


In [71]:
X_train[0]

TaggedDocument(words=['at', 'am', 'worker', 'reported', 'an', 'issue', 'with', 'the', 'reaction', 'unit', 'temperature', 'control', 'system', 'the', 'issue', 'was', 'resolved', 'by', 'adjusting', 'the', 'system', 'settings', 'no', 'injuries', 'or', 'environmental', 'impacts', 'were', 'reported'], tags=['TRAIN_0'])

In [72]:
tweet_w2v = Word2Vec(vector_size=200, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(X_train)])
tweet_w2v.train([x.words for x in tqdm(X_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs)

100%|██████████| 240/240 [00:00<00:00, 1691820.10it/s]
100%|██████████| 240/240 [00:00<00:00, 3322220.99it/s]


(32651, 75600)

In [73]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in X_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))




In [74]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v.wv.get_vector(word).reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [75]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in tqdm(map(lambda x: x.words, X_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in tqdm(map(lambda x: x.words, X_test))])
test_vecs_w2v = scale(test_vecs_w2v)

240it [00:00, 7070.00it/s]
60it [00:00, 7155.68it/s]


In [77]:
y_train

array([3, 3, 1, 3, 5, 5, 3, 1, 4, 5, 5, 2, 5, 3, 5, 1, 5, 2, 3, 4, 3, 3,
       3, 3, 3, 4, 1, 3, 3, 3, 2, 2, 2, 5, 2, 5, 4, 1, 5, 1, 5, 4, 1, 2,
       1, 1, 2, 4, 1, 4, 1, 5, 2, 5, 5, 3, 1, 4, 3, 4, 4, 2, 5, 2, 4, 3,
       4, 3, 4, 1, 4, 5, 3, 5, 5, 2, 5, 3, 3, 5, 2, 2, 2, 2, 2, 1, 3, 3,
       5, 4, 5, 4, 1, 5, 4, 4, 1, 5, 4, 5, 2, 1, 3, 3, 2, 4, 4, 4, 2, 3,
       3, 2, 2, 3, 1, 2, 5, 1, 2, 3, 5, 4, 1, 4, 1, 1, 3, 3, 2, 1, 3, 1,
       2, 2, 3, 2, 2, 3, 5, 3, 2, 1, 3, 2, 5, 2, 2, 4, 1, 2, 5, 5, 5, 4,
       4, 5, 5, 2, 3, 5, 5, 4, 5, 3, 3, 1, 4, 4, 5, 2, 3, 5, 3, 4, 5, 4,
       1, 3, 1, 1, 5, 5, 5, 5, 4, 4, 1, 4, 3, 4, 3, 3, 5, 2, 5, 4, 4, 4,
       5, 2, 2, 1, 4, 2, 4, 2, 3, 5, 1, 2, 4, 1, 4, 4, 1, 4, 3, 3, 1, 4,
       2, 5, 5, 5, 2, 4, 4, 5, 5, 5, 5, 4, 4, 2, 5, 5, 2, 3, 4, 4])

In [76]:
model = keras.Sequential()
model.add(keras.layers.Dense(32, activation='relu', input_dim=200))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)

Epoch 1/9
8/8 - 0s - loss: -7.4800e+00 - accuracy: 0.1500 - 170ms/epoch - 21ms/step
Epoch 2/9
8/8 - 0s - loss: -1.7134e+01 - accuracy: 0.1500 - 5ms/epoch - 619us/step
Epoch 3/9
8/8 - 0s - loss: -2.5574e+01 - accuracy: 0.1500 - 6ms/epoch - 695us/step
Epoch 4/9
8/8 - 0s - loss: -3.3885e+01 - accuracy: 0.1500 - 5ms/epoch - 595us/step
Epoch 5/9
8/8 - 0s - loss: -4.2512e+01 - accuracy: 0.1500 - 7ms/epoch - 818us/step
Epoch 6/9
8/8 - 0s - loss: -5.1405e+01 - accuracy: 0.1500 - 10ms/epoch - 1ms/step
Epoch 7/9
8/8 - 0s - loss: -6.1064e+01 - accuracy: 0.1500 - 7ms/epoch - 839us/step
Epoch 8/9
8/8 - 0s - loss: -7.1258e+01 - accuracy: 0.1500 - 5ms/epoch - 635us/step
Epoch 9/9
8/8 - 0s - loss: -8.2007e+01 - accuracy: 0.1500 - 5ms/epoch - 647us/step


<keras.src.callbacks.History at 0x290702c40>