In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, Input, Concatenate, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
import keras

import warnings
warnings.filterwarnings('ignore')

## Purpose:

Takes all time tweets.

Model Predicts either main timezone, or mainzone split (north/south)

Approach depends on how well 2nd tier model can work with whole or split region

In [2]:
df = pd.read_csv('All_US_EPA_Tweets.csv')
df.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region,EPA_Region
0,10229,i gotta get you readded to bbm,Alabama,Adamsville,South,EPA_4
1,16439,ahhh yes,Alabama,Adamsville,South,EPA_4
2,17131,an old locksmith,Alabama,Adamsville,South,EPA_4
3,2648,rt they are shooting at pentagon metro please...,Alabama,Adamsville,South,EPA_4
4,2750,as a matter of fact i wanna ask about that,Alabama,Adamsville,South,EPA_4


In [3]:
col_int = 'EPA_Region'

In [4]:
df.shape

(314519, 6)

In [5]:
df[col_int].unique()

array(['EPA_4', 'EPA_9', 'EPA_6', 'EPA_8', 'EPA_1', 'EPA_3', 'EPA_10',
       'EPA_5', 'EPA_7', 'EPA_2'], dtype=object)

In [6]:
df['TweetText'] = df['TweetText'].astype(str)

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['TweetText'].tolist())

In [8]:
df2 = df.copy()

In [9]:
cols = ['Closest_City','Region']
df2 = df2.drop(cols, axis = 1)

In [10]:
one_hot = pd.get_dummies(df2[col_int])
df2 = df2.join(one_hot)
df2 = df2.drop(col_int, axis = 1)

In [11]:
df2.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,EPA_1,EPA_10,EPA_2,EPA_3,EPA_4,EPA_5,EPA_6,EPA_7,EPA_8,EPA_9
0,10229,i gotta get you readded to bbm,Alabama,False,False,False,False,True,False,False,False,False,False
1,16439,ahhh yes,Alabama,False,False,False,False,True,False,False,False,False,False
2,17131,an old locksmith,Alabama,False,False,False,False,True,False,False,False,False,False
3,2648,rt they are shooting at pentagon metro please...,Alabama,False,False,False,False,True,False,False,False,False,False
4,2750,as a matter of fact i wanna ask about that,Alabama,False,False,False,False,True,False,False,False,False,False


In [12]:
x_dat = df2[['TweetText','Timestamp']]
y_dat = df2[df[col_int].unique()]

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x_dat, y_dat, test_size = 0.25, random_state = 40)

In [14]:
x_dat["TweetText"].str.split(" ").str.len().describe(percentiles=[0.01, 0.5, 0.99])

count    314519.000000
mean         11.897326
std           7.653119
min           1.000000
1%            2.000000
50%          11.000000
99%          29.000000
max         710.000000
Name: TweetText, dtype: float64

In [15]:
tmax_length = 29

In [16]:
train_sequences = tokenizer.texts_to_sequences(x_train['TweetText'].tolist())
test_sequences = tokenizer.texts_to_sequences(x_test['TweetText'].tolist())

train_seq = pad_sequences(train_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')
test_seq = pad_sequences(test_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')

In [17]:
vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 150

In [18]:
timestamps_train = x_train['Timestamp']
timestamps_test = x_test['Timestamp']

In [21]:
input_seq = Input(shape = (tmax_length,), name = 'Input Sequence')
input_time = Input(shape = (1,), name = "Input Timestamp")

embed = Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = tmax_length)(input_seq)
convol = Conv1D(filters = 128, kernel_size = 3, activation = 'relu')(embed)
maxpool = GlobalAveragePooling1D()(convol)

concat = Concatenate()([maxpool, input_time])
dense1 = Dense(50, activation = 'relu')(concat)
dense2 = Dense(25, activation = 'relu')(dense1)
dense3 = Dense(15, activation = 'relu')(dense2)
output = Dense(len(df[col_int].unique()), activation = 'sigmoid')(dense3)

model = Model(inputs = [input_seq, input_time], outputs = output)

#metric = keras.metrics.F1Score(threshold = 0.5)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
#model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['f1_score'])

model.summary()

In [22]:
history = model.fit([train_seq, timestamps_train], y_train, epochs = 6, batch_size = 150, validation_data = ([test_seq, timestamps_test], y_test))

Epoch 1/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 153ms/step - accuracy: 0.2055 - loss: 115.0640 - val_accuracy: 0.2482 - val_loss: 1.9830
Epoch 2/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 153ms/step - accuracy: 0.2475 - loss: 1.9705 - val_accuracy: 0.2482 - val_loss: 1.9446
Epoch 3/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 154ms/step - accuracy: 0.2464 - loss: 1.9446 - val_accuracy: 0.2482 - val_loss: 1.9389
Epoch 4/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 152ms/step - accuracy: 0.2468 - loss: 1.9409 - val_accuracy: 0.2482 - val_loss: 1.9382
Epoch 5/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 151ms/step - accuracy: 0.2466 - loss: 1.9418 - val_accuracy: 0.2483 - val_loss: 1.9395
Epoch 6/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 155ms/step - accuracy: 0.2460 - loss: 1.9426 - val_accuracy: 0.2482 - val_loss: 1.9

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title(col_int + ' Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()