In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, Input, Concatenate, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
import keras

import warnings
warnings.filterwarnings('ignore')

## Purpose:

Takes all time tweets.

Model Predicts either main timezone, or mainzone split (north/south)

Approach depends on how well 2nd tier model can work with whole or split region

In [2]:
df = pd.read_csv('All_US_Time_Tweets.csv')
df.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region,Timezone,Subtime
0,10229,i gotta get you readded to bbm,Alabama,Adamsville,Deep South,Central,South Central
1,16439,ahhh yes,Alabama,Adamsville,Deep South,Central,South Central
2,17131,an old locksmith,Alabama,Adamsville,Deep South,Central,South Central
3,2648,rt they are shooting at pentagon metro please...,Alabama,Adamsville,Deep South,Central,South Central
4,2750,as a matter of fact i wanna ask about that,Alabama,Adamsville,Deep South,Central,South Central


In [3]:
col_int = 'Timezone'

In [4]:
df.shape

(374519, 7)

In [5]:
to_removeny = df[df['Closest_State'] == 'New York'].sample(n = 40000, random_state = 40)
to_removenj = df[df['Closest_State'] == 'New Jersey'].sample(n = 20000, random_state = 40)
df = df.drop(to_removeny.index)
df = df.drop(to_removenj.index)

In [6]:
df[col_int].unique()

array(['Central', 'Mountain', 'West', 'East'], dtype=object)

In [7]:
df['TweetText'] = df['TweetText'].astype(str)

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['TweetText'].tolist())

In [9]:
df2 = df.copy()

In [10]:
cols = ['Closest_City','Region']
df2 = df2.drop(cols, axis = 1)

In [11]:
one_hot = pd.get_dummies(df2[col_int])
df2 = df2.join(one_hot)
df2 = df2.drop(col_int, axis = 1)

In [12]:
df2.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Subtime,Central,East,Mountain,West
0,10229,i gotta get you readded to bbm,Alabama,South Central,True,False,False,False
1,16439,ahhh yes,Alabama,South Central,True,False,False,False
2,17131,an old locksmith,Alabama,South Central,True,False,False,False
3,2648,rt they are shooting at pentagon metro please...,Alabama,South Central,True,False,False,False
4,2750,as a matter of fact i wanna ask about that,Alabama,South Central,True,False,False,False


In [13]:
x_dat = df2[['TweetText','Timestamp']]
y_dat = df2[df[col_int].unique()]

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_dat, y_dat, test_size = 0.25, random_state = 40)

In [15]:
x_dat["TweetText"].str.split(" ").str.len().describe(percentiles=[0.01, 0.5, 0.99])

count    314519.000000
mean         11.897326
std           7.653119
min           1.000000
1%            2.000000
50%          11.000000
99%          29.000000
max         710.000000
Name: TweetText, dtype: float64

In [16]:
tmax_length = 29

In [17]:
train_sequences = tokenizer.texts_to_sequences(x_train['TweetText'].tolist())
test_sequences = tokenizer.texts_to_sequences(x_test['TweetText'].tolist())

train_seq = pad_sequences(train_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')
test_seq = pad_sequences(test_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')

In [18]:
vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 150

In [19]:
timestamps_train = x_train['Timestamp']
timestamps_test = x_test['Timestamp']

In [22]:
input_seq = Input(shape = (tmax_length,), name = 'Input Sequence')
input_time = Input(shape = (1,), name = "Input Timestamp")

embed = Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = tmax_length)(input_seq)
convol = Conv1D(filters = 128, kernel_size = 3, activation = 'relu')(embed)
maxpool = GlobalAveragePooling1D()(convol)

concat = Concatenate()([maxpool, input_time])
dense1 = Dense(50, activation = 'relu')(concat)
dense2 = Dense(25, activation = 'relu')(dense1)
dense3 = Dense(15, activation = 'relu')(dense2)
output = Dense(len(df[col_int].unique()), activation = 'sigmoid')(dense3)

model = Model(inputs = [input_seq, input_time], outputs = output)

#metric = keras.metrics.F1Score(threshold = 0.5)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
#model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['f1_score'])

model.summary()

In [23]:
history = model.fit([train_seq, timestamps_train], y_train, epochs = 6, batch_size = 150, validation_data = ([test_seq, timestamps_test], y_test))

Epoch 1/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 158ms/step - accuracy: 0.5273 - loss: 59.2897 - val_accuracy: 0.6854 - val_loss: 6.8380
Epoch 2/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 159ms/step - accuracy: 0.5462 - loss: 10.0437 - val_accuracy: 0.6406 - val_loss: 5.3852
Epoch 3/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 159ms/step - accuracy: 0.5651 - loss: 4.5080 - val_accuracy: 0.4997 - val_loss: 1.9516
Epoch 4/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 157ms/step - accuracy: 0.6067 - loss: 1.8303 - val_accuracy: 0.6798 - val_loss: 1.2357
Epoch 5/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 153ms/step - accuracy: 0.6509 - loss: 1.3048 - val_accuracy: 0.6584 - val_loss: 1.5915
Epoch 6/6
[1m1573/1573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 156ms/step - accuracy: 0.6767 - loss: 1.0614 - val_accuracy: 0.4790 - val_loss: 1.2

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title(col_int + ' Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()