In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, Input, Concatenate, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

## Purpose:

2 tier CNN

First tier predicts timezone

Second tier predicts state inside indicated timezone from tier 1


In [2]:
dfa = pd.read_csv('All_US_Time_tweets.csv')

In [3]:
df1 = pd.read_csv('west_time_tweets.csv')
to_removeca = df1[df1['Closest_State'] == 'California'].sample(n = 10000, random_state = 40)
df1 = df1.drop(to_removeca.index)


df2 = pd.read_csv('central_time_tweets.csv')

df3 = pd.read_csv('east_time_tweets.csv')
to_removeny = df3[df3['Closest_State'] == 'New York'].sample(n = 40000, random_state = 40)
to_removenj = df3[df3['Closest_State'] == 'New Jersey'].sample(n = 20000, random_state = 40)
df3 = df3.drop(to_removeny.index)
df3 = df3.drop(to_removenj.index)

df4 = pd.read_csv('mountain_time_tweets.csv')

In [4]:
dfa['TweetText'] = dfa['TweetText'].astype(str)
df1['TweetText'] = df1['TweetText'].astype(str)
df2['TweetText'] = df2['TweetText'].astype(str)
df3['TweetText'] = df3['TweetText'].astype(str)
df4['TweetText'] = df4['TweetText'].astype(str)

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dfa['TweetText'].tolist())

to_removeny = dfa[dfa['Closest_State'] == 'New York'].sample(n = 40000, random_state = 40)
to_removenj = dfa[dfa['Closest_State'] == 'New Jersey'].sample(n = 20000, random_state = 40)
to_removeca = dfa[dfa['Closest_State'] == 'California'].sample(n = 10000, random_state = 40)
dfa = dfa.drop(to_removeny.index)
dfa = dfa.drop(to_removenj.index)
dfa = dfa.drop(to_removeca.index)

In [6]:
one_hot = pd.get_dummies(dfa['Timezone'])
dfa = dfa.join(one_hot)

In [7]:
def make_onehot(df):
    one_hot = pd.get_dummies(df['Closest_State'])
    df = df.join(one_hot)
    #df = df.drop('Closest_State', axis = 1)
    return df

In [8]:
df1 = make_onehot(df1)
df2 = make_onehot(df2)
df3 = make_onehot(df3)
df4 = make_onehot(df4)

In [9]:
x_data = dfa[['TweetText','Timestamp']]
y_data = dfa[dfa['Timezone'].unique()]

x_dat1 = df1[['TweetText','Timestamp']]
y_dat1 = df1[df1['Closest_State'].unique()]

x_dat2 = df2[['TweetText','Timestamp']]
y_dat2 = df2[df2['Closest_State'].unique()]

x_dat3 = df3[['TweetText','Timestamp']]
y_dat3 = df3[df3['Closest_State'].unique()]

x_dat4 = df4[['TweetText','Timestamp']]
y_dat4 = df4[df4['Closest_State'].unique()]

In [10]:
x_traina, x_testa, y_traina, y_testa = train_test_split(x_data, y_data, test_size = 0.25, random_state = 40)
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_dat1, y_dat1, test_size = 0.25, random_state = 40)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_dat2, y_dat2, test_size = 0.25, random_state = 40)
x_train3, x_test3, y_train3, y_test3 = train_test_split(x_dat3, y_dat3, test_size = 0.25, random_state = 40)
x_train4, x_test4, y_train4, y_test4 = train_test_split(x_dat4, y_dat4, test_size = 0.25, random_state = 40)

In [11]:
x_data["TweetText"].str.split(" ").str.len().describe(percentiles=[0.01, 0.5, 0.99])

count    304519.000000
mean         11.894046
std           7.584947
min           1.000000
1%            2.000000
50%          11.000000
99%          29.000000
max         710.000000
Name: TweetText, dtype: float64

In [12]:
tmax_length = 29

In [13]:
def get_seq(x_train, x_test):
    train_sequences = tokenizer.texts_to_sequences(x_train['TweetText'].tolist())
    test_sequences = tokenizer.texts_to_sequences(x_test['TweetText'].tolist())
    
    train_seq = pad_sequences(train_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')
    test_seq = pad_sequences(test_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')

    return train_seq, test_seq

In [14]:
train_seqa, test_seqa = get_seq(x_traina, x_testa)
train_seq1, test_seq1 = get_seq(x_train1, x_test1)
train_seq2, test_seq2 = get_seq(x_train2, x_test2)
train_seq3, test_seq3 = get_seq(x_train3, x_test3)
train_seq4, test_seq4 = get_seq(x_train4, x_test4)

In [15]:
vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 150

In [16]:
timestamps_traina = x_traina['Timestamp']
timestamps_testa = x_testa['Timestamp']

timestamps_train1 = x_train1['Timestamp']
timestamps_test1 = x_test1['Timestamp']

timestamps_train2 = x_train2['Timestamp']
timestamps_test2 = x_test2['Timestamp']

timestamps_train3 = x_train3['Timestamp']
timestamps_test3 = x_test3['Timestamp']

timestamps_train4 = x_train4['Timestamp']
timestamps_test4 = x_test4['Timestamp']

In [18]:
input_seq = Input(shape = (tmax_length,), name = 'Input Sequence')
input_time = Input(shape = (1,), name = "Input Timestamp")

embed = Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = tmax_length)(input_seq)
convol = Conv1D(filters = 128, kernel_size = 3, activation = 'relu')(embed)
maxpool = GlobalAveragePooling1D()(convol)

concat = Concatenate()([maxpool, input_time])
dense1 = Dense(50, activation = 'relu')(concat)
dense2 = Dense(25, activation = 'relu')(dense1)
dense3 = Dense(15, activation = 'relu')(dense2)
output = Dense(len(dfa['Timezone'].unique()), activation = 'sigmoid')(dense3)

modela = Model(inputs = [input_seq, input_time], outputs = output)
modela.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df1['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model1 = Model(inputs = [input_seq, input_time], outputs = output)
model1.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df2['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model2 = Model(inputs = [input_seq, input_time], outputs = output)
model2.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df3['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model3 = Model(inputs = [input_seq, input_time], outputs = output)
model3.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df4['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model4 = Model(inputs = [input_seq, input_time], outputs = output)
model4.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

#model.summary()

In [19]:
modela.fit([train_seqa, timestamps_traina], y_traina, epochs = 3, batch_size = 150, validation_data = ([test_seqa, timestamps_testa], y_testa))

Epoch 1/3
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 163ms/step - accuracy: 0.5853 - loss: 193.0028 - val_accuracy: 0.7096 - val_loss: 0.9043
Epoch 2/3
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 162ms/step - accuracy: 0.7092 - loss: 0.8873 - val_accuracy: 0.7096 - val_loss: 0.8606
Epoch 3/3
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 161ms/step - accuracy: 0.7072 - loss: 0.8620 - val_accuracy: 0.7096 - val_loss: 0.8534


<keras.src.callbacks.history.History at 0x1d3edec87a0>

In [27]:
model1.fit([train_seq1, timestamps_train1], y_train1, epochs = 3, batch_size = 50, validation_data = ([test_seq1, timestamps_test1], y_test1))

Epoch 1/3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 155ms/step - accuracy: 0.8095 - loss: 1.2734 - val_accuracy: 0.8138 - val_loss: 0.9921
Epoch 2/3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 156ms/step - accuracy: 0.8184 - loss: 0.9282 - val_accuracy: 0.8138 - val_loss: 0.7991
Epoch 3/3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 156ms/step - accuracy: 0.8059 - loss: 0.7860 - val_accuracy: 0.8138 - val_loss: 0.7189


<keras.src.callbacks.history.History at 0x1d3ed69d1c0>

In [29]:
model2.fit([train_seq2, timestamps_train2], y_train2, epochs = 3, batch_size = 50, validation_data = ([test_seq2, timestamps_test2], y_test2))

Epoch 1/3
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 156ms/step - accuracy: 0.3358 - loss: 2.6329 - val_accuracy: 0.3473 - val_loss: 2.3315
Epoch 2/3
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 156ms/step - accuracy: 0.3502 - loss: 2.2711 - val_accuracy: 0.3473 - val_loss: 2.1717
Epoch 3/3
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 156ms/step - accuracy: 0.3493 - loss: 2.1488 - val_accuracy: 0.3473 - val_loss: 2.1266


<keras.src.callbacks.history.History at 0x1d3ed9436b0>

In [31]:
model3.fit([train_seq3, timestamps_train3], y_train3, epochs = 3, batch_size = 150, validation_data = ([test_seq3, timestamps_test3], y_test3))

Epoch 1/3
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 159ms/step - accuracy: 0.1687 - loss: 2.9033 - val_accuracy: 0.1670 - val_loss: 2.6745
Epoch 2/3
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 159ms/step - accuracy: 0.1700 - loss: 2.6479 - val_accuracy: 0.1670 - val_loss: 2.6077
Epoch 3/3
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 160ms/step - accuracy: 0.1703 - loss: 2.5962 - val_accuracy: 0.1670 - val_loss: 2.5925


<keras.src.callbacks.history.History at 0x1d3eeb69340>

In [33]:
model4.fit([train_seq4, timestamps_train4], y_train4, epochs = 3, batch_size = 50, validation_data = ([test_seq4, timestamps_test4], y_test4))

Epoch 1/3
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 157ms/step - accuracy: 0.6933 - loss: 1.9345 - val_accuracy: 0.6836 - val_loss: 1.8393
Epoch 2/3
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 156ms/step - accuracy: 0.7041 - loss: 1.8176 - val_accuracy: 0.6836 - val_loss: 1.7377
Epoch 3/3
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 156ms/step - accuracy: 0.7026 - loss: 1.7074 - val_accuracy: 0.6836 - val_loss: 1.6488


<keras.src.callbacks.history.History at 0x1d3f6cf0110>

In [26]:
text = train_seqa[0:1]
time = timestamps_traina[0:1]
modela.predict([text, time])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step


array([[0.50132203, 0.1314287 , 0.363014  , 0.8041143 ]], dtype=float32)