In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, Input, Concatenate, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

## Purpose:

Takes a timezones tweets

Model Predicts state in either subregion or overall region

In [2]:
dfa = pd.read_csv('All_US_Time_tweets.csv')

In [3]:
#df = pd.read_csv('west_time_tweets.csv')
#df = pd.read_csv('central_time_tweets.csv')
df = pd.read_csv('east_time_tweets.csv')
to_removeny = df[df['Closest_State'] == 'New York'].sample(n = 40000, random_state = 40)
to_removenj = df[df['Closest_State'] == 'New Jersey'].sample(n = 20000, random_state = 40)
df = df.drop(to_removeny.index)
df = df.drop(to_removenj.index)
#df = pd.read_csv('mountain_time_tweets.csv')

df.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region,Timezone,Subtime
0,55851,he threatens all of us even my boss kids n th...,Conneticut,Andover,Northeast,East,North East
1,56229,omg my whole body hurts so bad i wanna cry i,Conneticut,Andover,Northeast,East,North East
2,65414,why what,Conneticut,Andover,Northeast,East,North East
3,82042,with who bitch,Conneticut,Andover,Northeast,East,North East
4,82080,rt in hurley eating this nasty ass general ts...,Conneticut,Andover,Northeast,East,North East


In [4]:
df.shape

(215646, 7)

In [5]:
print(df['Closest_State'].unique())
print(df['Subtime'].unique())

['Conneticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Indiana' 'Kentucky' 'Maine' 'Maryland' 'Massachusetts' 'Michigan'
 'New Hampshire' 'New Jersey' 'New York' 'North Carolina' 'Ohio'
 'Pennsylvania' 'Puerto Rico' 'Rhode Island' 'South Carolina' 'Vermont'
 'Virginia' 'West Virginia']
['North East' 'South East']


In [6]:
df['TweetText'] = df['TweetText'].astype(str)
dfa['TweetText'] = dfa['TweetText'].astype(str)

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dfa['TweetText'].tolist())

In [8]:
df2 = df.copy()

In [9]:
cols = ['Closest_City','Region']
df2 = df2.drop(cols, axis = 1)

In [10]:
df2 = df2[df2['Subtime']=='North East']

In [11]:
one_hot = pd.get_dummies(df2['Closest_State'])
df2 = df2.join(one_hot)
#df2 = df2.drop('Closest_State', axis = 1)

In [12]:
df2.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Timezone,Subtime,Conneticut,Delaware,District of Columbia,Indiana,Maine,...,Massachusetts,Michigan,New Hampshire,New Jersey,New York,Ohio,Pennsylvania,Rhode Island,Vermont,West Virginia
0,55851,he threatens all of us even my boss kids n th...,Conneticut,East,North East,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,56229,omg my whole body hurts so bad i wanna cry i,Conneticut,East,North East,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,65414,why what,Conneticut,East,North East,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,82042,with who bitch,Conneticut,East,North East,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,82080,rt in hurley eating this nasty ass general ts...,Conneticut,East,North East,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
x_dat = df2[['TweetText','Timestamp']]
y_dat = df2[df2['Closest_State'].unique()]

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_dat, y_dat, test_size = 0.25, random_state = 40)

In [15]:
x_dat["TweetText"].str.split(" ").str.len().describe(percentiles=[0.01, 0.5, 0.99])

count    133127.000000
mean         11.795827
std           7.809773
min           1.000000
1%            2.000000
50%          10.000000
99%          29.000000
max         710.000000
Name: TweetText, dtype: float64

In [16]:
tmax_length = 29

In [17]:
train_sequences = tokenizer.texts_to_sequences(x_train['TweetText'].tolist())
test_sequences = tokenizer.texts_to_sequences(x_test['TweetText'].tolist())

train_seq = pad_sequences(train_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')
test_seq = pad_sequences(test_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')

In [18]:
vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 150

In [19]:
timestamps_train = x_train['Timestamp']
timestamps_test = x_test['Timestamp']

In [20]:
input_seq = Input(shape = (tmax_length,), name = 'Input Sequence')
input_time = Input(shape = (1,), name = "Input Timestamp")

embed = Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = tmax_length)(input_seq)
convol = Conv1D(filters = 128, kernel_size = 3, activation = 'relu')(embed)
maxpool = GlobalAveragePooling1D()(convol)

concat = Concatenate()([maxpool, input_time])
dense1 = Dense(50, activation = 'relu')(concat)
dense2 = Dense(25, activation = 'relu')(dense1)
dense3 = Dense(15, activation = 'relu')(dense2)
output = Dense(len(df2['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model = Model(inputs = [input_seq, input_time], outputs = output)

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
#model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['f1_score'])

model.summary()

In [22]:
history = model.fit([train_seq, timestamps_train], y_train, epochs = 6, batch_size = 50, validation_data = ([test_seq, timestamps_test], y_test))

Epoch 1/6
[1m1997/1997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 164ms/step - accuracy: 0.2600 - loss: 28.9245 - val_accuracy: 0.2731 - val_loss: 2.1765
Epoch 2/6
[1m1997/1997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 164ms/step - accuracy: 0.2701 - loss: 2.1544 - val_accuracy: 0.2731 - val_loss: 2.1278
Epoch 3/6
[1m1997/1997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 165ms/step - accuracy: 0.2744 - loss: 2.1198 - val_accuracy: 0.2731 - val_loss: 2.1197
Epoch 4/6
[1m1997/1997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 164ms/step - accuracy: 0.2727 - loss: 2.1567 - val_accuracy: 0.2731 - val_loss: 2.1182
Epoch 5/6
[1m1997/1997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6436s[0m 3s/step - accuracy: 0.2708 - loss: 2.1191 - val_accuracy: 0.2731 - val_loss: 2.1180
Epoch 6/6
[1m1997/1997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 164ms/step - accuracy: 0.2731 - loss: 2.1121 - val_accuracy: 0.2731 - val_loss: 2.1179

In [None]:
#history = model.fit([train_seq, timestamps_train], y_train, epochs = 3, batch_size = 50, validation_data = ([test_seq, timestamps_test], y_test))

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()