In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, Input, Concatenate, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tweet dataset from a CSV file in the working directory.
df = pd.read_csv('All_US_tweets.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region
0,10229,i gotta get you readded to bbm,Alabama,Adamsville,South
1,16439,ahhh yes,Alabama,Adamsville,South
2,17131,an old locksmith,Alabama,Adamsville,South
3,2648,rt they are shooting at pentagon metro please...,Alabama,Adamsville,South
4,2750,as a matter of fact i wanna ask about that,Alabama,Adamsville,South


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(374519, 5)

In [4]:
# Distinct region labels; these become the classes the model predicts.
df['Region'].unique()

array(['South', 'Southwest', 'West Coast', 'Rockies', 'Northeast',
       'Midwest', 'NonCont'], dtype=object)

In [None]:
# State-to-region lookup tables: one tuple of state names per region.
# Together they list the 50 states plus the District of Columbia and Puerto Rico.
# NOTE(review): this is a nine-way grouping (it has East_Midwest and Deep_South),
# while the Region column in the data shows seven labels — confirm which scheme is intended.
West_Coast = ('California', 'Oregon', 'Washington')
Southwest = ('Arizona', 'New Mexico', 'Oklahoma', 'Texas')
Rockies = ('Nevada', 'Utah', 'Colorado', 'Wyoming', 'Idaho', 'Montana')
Midwest = ('North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Missouri', 'Iowa', 'Minnesota')
East_Midwest = ('Wisconsin', 'Illinois', 'Indiana', 'Michigan', 'Ohio')
South = ('Tennessee', 'Kentucky', 'North Carolina', 'Virginia', 'West Virginia', 'Maryland', 'Delaware', 'District of Columbia')
Deep_South = ('Arkansas', 'Louisiana', 'Mississippi', 'Alabama', 'Georgia', 'Florida', 'South Carolina')
# 'Connecticut' was previously misspelled 'Conneticut', which would never match a real state name.
Northeast = ('Pennsylvania', 'New Jersey', 'New York', 'Massachusetts', 'Rhode Island', 'Connecticut', 'Vermont', 'New Hampshire', 'Maine')
NonCont = ('Hawaii', 'Alaska', 'Puerto Rico')

In [5]:
# The tokenizer needs plain strings. Missing tweets are replaced with '' first,
# because astype(str) alone turns NaN into the literal word "nan", which would
# then be learned as a vocabulary token.
df['TweetText'] = df['TweetText'].fillna('').astype(str)

In [6]:
# Build the word -> integer-id vocabulary from every tweet in the dataset.
# NOTE(review): this runs before the train/test split, so held-out tweets also
# contribute to the vocabulary — confirm that is acceptable.
tweet_texts = list(df['TweetText'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweet_texts)

In [7]:
# Work on a copy so the frame loaded from disk stays untouched by the column edits below.
df2 = df.copy()

In [8]:
# Closest_City is neither a model input nor a label here, so remove it from the working copy.
cols = ['Closest_City']
df2 = df2.drop(columns=cols)

In [9]:
# Expand Region into one indicator column per region, then swap those columns
# in for the original text column.
one_hot = pd.get_dummies(df2['Region'])
df2 = pd.concat([df2.drop(columns='Region'), one_hot], axis=1)

In [10]:
# Check the result: Closest_City and Region are gone, and one indicator column per region was added.
df2.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Midwest,NonCont,Northeast,Rockies,South,Southwest,West Coast
0,10229,i gotta get you readded to bbm,Alabama,0,0,0,0,1,0,0
1,16439,ahhh yes,Alabama,0,0,0,0,1,0,0
2,17131,an old locksmith,Alabama,0,0,0,0,1,0,0
3,2648,rt they are shooting at pentagon metro please...,Alabama,0,0,0,0,1,0,0
4,2750,as a matter of fact i wanna ask about that,Alabama,0,0,0,0,1,0,0


In [11]:
# Model inputs: the tweet text and its timestamp.
feature_cols = ['TweetText', 'Timestamp']
# Targets: the one-hot region columns, ordered as the regions first appear in the data.
label_cols = df['Region'].unique()

x_dat = df2.loc[:, feature_cols]
y_dat = df2.loc[:, label_cols]

In [12]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_dat,
    y_dat,
    test_size=0.25,
    random_state=40,
)

In [13]:
# Count space-separated tokens per tweet and summarise the distribution,
# including the 1st and 99th percentiles used to choose a padding length.
words_per_tweet = x_dat['TweetText'].str.split(' ').str.len()
words_per_tweet.describe(percentiles=[0.01, 0.5, 0.99])

count    374519.000000
mean         11.836540
std           7.578886
min           1.000000
1%            2.000000
50%          10.000000
99%          29.000000
max         710.000000
Name: TweetText, dtype: float64

In [14]:
# Fixed sequence length for the model input. 29 is the 99th-percentile token count
# reported by the describe() summary, so only the longest ~1% of tweets get truncated.
tmax_length = 29

In [15]:
# Convert each tweet into a list of integer token ids using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(list(x_train['TweetText']))
test_sequences = tokenizer.texts_to_sequences(list(x_test['TweetText']))

# Force every sequence to exactly tmax_length ids: short tweets are zero-padded
# at the end, long tweets lose their trailing tokens.
pad_options = dict(maxlen=tmax_length, padding='post', truncating='post')
train_seq = pad_sequences(train_sequences, **pad_options)
test_seq = pad_sequences(test_sequences, **pad_options)

In [16]:
# Embedding table size: one row per known token, plus one extra row because
# id 0 is used for padding rather than for a word.
vocab_size = 1 + len(tokenizer.index_word)
# Width of each learned word vector.
embedding_dim = 150

In [17]:
# Second model input: the timestamp column for each split.
# NOTE(review): the values are passed through unscaled — consider normalising
# them before they are concatenated with the text features.
timestamps_train, timestamps_test = x_train['Timestamp'], x_test['Timestamp']

In [18]:
# Two-input classifier: a padded token-id sequence plus one scalar timestamp per tweet.
input_seq = Input(shape=(tmax_length,), name='Input Sequence')
input_time = Input(shape=(1,), name='Input Timestamp')

# Text branch: learned word embeddings -> width-3 convolution -> average over positions.
embed = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=tmax_length)(input_seq)
convol = Conv1D(filters=128, kernel_size=3, activation='relu')(embed)
# Named for what it is: the layer is global *average* pooling, not max pooling.
avgpool = GlobalAveragePooling1D()(convol)

# Join the pooled text features with the timestamp, then narrow through a small MLP.
concat = Concatenate()([avgpool, input_time])
dense1 = Dense(50, activation='relu')(concat)
dense2 = Dense(25, activation='relu')(dense1)
dense3 = Dense(15, activation='relu')(dense2)

# One output unit per region. Each tweet has exactly one region (one-hot targets),
# so the outputs must form a probability distribution: softmax, not per-unit sigmoid,
# is the activation that pairs with categorical cross-entropy.
output = Dense(len(df['Region'].unique()), activation='softmax')(dense3)

model = Model(inputs=[input_seq, input_time], outputs=output)

# categorical_crossentropy matches the one-hot encoded labels in y_train / y_test.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input Sequence (InputLayer)     [(None, 29)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 29, 150)      27806850    Input Sequence[0][0]             
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 27, 128)      57728       embedding[0][0]                  
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 128)          0           conv1d[0][0]                     
______________________________________________________________________________________________

In [19]:
# Train for 5 epochs in mini-batches of 150. The held-out split is scored after
# every epoch, and the per-epoch metrics are kept in `history`.
history = model.fit(
    [train_seq, timestamps_train],
    y_train,
    epochs=5,
    batch_size=150,
    validation_data=([test_seq, timestamps_test], y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Draw one figure per tracked metric, each comparing the training curve with the
# held-out curve across epochs.
for metric, label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('Model ' + label)
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()