In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, Input, Concatenate, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

## Purpose:

2 tier CNN

First tier predicts timezone

Second tier predicts state inside indicated timezone from tier 1


In [2]:
# Minimizes class disparity while stil allowing some variation between tweets from different states
def equalify(df, rstate):
    vals = df['Closest_State'].unique()
    samples = []
    
    for val in vals:
        filter_df = df[df['Closest_State'] == val]
        if len(filter_df) >= 1500:
            sample = filter_df.sample(n=1500, replace = False, random_state = rstate)
        else:
            sample = filter_df
        samples.append(sample)

    combined = pd.concat(samples)
    combined = combined.reset_index(drop = True)
    return combined

In [3]:
dfa = pd.read_csv('All_US_Time_tweets.csv')
dfa.shape

(374519, 7)

In [4]:
df1 = pd.read_csv('west_time_tweets.csv')
df1 = equalify(df1, 100)

df2 = pd.read_csv('central_time_tweets.csv')
df2 = equalify(df2, 100)

df3 = pd.read_csv('east_time_tweets.csv')
df3 = equalify(df3, 100)

df4 = pd.read_csv('mountain_time_tweets.csv')
df4 = equalify(df4, 100)

In [5]:
dfa['Closest_State'].value_counts()

Closest_State
New York                76393
New Jersey              47047
California              34634
Georgia                 25966
Florida                 20181
Texas                   18330
Maryland                18035
Virginia                15679
Ohio                    13774
North Carolina          13328
Illinois                 9478
Michigan                 9456
Pennsylvania             8949
South Carolina           5648
Conneticut               5549
Tennessee                5295
Massachusetts            4148
Arizona                  4136
Louisiana                4051
Alabama                  3136
Indiana                  2970
Washington               2963
Mississippi              2772
District of Columbia     2595
Missouri                 2568
Nevada                   1806
Minnesota                1567
Kentucky                 1369
Wisconsin                1324
Oklahoma                 1259
Rhode Island             1200
Delaware                 1159
Arkansas                 1

In [6]:
dfa['TweetText'] = dfa['TweetText'].astype(str)
df1['TweetText'] = df1['TweetText'].astype(str)
df2['TweetText'] = df2['TweetText'].astype(str)
df3['TweetText'] = df3['TweetText'].astype(str)
df4['TweetText'] = df4['TweetText'].astype(str)

In [7]:
# Teach the tokenizer on all available text, then sample out the main dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dfa['TweetText'].tolist())

dfa = equalify(dfa, 100)

In [8]:
one_hot = pd.get_dummies(dfa['Timezone'])
dfa = dfa.join(one_hot)

In [9]:
def make_onehot(df):
    one_hot = pd.get_dummies(df['Closest_State'])
    df = df.join(one_hot)
    #df = df.drop('Closest_State', axis = 1)
    return df

In [10]:
df1 = make_onehot(df1)
df2 = make_onehot(df2)
df3 = make_onehot(df3)
df4 = make_onehot(df4)

In [11]:
x_data = dfa[['TweetText','Timestamp']]
y_data = dfa[dfa['Timezone'].unique()]

x_dat1 = df1[['TweetText','Timestamp']]
y_dat1 = df1[df1['Closest_State'].unique()]

x_dat2 = df2[['TweetText','Timestamp']]
y_dat2 = df2[df2['Closest_State'].unique()]

x_dat3 = df3[['TweetText','Timestamp']]
y_dat3 = df3[df3['Closest_State'].unique()]

x_dat4 = df4[['TweetText','Timestamp']]
y_dat4 = df4[df4['Closest_State'].unique()]

In [12]:
x_traina, x_testa, y_traina, y_testa = train_test_split(x_data, y_data, test_size = 0.25, random_state = 40)
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_dat1, y_dat1, test_size = 0.25, random_state = 40)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_dat2, y_dat2, test_size = 0.25, random_state = 40)
x_train3, x_test3, y_train3, y_test3 = train_test_split(x_dat3, y_dat3, test_size = 0.25, random_state = 40)
x_train4, x_test4, y_train4, y_test4 = train_test_split(x_dat4, y_dat4, test_size = 0.25, random_state = 40)

In [13]:
x_data["TweetText"].str.split(" ").str.len().describe(percentiles=[0.01, 0.5, 0.99])

count    54565.000000
mean        12.062329
std          7.782837
min          1.000000
1%           2.000000
50%         11.000000
99%         28.000000
max        647.000000
Name: TweetText, dtype: float64

In [14]:
tmax_length = 29

In [15]:
def get_seq(x_train, x_test):
    train_sequences = tokenizer.texts_to_sequences(x_train['TweetText'].tolist())
    test_sequences = tokenizer.texts_to_sequences(x_test['TweetText'].tolist())
    
    train_seq = pad_sequences(train_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')
    test_seq = pad_sequences(test_sequences, maxlen = tmax_length, padding = 'post', truncating = 'post')

    return train_seq, test_seq

In [16]:
train_seqa, test_seqa = get_seq(x_traina, x_testa)
train_seq1, test_seq1 = get_seq(x_train1, x_test1)
train_seq2, test_seq2 = get_seq(x_train2, x_test2)
train_seq3, test_seq3 = get_seq(x_train3, x_test3)
train_seq4, test_seq4 = get_seq(x_train4, x_test4)

In [17]:
vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 150

In [18]:
timestamps_traina = x_traina['Timestamp']
timestamps_testa = x_testa['Timestamp']

timestamps_train1 = x_train1['Timestamp']
timestamps_test1 = x_test1['Timestamp']

timestamps_train2 = x_train2['Timestamp']
timestamps_test2 = x_test2['Timestamp']

timestamps_train3 = x_train3['Timestamp']
timestamps_test3 = x_test3['Timestamp']

timestamps_train4 = x_train4['Timestamp']
timestamps_test4 = x_test4['Timestamp']

In [19]:
input_seq = Input(shape = (tmax_length,), name = 'Input Sequence')
input_time = Input(shape = (1,), name = "Input Timestamp")

embed = Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = tmax_length)(input_seq)
convol = Conv1D(filters = 128, kernel_size = 3, activation = 'relu')(embed)
maxpool = GlobalAveragePooling1D()(convol)

concat = Concatenate()([maxpool, input_time])
dense1 = Dense(50, activation = 'relu')(concat)
dense2 = Dense(25, activation = 'relu')(dense1)
dense3 = Dense(15, activation = 'relu')(dense2)
output = Dense(len(dfa['Timezone'].unique()), activation = 'sigmoid')(dense3)

modela = Model(inputs = [input_seq, input_time], outputs = output)
modela.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df1['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model1 = Model(inputs = [input_seq, input_time], outputs = output)
model1.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df2['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model2 = Model(inputs = [input_seq, input_time], outputs = output)
model2.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df3['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model3 = Model(inputs = [input_seq, input_time], outputs = output)
model3.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

output = Dense(len(df4['Closest_State'].unique()), activation = 'sigmoid')(dense3)

model4 = Model(inputs = [input_seq, input_time], outputs = output)
model4.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

#model.summary()

In [20]:
modela.fit([train_seqa, timestamps_traina], y_traina, epochs = 3, batch_size = 100, validation_data = ([test_seqa, timestamps_testa], y_testa))

Epoch 1/3
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 157ms/step - accuracy: 0.3696 - loss: 108.0358 - val_accuracy: 0.5162 - val_loss: 31.8042
Epoch 2/3
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 157ms/step - accuracy: 0.3863 - loss: 39.3043 - val_accuracy: 0.5147 - val_loss: 25.1940
Epoch 3/3
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 157ms/step - accuracy: 0.4030 - loss: 32.2509 - val_accuracy: 0.3367 - val_loss: 39.8081


<keras.src.callbacks.history.History at 0x26923a9e8d0>

In [21]:
model1.fit([train_seq1, timestamps_train1], y_train1, epochs = 3, batch_size = 50, validation_data = ([test_seq1, timestamps_test1], y_test1))

Epoch 1/3
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 156ms/step - accuracy: 0.2631 - loss: 490.3237 - val_accuracy: 0.2680 - val_loss: 1.3889
Epoch 2/3
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 155ms/step - accuracy: 0.2735 - loss: 1.3937 - val_accuracy: 0.2629 - val_loss: 1.3859
Epoch 3/3
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 155ms/step - accuracy: 0.2791 - loss: 1.3847 - val_accuracy: 0.2629 - val_loss: 1.3843


<keras.src.callbacks.history.History at 0x2692948b230>

In [22]:
model2.fit([train_seq2, timestamps_train2], y_train2, epochs = 3, batch_size = 50, validation_data = ([test_seq2, timestamps_test2], y_test2))

Epoch 1/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 156ms/step - accuracy: 0.0839 - loss: 3.0292 - val_accuracy: 0.0830 - val_loss: 2.7148
Epoch 2/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 155ms/step - accuracy: 0.0890 - loss: 2.7029 - val_accuracy: 0.0832 - val_loss: 2.6806
Epoch 3/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 155ms/step - accuracy: 0.0907 - loss: 2.6699 - val_accuracy: 0.0835 - val_loss: 2.6597


<keras.src.callbacks.history.History at 0x2692c2be090>

In [23]:
model3.fit([train_seq3, timestamps_train3], y_train3, epochs = 3, batch_size = 100, validation_data = ([test_seq3, timestamps_test3], y_test3))

Epoch 1/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 158ms/step - accuracy: 0.0538 - loss: 3.1285 - val_accuracy: 0.0484 - val_loss: 3.1045
Epoch 2/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 157ms/step - accuracy: 0.0543 - loss: 3.1080 - val_accuracy: 0.0469 - val_loss: 3.0858
Epoch 3/3
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 157ms/step - accuracy: 0.0548 - loss: 3.0831 - val_accuracy: 0.0467 - val_loss: 3.0736


<keras.src.callbacks.history.History at 0x2692f53d5b0>

In [24]:
model4.fit([train_seq4, timestamps_train4], y_train4, epochs = 3, batch_size = 50, validation_data = ([test_seq4, timestamps_test4], y_test4))

Epoch 1/3
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 159ms/step - accuracy: 0.3851 - loss: 1.9356 - val_accuracy: 0.4550 - val_loss: 1.9063
Epoch 2/3
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 154ms/step - accuracy: 0.4174 - loss: 2.0989 - val_accuracy: 0.4550 - val_loss: 1.8695
Epoch 3/3
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 155ms/step - accuracy: 0.4620 - loss: 1.8547 - val_accuracy: 0.4550 - val_loss: 1.8355


<keras.src.callbacks.history.History at 0x269267ab230>

In [25]:
def getpred_seq(x_text):
    sequences = tokenizer.texts_to_sequences(x_text['TweetText'].tolist())
    seq = pad_sequences(sequences, maxlen = 29, padding = 'post', truncating = 'post')

    return seq

In [26]:
df_pf =pd.read_csv('All_US_Time_tweets.csv')
# random samples with different state to get different from what was trained on
df_pf = equalify(df_pf, 300)
df_pf['TweetText'] = df_pf['TweetText'].astype(str)

In [27]:
xp_data = df_pf[['TweetText','Timestamp']]
pred_time = xp_data[['Timestamp']]
ypt_data = df_pf[['Timezone']]
yps_data = df_pf[['Closest_State']]

In [28]:
pred_seq = getpred_seq(xp_data)

In [29]:
print(y_traina.columns)
print(y_train1.columns)
print(y_train2.columns)
print(y_train3.columns)
print(y_train4.columns)

Index(['Central', 'Mountain', 'West', 'East'], dtype='object')
Index(['California', 'Nevada', 'Oregon', 'Washington'], dtype='object')
Index(['Alabama', 'Arkansas', 'Illinois', 'Iowa', 'Kansas', 'Louisiana',
       'Minnesota', 'Mississippi', 'Missouri', 'Nebraska', 'North Dakota',
       'Oklahoma', 'South Dakota', 'Tennessee', 'Texas', 'Wisconsin'],
      dtype='object')
Index(['Conneticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
       'Indiana', 'Kentucky', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'New Hampshire', 'New Jersey', 'New York', 'North Carolina', 'Ohio',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'Vermont', 'Virginia', 'West Virginia'],
      dtype='object')
Index(['Arizona', 'Colorado', 'Idaho', 'Montana', 'New Mexico', 'Utah',
       'Wyoming'],
      dtype='object')


In [30]:
df_pf.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region,Timezone,Subtime
0,69662,nowwatching my fav show flip this house,Alabama,Tuskegee,Deep South,Central,South Central
1,25417,my opponent brother is running the elections,Alabama,Huntsville,Deep South,Central,South Central
2,3566,rt what to wear tonight help,Alabama,Birmingham,Deep South,Central,South Central
3,72235,rt why dont people know what to wear when its...,Alabama,Homewood,Deep South,Central,South Central
4,82911,sooo say ure havin a shitty flight huh jusaskin,Alabama,Montgomery,Deep South,Central,South Central


In [31]:
def get_scores(vals):
    tp = vals['TP']
    tn = vals['TN']
    fp = vals['FP']
    fn = vals['FN']

    prec = tp/(tp + fp)
    rec = tp/(tp + fn)

    f1 = 2*((prec*rec)/(prec+rec))

    print('Precision: ' + str(prec))
    print('Recall: ' + str(rec))
    print('F1: ' + str(f1))


In [32]:

run_list = ['North Carolina','Colorado','Wisconsin', 'California']
run_zone = ['East','Mountain','Central','West']

In [33]:
ypt_data

Unnamed: 0,Timezone
0,Central
1,Central
2,Central
3,Central
4,Central
...,...
54560,Mountain
54561,Mountain
54562,Mountain
54563,Mountain


In [34]:
i = 0
imax = df_pf.shape[0]
#imax = i + 50

eval_state = 'North Carolina'
eval_zone = 'East'

tzon = y_traina.columns
west_states = y_train1.columns
cent_states = y_train2.columns
east_states = y_train3.columns
mount_states = y_train4.columns

z_tp = 0
z_tn = 0
z_fp = 0
z_fn = 0

s_tp = 0
s_tn = 0
s_fp = 0
s_fn = 0

while i < imax:
    text = pred_seq[i:i+1]
    time = pred_time['Timestamp'][i:i+1]
    target_zone = ypt_data['Timezone'][i]
    target_state = yps_data['Closest_State'][i]

    time_pred = modela.predict([text,time], verbose = 0)
    zone = tzon[np.argmax(time_pred[0])]
    

    if zone == 'West':
        pred = model1.predict([text,time], verbose = 0)
        state_pred = west_states[np.argmax(pred[0])]

    elif zone == 'Central':
        pred = model2.predict([text,time], verbose = 0)
        state_pred = cent_states[np.argmax(pred[0])]

    elif zone == 'East':
        pred = model3.predict([text,time], verbose = 0)
        state_pred = east_states[np.argmax(pred[0])]

    elif zone == 'Mountain':
        pred = model4.predict([text,time], verbose = 0)
        state_pred = mount_states[np.argmax(pred[0])]

    else: 
        print('Timezone has no prediction')
        i == imax + 1

    #Timezone scoring
    if eval_zone == zone and target_zone == zone:
        z_tp += 1
    elif eval_zone != zone and target_zone == zone:
        z_tn += 1
    elif eval_zone == zone and target_zone != zone:
        z_fp += 1
    elif eval_zone != zone and target_zone != zone:
        z_fn += 1

    # State scoring
    if eval_state == state_pred and target_state == state_pred:
        s_tp += 1
    elif eval_state != state_pred and target_state == state_pred:
        s_tn += 1
    elif eval_state == state_pred and target_state != state_pred:
        s_fp += 1
    elif eval_state != state_pred and target_state != state_pred:
        s_fn += 1

    # print('Zone Eval: ' + eval_zone)
    # print('Zone Actual: ' + target_zone)
    # print('Zone Prediction: ' + zone)
    # print('State Eval: ' + eval_state)
    # print('State Actual: ' + target_state)
    # print('State Prediciton: ' + state_pred)
    # print('')
    if i % 500 == 0:
        clear_output()
        print('Rows processed: ' + str(i) + '/' + str(imax))

    i += 1

zone_perf = {'TP': z_tp, 'TN': z_tn, 'FP': z_fp, 'FN': z_fn}
state_perf = {'TP': s_tp, 'TN': s_tn, 'FP': s_fp, 'FN': s_fn}

Rows processed: 54500/54565


In [35]:
print(zone_perf)
print(state_perf)

{'TP': 32, 'TN': 5497, 'FP': 22, 'FN': 49014}
{'TP': 0, 'TN': 1497, 'FP': 0, 'FN': 53068}


In [37]:
get_scores(zone_perf)

Precision: 0.5925925925925926
Recall: 0.0006524487216082861
F1: 0.0013034623217922606
