In [19]:
import pandas as pd
import numpy as np
import random
from IPython.display import clear_output
import time
import pickle

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Dense, InputLayer, Concatenate
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

import torch
import ast

from transformers import BertTokenizer, TFBertModel


import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


In [38]:
with open('noncont_embeddings.pkl','rb') as file:
    df1 = pickle.load(file)

In [39]:
df1.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region
0,46868,"((tf.Tensor(-0.7304522, shape=(), dtype=float3...",Puerto Rico,Vega Alta,NonCont
1,32164,"((tf.Tensor(-0.29484046, shape=(), dtype=float...",Puerto Rico,Vega Alta,NonCont
2,81417,"((tf.Tensor(-0.40089443, shape=(), dtype=float...",Puerto Rico,Vega Alta,NonCont
3,65326,"((tf.Tensor(0.07629065, shape=(), dtype=float3...",Puerto Rico,Vega Alta,NonCont
4,44303,"((tf.Tensor(-0.116008, shape=(), dtype=float32...",Puerto Rico,Vega Alta,NonCont


In [40]:
df1['TweetText'][0]

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-7.30452180e-01,  1.42812163e-01, -2.74206281e-01,
        -4.85327661e-01, -7.75472760e-01,  1.44762307e-01,
         5.96346319e-01,  2.98698217e-01, -7.46789053e-02,
         1.17768615e-01, -2.73642987e-01, -1.61569834e-01,
         4.61329482e-02,  4.24091220e-01,  2.54503757e-01,
         2.39570849e-02,  2.62374222e-01,  3.75470996e-01,
         1.80000573e-01,  5.63000366e-02, -1.94995493e-01,
        -6.60272181e-01, -1.68437883e-01,  2.22740658e-02,
         3.01904559e-01, -1.52624145e-01, -7.42070526e-02,
         7.72434622e-02,  3.94778371e-01, -5.95441982e-02,
         9.14841425e-03,  4.79211450e-01, -2.86592901e-01,
        -4.85453516e-01,  2.31365442e-01,  1.87635332e-01,
         5.47725499e-01, -3.70217532e-01,  1.95502788e-01,
         4.13698971e-01, -3.30030173e-03,  2.16547608e-01,
         3.33371192e-01,  3.46483178e-02, -2.78095603e-01,
        -5.08533180e-01, -3.02663589e+00, -4.00662035e-01,
      

In [41]:
df3 = df1.copy()

In [42]:
# one hot encoding
one_hot = pd.get_dummies(df3['Region'])
df3 = df3.join(one_hot)
df3 = df3.drop('Region', axis = 1)
df3

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,NonCont
0,46868,"((tf.Tensor(-0.7304522, shape=(), dtype=float3...",Puerto Rico,Vega Alta,True
1,32164,"((tf.Tensor(-0.29484046, shape=(), dtype=float...",Puerto Rico,Vega Alta,True
2,81417,"((tf.Tensor(-0.40089443, shape=(), dtype=float...",Puerto Rico,Vega Alta,True
3,65326,"((tf.Tensor(0.07629065, shape=(), dtype=float3...",Puerto Rico,Vega Alta,True
4,44303,"((tf.Tensor(-0.116008, shape=(), dtype=float32...",Puerto Rico,Vega Alta,True
...,...,...,...,...,...
343,70464,"((tf.Tensor(-0.3734281, shape=(), dtype=float3...",Puerto Rico,Vega Alta,True
344,56806,"((tf.Tensor(-0.7381308, shape=(), dtype=float3...",Puerto Rico,Vega Alta,True
345,32236,"((tf.Tensor(-0.16844225, shape=(), dtype=float...",Puerto Rico,Vega Alta,True
346,36948,"((tf.Tensor(-0.5607163, shape=(), dtype=float3...",Puerto Rico,Vega Alta,True


In [43]:
x_dat = df3[['TweetText']]
y_dat = df3[df1['Region'].unique()]
timestamps = df3['Timestamp']

In [44]:
df3['TweetText'][0].shape

TensorShape([1, 768])

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x_dat, y_dat, test_size = 0.3, random_state = 50)

In [46]:
x_train

Unnamed: 0,TweetText
114,"((tf.Tensor(-0.32298625, shape=(), dtype=float..."
39,"((tf.Tensor(0.09829318, shape=(), dtype=float3..."
309,"((tf.Tensor(-0.2519303, shape=(), dtype=float3..."
162,"((tf.Tensor(-0.87546563, shape=(), dtype=float..."
242,"((tf.Tensor(-0.32779455, shape=(), dtype=float..."
...,...
70,"((tf.Tensor(0.16876903, shape=(), dtype=float3..."
132,"((tf.Tensor(-0.5527697, shape=(), dtype=float3..."
291,"((tf.Tensor(-0.59243524, shape=(), dtype=float..."
109,"((tf.Tensor(-0.8179846, shape=(), dtype=float3..."


In [47]:
y_train['NonCont'][0]

True

In [48]:
type(x_train['TweetText'][0])

tensorflow.python.framework.ops.EagerTensor

In [49]:
# Input layers
bert_input = Input(shape=(27,768), name='bert_input')
#timestamp_input = Input(shape=(1,), name='timestamp_input')

#input_layer = InputLayer(input_shape = (27,768), name = 'input_layer')

# Convolutional layer for BERT embeddings
conv_layer = Conv1D(30, kernel_size=3, strides=2, activation='relu')(bert_input)
pool_layer = GlobalMaxPooling1D()(conv_layer)

# Concatenate the timestamp with the convolutional output
#concatenated = Concatenate()([pool_layer, timestamp_input])

# Fully connected layers
dense1 = Dense(20, activation='relu')(pool_layer)
dense2 = Dense(15, activation='relu')(dense1)
output = Dense(7, activation='softmax')(dense2)

# Define the model
#rmodel = Model(inputs=[bert_input, timestamp_input], outputs=output)
rmodel = Model(inputs=bert_input, outputs=output)

# Compile the model
rmodel.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
rmodel.summary()


In [50]:
rmodel.fit(x_dat, y_dat, epochs=10, batch_size=32)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type tensorflow.python.framework.ops.EagerTensor).

rmodel = Sequential()
rmodel.add(InputLayer(input_shape = (27,768)))
rmodel.add(GlobalMaxPooling1D())
rmodel.add(Flatten())
rmodel.add(Dense(20, activation='relu'))
rmodel.add(Dense(15, activation='relu'))
rmodel.add(Dense(7, activation='softmax'))


rmodel.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
rmodel.summary()


rmodel.fit(x_dat, y_dat, epochs = 5, batch_size = 32)

In [51]:
print(tf.keras.__version__)

3.6.0
