In [1]:
import pandas as pd
import numpy as np
import time
import random
from IPython.display import clear_output
import pickle

from transformers import BertTokenizer, TFBertModel




In [2]:
# Read the tweet CSV; the relative path is resolved against the working directory.
tweets_csv_path = 'All_US_Tweets.csv'
df = pd.read_csv(tweets_csv_path)

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Timestamp', 'TweetText', 'Closest_State', 'Closest_City', 'Region'], dtype='object')

In [4]:
# List the distinct values stored in the 'Region' column.
df['Region'].unique()

array(['South', 'Southwest', 'West Coast', 'Rockies', 'Northeast',
       'Midwest', 'NonCont'], dtype=object)

In [5]:
# State-name groupings, one tuple per region. Any of these can be assigned to
# ROI; the names are compared against df['Closest_State'] by exact string
# match, so spelling and capitalisation matter.
West_Coast = ('California', 'Oregon', 'Washington')
Southwest = ('Arizona', 'New Mexico', 'Oklahoma', 'Texas')
Rockies = ('Nevada', 'Utah', 'Colorado', 'Wyoming', 'Idaho', 'Montana')
Midwest = (
    'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Missouri', 'Iowa',
    'Minnesota', 'Wisconsin', 'Illinois', 'Indiana', 'Michigan', 'Ohio',
)
South = (
    'Arkansas', 'Louisiana', 'Mississippi', 'Tennessee', 'Kentucky', 'Alabama',
    'Georgia', 'Florida', 'South Carolina', 'North Carolina', 'Virginia',
    'West Virginia', 'Maryland', 'Delaware', 'District of Columbia',
)
# 'Connecticut' was previously misspelled 'Conneticut', which an exact-match
# filter would never find.
# NOTE(review): assumes the data uses the standard spelling - confirm with
# df['Closest_State'].unique().
Northeast = (
    'Pennsylvania', 'New Jersey', 'New York', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'Vermont', 'New Hampshire', 'Maine',
)
NonCont = ('Hawaii', 'Alaska', 'Puerto Rico')

In [6]:
# Run configuration for this pass of the notebook.
Cap_Per_State = 500      # upper bound on the rows sampled for any one state
ROI = NonCont            # tuple of state names processed in this run
out_df = pd.DataFrame()  # starts empty; filled with the sampled rows below

In [7]:
# Keep only the rows whose closest state is one of the names listed in ROI.
in_roi = df['Closest_State'].isin(ROI)
dfoi = df[in_roi]

In [8]:
# Draw a seeded random sample of at most Cap_Per_State rows for every state in
# ROI, then stack the per-state samples into out_df.
state_samples = []
for state in ROI:
    df_s = dfoi[dfoi['Closest_State'] == state]

    # Asking sample() for more rows than exist raises, so clamp the request to
    # the number of rows available for this state.
    sample_state = df_s.sample(n=min(len(df_s), Cap_Per_State), random_state=100)
    state_samples.append(sample_state)

# Concatenate once instead of growing the frame inside the loop. Rebinding
# out_df (rather than appending to its previous value) means re-running this
# cell gives the same result instead of duplicating rows.
if state_samples:
    out_df = pd.concat(state_samples, ignore_index=True)
else:
    # pd.concat rejects an empty list; an empty ROI yields an empty frame.
    out_df = pd.DataFrame()

In [9]:
# Coerce every entry to str so the .str accessor below works on all rows.
# NOTE(review): astype(str) may turn missing values into the literal text
# 'nan' - confirm whether the input can contain nulls.
out_df['TweetText'] = out_df['TweetText'].astype(str)

# A tweet's length is the number of pieces produced by splitting on a single
# space character; rows with more than max_tweet_words pieces are dropped.
max_tweet_words = 29
pieces_per_tweet = out_df['TweetText'].str.split(" ").str.len()
out_df = out_df[pieces_per_tweet <= max_tweet_words]

In [10]:
# Bert tools

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
b_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = TFBertModel.from_pretrained(bert_checkpoint)

def tokenize_tweet(tweet):
    """Run the module-level BERT tokenizer on ``tweet``.

    The tokenizer is called with padding and truncation switched on and is
    asked for TensorFlow tensors (``return_tensors="tf"``).

    Args:
        tweet: Text to tokenize; forwarded unchanged to ``b_tokenizer``.

    Returns:
        Whatever ``b_tokenizer`` returns for this input.
    """
    tokenizer_options = {"padding": True, "truncation": True, "return_tensors": "tf"}
    encoded = b_tokenizer(tweet, **tokenizer_options)
    return encoded

#tokenized_tweets = df['tweet_text'].apply(tokenize_tweet)

def get_bert_embeddings(tokenized_tweet):
    """Pass tokenizer output through ``bert_model`` and keep position 0.

    Args:
        tokenized_tweet: Input handed as-is to ``bert_model``; in this
            notebook it is the value produced by ``tokenize_tweet``.

    Returns:
        ``last_hidden_state[:, 0, :]`` of the model output, i.e. the slice at
        index 0 along the second axis for every item along the first axis.
    """
    hidden_states = bert_model(tokenized_tweet).last_hidden_state
    # Index 0 on the second axis; for BERT inputs this is presumably the
    # [CLS] token's vector - confirm against the tokenizer in use.
    return hidden_states[:, 0, :]

#embeddings = tokenized_tweets.apply(get_bert_embeddings)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [11]:
# Replace each tweet's text with its tokenizer output. The column is
# overwritten, so this cell is meant to run once per freshly built out_df.
tokenized_tweets = out_df['TweetText'].apply(tokenize_tweet)
out_df['TweetText'] = tokenized_tweets

In [12]:
# Time the embedding pass. perf_counter() is a monotonic, high-resolution
# clock, so the end - start difference cannot be skewed by wall-clock
# adjustments the way time.time() readings can.
start = time.perf_counter()
# Replace each row's tokenizer output with the vector returned by
# get_bert_embeddings (one model call per row).
out_df['TweetText'] = out_df['TweetText'].apply(get_bert_embeddings)
end = time.perf_counter()

In [13]:
# Break the elapsed seconds into whole hours, whole minutes and leftover seconds.
total_time = end - start
hours, within_hour = divmod(total_time, 3600)
mins, sec = divmod(within_hour, 60)

# Assemble the report piece by piece; seconds are rounded to two decimals.
report_parts = [
    str(hours), ' hrs, ',
    str(mins), ' min, ',
    str(np.round(sec, 2)), 'secs',
]
print("Execution took: " + "".join(report_parts))

Execution took: 0.0 hrs, 1.0 min, 10.5secs


In [14]:
# Preview the first rows; by this point 'TweetText' holds the values returned
# by get_bert_embeddings rather than the original text.
out_df.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City,Region
0,46868,"((tf.Tensor(-0.7304522, shape=(), dtype=float3...",Puerto Rico,Vega Alta,NonCont
1,32164,"((tf.Tensor(-0.29484046, shape=(), dtype=float...",Puerto Rico,Vega Alta,NonCont
2,81417,"((tf.Tensor(-0.40089443, shape=(), dtype=float...",Puerto Rico,Vega Alta,NonCont
3,65326,"((tf.Tensor(0.07629065, shape=(), dtype=float3...",Puerto Rico,Vega Alta,NonCont
4,44303,"((tf.Tensor(-0.116008, shape=(), dtype=float32...",Puerto Rico,Vega Alta,NonCont


In [15]:
# Inspect the Python type of the first stored embedding. Positional access is
# used because the earlier word-count filter drops rows without resetting the
# index, so a row labelled 0 is not guaranteed to exist.
type(out_df['TweetText'].iloc[0])

tensorflow.python.framework.ops.EagerTensor

In [16]:
# Persist the frame (metadata columns plus the embedding column) with pickle.
# The file name is fixed, so it does not follow ROI if that setting changes.
embeddings_path = 'noncont_embeddings.pkl'
with open(embeddings_path, 'wb') as pkl_out:
    pickle.dump(out_df, pkl_out)