In [None]:
# Installing requirements
! pip3 install transformers
! pwd

In [2]:
! pwd

/Users/mfaisal/Documents/22_BackUp_BU/Colab Notebooks/ai-cs640/project/src


In [45]:
# Import and imporatan initializations
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
# import torch
# import transformers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


import sys
import json
import os
import re

def_random_seed = 42
np.random.seed(def_random_seed) # for reproducibility


# profiles_file_path = '/home/demographicPrediction/User_demo_profiles.json'

# Part one: declare assisting functions to load data.

In [46]:
# remove links http or https
# remove usernames @User_Name92378
def removeLinksUserNames(x):
  regexMap={
      r"http\w*:\/\/[\w*\.*\/*]*\s": "", 
      r"[\w*:\/]+\.[\.*\w*\/*\~*]+\s+": "",
      r"@[\w_]+\s": ""
      }
  for regx in regexMap.keys():
    x = re.sub(regx, regexMap[regx], x)
  return x.lower()

In [47]:
# This file is a clean dataset that has the following:
# 290 user of age <= 21
# 290 user of age > 21
# Only english language
# ID - age - binary_age
def getAgeData(train_size = 500, max_tweets_per_example = 20, tokenizer = None, get_all = False):
  # age_labels_file_path = '/content/data/age_reduced_labeled_users.csv'
  # age_tweets_file_path = '/content/data/age_tweets.json'

  age_labels_file_path = 'data/age_reduced_labeled_users.csv'
  # age_labels_file_path = 'data/age_labeled_users.csv'
  age_tweets_file_path = 'data/tweets.json'



  np.random.seed(def_random_seed)

  # Labels
  age_Y = np.genfromtxt(age_labels_file_path, delimiter=',', dtype=object)
  np.random.shuffle(age_Y)
  age_Y_train = age_Y[0:train_size, :]
  age_Y_test = age_Y[train_size:, :]

  # Tweets
  with open(age_tweets_file_path) as tweets_file:
    tweets = json.load(tweets_file)

  age_X_train = []
  age_Y_train_ = []
  age_X_test = []
  age_Y_test_ = []

  for user in age_Y_train:
    user_name = user[0].decode("utf-8")
    i = 0
    while (get_all and i < len(tweets[user_name])) or (i < int(len(tweets[user_name]) - max_tweets_per_example * 0.75)):
      age_X_train.append([removeLinksUserNames(tweet) for tweet in tweets[user_name][i:i+max_tweets_per_example]])
      age_Y_train_.append(user[2])
      i += max_tweets_per_example
  
  for user in age_Y_test:
    user_name = user[0].decode("utf-8")
    i = 0
    while (get_all and i < len(tweets[user_name])) or (i < int(len(tweets[user_name]) - max_tweets_per_example * 0.75)):
      age_X_test.append([removeLinksUserNames(tweet) for tweet in tweets[user_name][i:i+max_tweets_per_example]])
      age_Y_test_.append(user[2])
      i += max_tweets_per_example

  if tokenizer != None:
    age_X_train = [[tokenizer.encode_plus(tweet, return_tensors="pt") for tweet in user_tweets ] for user_tweets in age_X_train]
    age_X_test = [[tokenizer.encode_plus(tweet, return_tensors="pt") for tweet in user_tweets ] for user_tweets in age_X_test]
  
  age_Y_train_ = np.array(age_Y_train_).astype(np.float64).astype(np.int16)
  age_Y_test_ = np.array(age_Y_test_).astype(np.float64).astype(np.int16)
  
  return age_X_train, age_Y_train_, age_X_test, age_Y_test_

In [5]:
# Get tweets embeddings
def getTweetsEmbeddings(X, max_tweets = 10, model = None):
  if model == None:
    model = transformers.AutoModel.from_pretrained("bert-base-cased", output_hidden_states=True)
  
  layers = [-4,-3,-2,-1]

  embedded_X = []

  for i in range(len(X)):
    user_tweets = X[i]

    tweet_embedding = []
    for j in range(len(user_tweets)):
      if j == max_tweets:
        break;
      tweet = user_tweets[j]

      # Encode tweet
      with torch.no_grad():
        output = model(**tweet)
      states = output.hidden_states
      embedding = torch.stack([states[i] for i in layers]).sum(0).squeeze().numpy()
      tweet_embedding.append(embedding)
    
    embedded_X.append([embedding for tweet_embedding in tweet_embedding for embedding in tweet_embedding])
      
  return embedded_X

In [6]:
# Reduce tweet embeddings
def projectEmbeddingPCA(x, x_m, x_u):
  x_ = np.matmul(x - x_m, x_u)
  return x_

def projectEmbeddings(X, word_size = 64, x_m = None, x_u = None, first_time = True):
  if first_time == True:
    # flatten embeddings
    X_flattened = np.array([tweet for tweets in X for tweet in tweets])
    
    # Get mean
    x_m = sum(X_flattened) / len(X_flattened)

    # Normalize and covariance
    X_n = X_flattened - x_m
    X_c = np.matmul(np.transpose(X_n), X_n) / len(X_n)

    # Get word_matrix
    x_u, x_s, x_Vh = np.linalg.svd(X_c)
    x_u_ = x_u[:, :word_size]
  else:
    x_u_ = x_u

  X_ = [projectEmbeddingPCA(x, x_m, x_u_) for x in X]
  return X_, x_m, x_u_

def reduceEmbeddingPCA(X, words_num = 5):
  X_t = np.array(X).T
  
  # Get the embedding mean
  X_m =  sum(X_t) / len(X_t)

  # Normalize and covariance
  X_n = X_t - X_m
  X_c = np.matmul(np.transpose(X_n), X_n) / len(X_n)

  u, s, Vh = np.linalg.svd(X_c)
  u_ = u[:, :words_num]

  X_ = np.zeros((words_num, X.shape[1]))
  X__ = np.matmul(X_t, u_).T
  X_[0:X__.shape[1]] = X__

  return X_


def reduceEmbeddings_(X, words_num = 5):
  X_ = np.empty((len(X), words_num, len(X[0][0])))

  for i in range(len(X_)):
    X_[i] = reduceEmbeddingPCA(X[i], words_num)

  return X_

def reduceEmbeddings(X, word_size = 64, words_num = 5, x_m = None, x_u = None, first_time = True):
  X_, x_m, x_u = projectEmbeddings(X, word_size, x_m, x_u)
  X_ = reduceEmbeddings_(X_, words_num).reshape(len(X), words_num * word_size)
  return X_, x_m, x_u


# Part Two: explores the use of BERT-based encodings

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
age_X_train, age_Y_train, age_X_test, age_Y_test = getAgeData(500, 15, tokenizer)

# 48155 (500,) 7638 (80,)
print(len(age_X_train), age_Y_train.shape, len(age_X_test), age_Y_test.shape)

2890 (2890,) 457 (457,)


In [7]:
age_X_train_embeddings_300_15 = getTweetsEmbeddings(age_X_train[0:300], 15)
age_X_test_embeddings_300_15 = getTweetsEmbeddings(age_X_test[0:300], 15)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predicti

In [25]:
word_size = 16
words_num = 15

age_X_train_embeddings_reduced, x_m, x_u = reduceEmbeddings(age_X_train_embeddings_300_15, word_size, words_num)
age_X_test_embeddings_reduced, _, _ = reduceEmbeddings(age_X_test_embeddings_300_15, word_size, words_num, x_m, x_u, False)

In [26]:
# Linear Logistic Regression
# Train some model
model = LogisticRegression(max_iter=10000, warm_start = True)
scores = cross_val_score(model, age_X_train_embeddings_reduced, age_Y_train[0:300], cv=5, scoring='f1_macro')
print(scores)
print(np.mean(scores))

[0.5959596  0.64912281 0.53333333 0.54685315 0.64517037]
0.5940878515412005


In [27]:
# Test the model
model.fit(age_X_train_embeddings_reduced, age_Y_train[0:300])
y_pred = model.predict(age_X_test_embeddings_reduced)
y_props = model.predict_proba(age_X_test_embeddings_reduced)


print(classification_report(y_true=age_Y_test[0:300], y_pred=y_pred))
print(confusion_matrix(y_true=age_Y_test[0:300], y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.45      0.39      0.42       160
           1       0.39      0.45      0.42       140

    accuracy                           0.42       300
   macro avg       0.42      0.42      0.42       300
weighted avg       0.42      0.42      0.42       300

[[63 97]
 [77 63]]


In [None]:
# Probalilities show overfitting
print(y_props)

# Part Three: try models based on CountVectorizer and TF/IDF encodings:

In [48]:
# Step one: Preprocess the data
# Count Vectorized and Naive Bayes
age_X_train, age_Y_train, age_X_test, age_Y_test = getAgeData(500, 200, None, True)
print(len(age_X_train), age_Y_train.shape, len(age_X_test), age_Y_test.shape)

# Flaten age_X_train
X_train_flat = []
for tweets in age_X_train:
  tweet_augmented = ""
  for tweet in tweets:
    tweet_augmented += tweet
  X_train_flat.append(tweet_augmented)

X_test_flat = []
for tweets in age_X_test:
  tweet_augmented = ""
  for tweet in tweets:
    tweet_augmented += tweet
  X_test_flat.append(tweet_augmented)

# vectorizer = CountVectorizer(stop_words='english')
# vectorizer.fit(X_train_flat)

# age_X_train_ = vectorizer.transform(X_train_flat)
# age_X_test_ = vectorizer.transform(X_test_flat)

500 (500,) 80 (80,)


In [53]:
# Step two: trying the different following architectures.
# Train Model
# ['identity', 'logistic', 'relu', 'softmax', 'tanh']
# (16,4,1) >> 68%
model_nb = Pipeline([
                ('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                # ('standardscaler', preprocessing.StandardScaler(with_mean=False)),
                # ('pca', TruncatedSVD(4096)),
                # ('mlpc', MLPClassifier(hidden_layer_sizes=(16,4,1), max_iter=10000,activation = 'logistic', solver='adam',random_state=123)),
                # ('clf1', MultinomialNB()),
                # ('clf2', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                # ('clf3', LogisticRegression(n_jobs=1, C=1e5)),
                # ("classifiers", RandomForestClassifier()),
                ("votingClassifier", VotingClassifier(
                    estimators=[
                        ('mlpc', MLPClassifier(hidden_layer_sizes=(16,4,1), max_iter=10000, activation = 'logistic', solver='adam',random_state=123)),
                        ('clf1', MultinomialNB()),
                        # ('clf2', SGDClassifier(penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                        ('clf3', LogisticRegression(n_jobs=1, C=1e5)),
                        ("classifiers", RandomForestClassifier()),
                        ],
                    voting='soft',
                    weights=[
                        1,
                        1,
                        # 1,
                        1,
                        1,
                    ],
                    flatten_transform=True)
                ),
              ])
scores = cross_val_score(model_nb, X_train_flat, age_Y_train, cv=5, scoring='f1_macro')
print(scores)
print(np.mean(scores))



[0.67532468 0.6068152  0.69987995 0.61444805 0.69951923]
0.6591974226336458




In [None]:
# Step three: test the chosen model
model_nb.fit(X_train_flat, age_Y_train)
y_pred_class = model_nb.predict(X_test_flat)

print("Classification Report")
print(classification_report(y_true=age_Y_test,y_pred=y_pred_class))
print("Confusion matrix:")
print(confusion_matrix(y_true=age_Y_test,y_pred=y_pred_class))

# Part four : early attempts code and now not used anymore.

In [None]:
# First step: read user labels and filter users
# Users data
# Header:   user_id  is_female  year_born  race

# Read Labels
labels_raw = pd.read_csv(labels_file_path)

# # Filters
id_filter = (labels_raw.user_id >= 0) & (labels_raw.user_id <= 9999999999)
gender_filter = (labels_raw.is_female == 0) | (labels_raw.is_female == 1)
year_filter = (labels_raw.year_born > 1900) & (labels_raw.year_born < 2020)
race_filter = (labels_raw.race >= 0) & (labels_raw.race < 5)
all_filter = id_filter & gender_filter & year_filter & race_filter

# Filtered data 
labels_filtered = labels_raw[all_filter]


# my_data = np.transpose(my_data)
# gender_dist = np.unique(my_data[1], return_counts= True)
# year_dist = np.unique(my_data[2], return_counts= True)
# race_dist = np.unique(my_data[3], return_counts= True)


# print(my_data.shape)
# print(labels_raw)
print(labels_filtered[0:10])
# print(gender_dist)
# print(year_dist)
# print(race_dist)


      user_id  is_female  year_born  race
0     12488.0        0.0     1980.0   4.0
1    719703.0        0.0     1985.0   4.0
2    722153.0        1.0     1973.0   3.0
4    755531.0        0.0     1982.0   4.0
5    811618.0        0.0     1987.0   3.0
6    822540.0        0.0     1979.0   4.0
7    865071.0        0.0     1995.0   4.0
8    988211.0        0.0     1965.0   4.0
9   1025311.0        1.0     1955.0   4.0
10  1143891.0        0.0     1976.0   3.0


In [None]:
# Step 2: read tweets and arrange it in pair list
with open(tweets_file_path) as json_data:
    data = json.load(json_data)

raw_tweets = []
for k in data:
    for v in data[k]:
        raw_tweets.append([k,v])

print(len(data))
print(len(raw_tweets))
print(raw_tweets[0:2])

3276
313473
[['12488', 'YKAR, a futuristic sans serif font by @EmmeranR - #Freebie #Font #Merci https://t.co/b6fBDvz6yZ'], ['12488', '@MBonvoyAssist Who can I contact about the very rude and poor service I’m experiencing during my current stay? Please and thank you!']]


In [None]:
# Step 3: choose the tweets that has choosen users and create (tweet - is_female - year_born - race) array
tweets_X = []
tweets_Y = []

for tweet in raw_tweets:
    user_info = labels_filtered[labels_filtered.user_id == float(tweet[0])].to_numpy()
    if len(user_info) > 0:
        tweets_X.append(tweet[1])
        tweets_Y.append(user_info[0,1:4].astype(int))

tweets_X = np.array(tweets_X)
tweets_Y = np.array(tweets_Y)

print(tweets_X.shape)
print(tweets_X[0:1])
print(tweets_Y[0:1])

(292114,)
['YKAR, a futuristic sans serif font by @EmmeranR - #Freebie #Font #Merci https://t.co/b6fBDvz6yZ']
[[   0 1980    4]]


In [None]:
# Step 4: choose random train, test, validation sets

# Train
train_indices = np.random.choice(tweets_X.shape[0], 140000, replace=False)
tweets_X_train = tweets_X[train_indices]
tweets_Y_train = tweets_Y[train_indices]

# Validation
validation_indices = [x for x in np.random.choice(tweets_X.shape[0], 40000, replace=False) if x not in train_indices]
tweets_X_val = tweets_X[validation_indices]
tweets_Y_val = tweets_Y[validation_indices]

# Test
test_indices = [x for x in np.random.choice(tweets_X.shape[0], 80000, replace=False) if x not in train_indices and x not in validation_indices]
tweets_X_test = tweets_X[test_indices]
tweets_Y_test = tweets_Y[test_indices]

# save data 
np.savetxt('/home/demographicPrediction/tweets_X_train.csv', tweets_X_train, delimiter=",", fmt='%s')
np.savetxt('/home/demographicPrediction/tweets_Y_train.csv', tweets_Y_train, delimiter=",")

np.savetxt('/home/demographicPrediction/tweets_X_val.csv', tweets_X_val, delimiter=",", fmt='%s')
np.savetxt('/home/demographicPrediction/tweets_Y_val.csv', tweets_Y_val, delimiter=",")

np.savetxt('/home/demographicPrediction/tweets_X_test.csv', tweets_X_test, delimiter=",", fmt='%s')
np.savetxt('/home/demographicPrediction/tweets_Y_test.csv', tweets_Y_test, delimiter=",")

print(tweets_Y_train.shape[0])
print(tweets_Y_val.shape[0])
print(tweets_Y_test.shape[0])

In [None]:
# Step 4 (alt to 1-4): load chosen data
tweets_X_train = genfromtxt('/home/demographicPrediction/tweets_X_train.csv', delimiter='NOOOO DEL AT ALLL', dtype= np.str_)[0:100]
tweets_Y_train = genfromtxt('/home/demographicPrediction/tweets_Y_train.csv', delimiter=',', dtype= np.int16)[0:100]

tweets_X_val = genfromtxt('/home/demographicPrediction/tweets_X_val.csv', delimiter='NOOOO DEL AT ALLL', dtype= np.str_)[0:100]
tweets_Y_val = genfromtxt('/home/demographicPrediction/tweets_Y_val.csv', delimiter=',', dtype= np.int16)[0:100]

tweets_X_test = genfromtxt('/home/demographicPrediction/tweets_X_test.csv', delimiter='NOOOO DEL AT ALLL', dtype= np.str_)[0:100]
tweets_Y_test = genfromtxt('/home/demographicPrediction/tweets_Y_test.csv', delimiter=',', dtype= np.int16)[0:100]

print(tweets_Y_train.shape[0])
print(tweets_Y_val.shape[0])
print(tweets_Y_test.shape[0])

100
100
100


In [None]:
# Uses BERT to get words embeddings; then using PCA
# Reduces words embeddings to size of "word_embedding_size"
# Reduces tweet embeddings to size of "tweet_embedding_size"
# Returns projected X, PCA matrix for words, PCA matrix tweets
# Note: doing PCA on the current dataset is similar to
# fine tunning the BERT model
# TODO: get n words per tweet
def getTweetsBertEmbeddings(X, word_embedding_size = 64, tweet_embedding_size = 4, word_matrix = None, word_embedding_mean = None):
  # Use last four layers by default
  layers = [-4, -3, -2, -1]

  # Getting tokenizer and pretrained model
  tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
  model = transformers.AutoModel.from_pretrained("bert-base-cased", output_hidden_states=True)

  # Getting encodings for a tweets
  encoded_tweets = [tokenizer.encode_plus(tweet, return_tensors="pt") for tweet in X]

  # Run model on encoded tweets
  tweets_embeddings = []                  # 3D matrix (tweet_num, tweet_vectors, word_vector)
  word_embeddings_num = 0
  for encoded_tweet in encoded_tweets:
    with torch.no_grad():
        output = model(**encoded_tweet)
    states = output.hidden_states
    embedding = torch.stack([states[i] for i in layers]).sum(0).squeeze().numpy()
    tweets_embeddings.append(embedding)
    word_embeddings_num+= len(embedding)

  print(np.array(tweets_embeddings[0]).shape)
  print(np.array(tweets_embeddings).shape)
  print(np.array(np.array(tweets_embeddings).ravel()).shape)
  # First time call using training data
  if word_matrix == None or word_embedding_mean == None:
    # Flat all words embeddings through all tweets
    t_N = len(tweets_embeddings)
    t_words_N = word_embeddings_num
    t_word_size = len(tweets_embeddings[0][0])
    flattened_embeddings = np.array(tweets_embeddings).reshape(t_words_N, t_word_size)

    # do PCA and choose best word_embedding_size
    # First normalize data
    word_embedding_mean = sum(flattened_embeddings) / flattened_embeddings.shape[0]
    flattened_embeddings_normalized = flattened_embeddings - word_embedding_mean
    flattened_embeddings_c = np.matmul(np.transpose(flattened_embeddings_normalized), flattened_embeddings_normalized) / flattened_embeddings_normalized.shape[0]

    # Get word vector
    u, s, Vh = np.linalg.svd(flattened_embeddings_c)
    word_matrix = u[:, :word_embedding_size]

    # For each tweet and for each word inside the tweet,
    # do the transformation into the new word space
    tweets_new_embeddings = np.array([np.matmul(tweet, word_matrix) for tweet in tweets_embeddings])

    # For each tweet, each best tweet_embedding_size word vectors
    # using pca and then get to represent the new tweet using it.
    reduced_tweets = np.zeros((len(tweets_new_embeddings), tweet_embedding_size, word_embedding_size))
    for ind in range(len(tweets_new_embeddings)):
      tweet = tweets_new_embeddings[ind]
      tweet_c = np.matmul(tweet, np.transpose(tweet)) / tweet.shape[1]
      u_, s_, Vh_ = np.linalg.svd(tweet_c)
      tweet_matrix = u_[:, :tweet_embedding_size]           # old_words_num * new_words_num
      new_tweet = np.matmul(tweet.T, tweet_matrix).T
      reduced_tweets[ind] = new_tweet

    return reduced_tweets, word_matrix, word_embedding_mean

  else:
    tweets_new_embeddings = np.array([np.matmul(tweet - word_embedding_mean, word_matrix) for tweet in tweets_embeddings])
    
    # copied from above
    # For each tweet, each best tweet_embedding_size word vectors
    # using pca and then get to represent the new tweet using it.
    reduced_tweets = np.zeros((len(tweets_new_embeddings), tweet_embedding_size, word_embedding_size))
    for ind in range(len(tweets_new_embeddings)):
      tweet = tweets_new_embeddings[ind]
      tweet_c = np.matmul(tweet, np.transpose(tweet)) / tweet.shape[1]
      u_, s_, Vh_ = np.linalg.svd(tweet_c)
      tweet_matrix = u_[:, :tweet_embedding_size]           # old_words_num * new_words_num
      new_tweet = np.matmul(tweet.T, tweet_matrix).T
      reduced_tweets[ind] = new_tweet

    return reduced_tweets
    

In [None]:
# Step 5: Get Reduced tweets 
word_size = 64
word_num = 5
reduced_tweets_train, word_u, word_m = getTweetsBertEmbeddings(tweets_X_train, word_size, word_num)
reduced_tweets_val = getTweetsBertEmbeddings(tweets_X_val, word_size, word_num, word_u, word_m)
reduced_tweets_test = getTweetsBertEmbeddings(tweets_X_test, word_size, word_num, word_u, word_m)

print(reduced_tweets_train.shape)
print(reduced_tweets_val.shape)
print(reduced_tweets_test.shape)

In [None]:
# Step 6: Add metrics functions

In [None]:
# Step 7: Build the model

In [None]:
# Step 8: Evaluate the model
#   - Train the model
#   - Track 

In [None]:
# Now tokenize each tweet
import nltk
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('punkt')

tweets_X_tokenized = [nltk.tokenize.word_tokenize(t) for t in tweets_X]

print(tweets_X_tokenized[0:5])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[['YKAR', ',', 'a', 'futuristic', 'sans', 'serif', 'font', 'by', '@', 'EmmeranR', '-', '#', 'Freebie', '#', 'Font', '#', 'Merci', 'https', ':', '//t.co/b6fBDvz6yZ'], ['@', 'MBonvoyAssist', 'Who', 'can', 'I', 'contact', 'about', 'the', 'very', 'rude', 'and', 'poor', 'service', 'I', '’', 'm', 'experiencing', 'during', 'my', 'current', 'stay', '?', 'Please', 'and', 'thank', 'you', '!'], ['@', 'SSlnes', 'I', '’', 'd', 'like', 'to', 'win', '!'], ['@', 'LuckyDucksNFT', '@', 'Rydog'], ['Now', 'I', "'m", 'heading', 'to', 'B1000th', 'Floor', '!', '#', 'quickrogue']]


In [None]:
import numpy as np
import torch
import transformers
import os

def get_word_idx(sent: str, word: str):
    return sent.split(" ").index(word)

def get_hidden_states(encoded, token_ids_word, model, layers):
    """Push input IDs through model. Stack and sum `layers` (last four by default).
    Select only those subword token outputs that belong to our word of interest
    and average them."""
    with torch.no_grad():
        output = model(**encoded)

    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers
    output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
    # Only select the tokens that constitute the requested word
    word_tokens_output = output[token_ids_word]

    return word_tokens_output.mean(dim=0)

def get_word_vector(sent , idx, tokenizer, model, layers):
    """Get a word vector by first tokenizing the input sentence, getting all token idxs
    that make up the word of interest, and then `get_hidden_states`."""
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    # get all token idxs that belong to the word of interest
    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)

    return get_hidden_states(encoded, token_ids_word, model, layers)

In [None]:

# Getting vectors for a sentence
sent = "I like cookies ." 
encoded_sent = tokenizer.encode_plus(sent, return_tensors="pt")
print(tokenized_sent)

# run model on encoded sentence
with torch.no_grad():
    output = model(**encoded_sent)
states = output.hidden_states
output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
print(len(output[0]))
print(output)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[  101,   146,  1176, 18621,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
768
tensor([[ 3.6612,  0.7498, -1.2082,  ..., -0.7623,  1.1960,  0.3004],
        [ 3.2393, -0.3290,  3.4289,  ...,  0.1825, -1.5243,  2.5883],
        [ 0.6567, -0.9944, -2.1942,  ...,  2.2502, -3.6377,  1.8441],
        [-0.5503,  0.1819,  4.4018,  ...,  1.7883,  0.3557,  3.4723],
        [ 2.3956, -1.1617,  0.7915,  ...,  1.0481,  0.5790, -0.1141],
        [ 0.7174,  0.3721, -0.0403,  ..., -0.0444,  0.9401, -0.4450]])
