In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, LSTM, Bidirectional, GRU, SimpleRNN
import time
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Read the merged corpus into a DataFrame
data = pd.read_csv('001_2802_merged_12000.csv')

# Inputs come from the 'text' column, targets from the 'label' column
X, y = data['text'], data['label']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Build TF-IDF features capped at 5000 terms: the vectorizer is fitted on the
# training texts only and then applied unchanged to the test texts
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [7]:
# Show the raw training texts followed by their sparse TF-IDF matrix
print(X_train, X_train_tfidf, sep='\n')

7774                            everything was reasonable
6743                   and right now she can't answer any
1467     how easy is it for you to get a good night sleep
2927     I just didn't feel well I just couldn't I jus...
6192        therapist that she talked back oh my gosh wow
                              ...                        
5226     I like when my kids now my oldest kid you kno...
5390     I'm getting a little old to be thinking of a ...
860                            I haven't been back as yet
7603     I still wouldn't wish it on somebody else com...
7270     and I was able to handle two of them but the ...
Name: text, Length: 6405, dtype: object
  (0, 3431)	0.8125335841688619
  (0, 4583)	0.290300966278865
  (0, 1521)	0.5054844444443967
  (1, 280)	0.42947387022426725
  (1, 271)	0.5456096350580723
  (1, 673)	0.30720893451566034
  (1, 3763)	0.3726050765823578
  (1, 2907)	0.3408528865335552
  (1, 3574)	0.37062050994998025
  (1, 253)	0.17638082586084694
  (2, 3839)	0

In [21]:
##Keras tokenizer

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
import time

# Directory used to save the figures of this experiment. `exist_ok=True`
# makes the call a no-op when the folder is already present, so no separate
# existence check is needed and the cell can be re-run safely.
output_dir = "keras.preprocessing_text_Tokenizer_ML"
os.makedirs(output_dir, exist_ok=True)

# Read the merged corpus into a DataFrame
data = pd.read_csv('001_2802_merged_12000.csv')

# Inputs come from the 'text' column, targets from the 'label' column
X, y = data['text'], data['label']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Tokenizer settings: vocabulary cap (most frequent words are kept) and the
# length every sequence has after padding/truncation
max_words, max_len = 5000, 100

# Fit the word index on the training texts only; "<OOV>" is the token
# configured for out-of-vocabulary words
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Turn each text of both splits into a sequence of integer word ids
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad and truncate at the end ('post') so every row has max_len entries
X_train_pad, X_test_pad = (
    pad_sequences(ids, maxlen=max_len, padding='post', truncating='post')
    for ids in (X_train_seq, X_test_seq)
)

In [23]:
# Print each stage of the Keras preprocessing next to its label:
# raw texts, integer token sequences, and the padded array
for _caption, _shown in (
    ("X_train: ", X_train),
    ("X_train_seq_token:", X_train_seq),
    ("X_train_pad: ", X_train_pad),
):
    print(_caption, _shown)

X_train:  7774                            everything was reasonable
6743                   and right now she can't answer any
1467     how easy is it for you to get a good night sleep
2927     I just didn't feel well I just couldn't I jus...
6192        therapist that she talked back oh my gosh wow
                              ...                        
5226     I like when my kids now my oldest kid you kno...
5390     I'm getting a little old to be thinking of a ...
860                            I haven't been back as yet
7603     I still wouldn't wish it on somebody else com...
7270     and I was able to handle two of them but the ...
Name: text, Length: 6405, dtype: object
X_train_seq_token: [[163, 12, 1572], [3, 86, 64, 68, 135, 435, 159], [67, 144, 28, 11, 26, 9, 4, 32, 5, 41, 176, 105], [2, 14, 87, 78, 63, 2, 14, 254, 2, 14, 87, 78, 86, 2, 254, 693, 76, 40, 8, 11], [260, 7, 68, 777, 81, 160, 10, 728, 328], [31, 7, 182], [19, 9, 169, 42, 347, 24, 425], [66, 8, 10, 2351, 3, 5, 3

In [15]:
### BERT_single_vs_BERT_NN
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, SimpleRNN
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel
import time

# Switch the global Keras dtype policy to 'mixed_float16' before any model is built
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Folder for the plots of this experiment (created only if missing)
output_dir = '004_BERT_tokenizer_NN'
os.makedirs(output_dir, exist_ok=True)

# Read the merged corpus and separate the text column from the label column
data = pd.read_csv('001_2802_merged_12000.csv')
X, y = data['text'], data['label']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Pre-trained 'bert-base-uncased' checkpoint: tokenizer first, then the TF model
tokenizer, bert_model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    TFBertModel.from_pretrained('bert-base-uncased'),
)

# Tokenize and encode sequences using BERT tokenizer with adjusted max_length
def tokenize_and_encode(texts, max_len=64):
    """Encode a collection of texts with the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes a pandas
            Series); the resulting list is handed to the tokenizer.
        max_len: Value forwarded as ``max_length``. ``padding`` is set to
            ``'max_length'`` and ``truncation`` is enabled, so this is the
            length requested for every encoded sequence. Defaults to 64.

    Returns:
        Whatever the tokenizer call returns, requested as TensorFlow tensors
        via ``return_tensors='tf'``.
    """
    encode_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(texts.tolist(), **encode_options)

# Encode the training and test texts with the same 64-token max_length
X_train_encoded, X_test_encoded = (
    tokenize_and_encode(split, max_len=64) for split in (X_train, X_test)
)
# Generate embeddings using BERT
def generate_embeddings(bert_model, encoded_inputs, batch_size=32):
    """Return the model's ``last_hidden_state`` for every encoded sequence.

    The model is fed at most ``batch_size`` rows per call and the per-batch
    results are concatenated along the first axis. Sending a whole dataset
    through the model in one call requires the activations of every row at
    the same time, which can exhaust memory on larger inputs.

    Args:
        bert_model: Callable invoked as
            ``bert_model(input_ids, attention_mask=...)`` whose result exposes
            ``last_hidden_state``.
        encoded_inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries; both are sliced with the same row ranges, so they are
            assumed to share their first dimension.
        batch_size: Maximum number of rows per model call. ``None``, or a
            value not smaller than the number of rows, results in a single
            call on the full inputs. Defaults to 32.

    Returns:
        The ``last_hidden_state`` values for all rows, in input order.

    Raises:
        ValueError: If ``batch_size`` is given but is smaller than 1.
    """
    input_ids = encoded_inputs['input_ids']
    attention_mask = encoded_inputs['attention_mask']

    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    n_rows = int(input_ids.shape[0])
    if batch_size is None or n_rows <= batch_size:
        # Everything fits in one batch (this also covers empty input).
        return bert_model(input_ids, attention_mask=attention_mask).last_hidden_state

    chunks = []
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        outputs = bert_model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        chunks.append(outputs.last_hidden_state)
    # Stitch the per-batch results back together in row order.
    return tf.concat(chunks, axis=0)

# Run each encoded split through BERT and keep the returned last hidden state
X_train_emb = generate_embeddings(bert_model=bert_model, encoded_inputs=X_train_encoded)
X_test_emb = generate_embeddings(bert_model=bert_model, encoded_inputs=X_test_encoded)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [16]:
# Inspect the BERT pipeline: the raw texts first, then the embedding shape,
# the texts again with a label, the tokenizer output and the embeddings
print(X_train)
for _caption, _shown in (
    ("X_train_emb.shape: ", X_train_emb.shape),
    ("X_train: ", X_train),
    ("X_train_encoded: ", X_train_encoded),
    ("X_train_emb: ", X_train_emb),
):
    print(_caption, _shown)

7774                            everything was reasonable
6743                   and right now she can't answer any
1467     how easy is it for you to get a good night sleep
2927     I just didn't feel well I just couldn't I jus...
6192        therapist that she talked back oh my gosh wow
                              ...                        
5226     I like when my kids now my oldest kid you kno...
5390     I'm getting a little old to be thinking of a ...
860                            I haven't been back as yet
7603     I still wouldn't wish it on somebody else com...
7270     and I was able to handle two of them but the ...
Name: text, Length: 6405, dtype: object
X_train_emb.shape:  (6405, 64, 768)
X_train:  7774                            everything was reasonable
6743                   and right now she can't answer any
1467     how easy is it for you to get a good night sleep
2927     I just didn't feel well I just couldn't I jus...
6192        therapist that she talked back o

In [28]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from IPython.display import display, HTML

# Step 1: the single example sentence that will be encoded
text = "I have trouble sleeping."

# Step 2: load the pre-trained 'bert-base-uncased' tokenizer, then the TF model
tokenizer, bert_model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    TFBertModel.from_pretrained('bert-base-uncased'),
)

# Step 3: Tokenize and encode the sequence using BERT tokenizer with adjusted max_length
def tokenize_and_encode(text, max_len=64):
    """Encode ``text`` with the module-level ``tokenizer``.

    Args:
        text: Input passed to the tokenizer unchanged (the notebook passes a
            single sentence string).
        max_len: Value forwarded as ``max_length``. ``padding`` is set to
            ``'max_length'`` and ``truncation`` is enabled, so this is the
            length requested for the encoded sequence. Defaults to 64.

    Returns:
        Whatever the tokenizer call returns, requested as TensorFlow tensors
        via ``return_tensors='tf'``.
    """
    encode_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(text, **encode_options)

# Encode the example sentence with a 64-token max_length
text_encoded = tokenize_and_encode(text=text, max_len=64)

# Step 4: Generate embeddings using BERT
def generate_embeddings(bert_model, encoded_inputs, batch_size=32):
    """Return the model's ``last_hidden_state`` for every encoded sequence.

    The model is fed at most ``batch_size`` rows per call and the per-batch
    results are concatenated along the first axis. Sending many rows through
    the model in one call requires the activations of every row at the same
    time, which can exhaust memory on larger inputs.

    Args:
        bert_model: Callable invoked as
            ``bert_model(input_ids, attention_mask=...)`` whose result exposes
            ``last_hidden_state``.
        encoded_inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries; both are sliced with the same row ranges, so they are
            assumed to share their first dimension.
        batch_size: Maximum number of rows per model call. ``None``, or a
            value not smaller than the number of rows, results in a single
            call on the full inputs. Defaults to 32.

    Returns:
        The ``last_hidden_state`` values for all rows, in input order.

    Raises:
        ValueError: If ``batch_size`` is given but is smaller than 1.
    """
    input_ids = encoded_inputs['input_ids']
    attention_mask = encoded_inputs['attention_mask']

    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    n_rows = int(input_ids.shape[0])
    if batch_size is None or n_rows <= batch_size:
        # Everything fits in one batch (this also covers empty input).
        return bert_model(input_ids, attention_mask=attention_mask).last_hidden_state

    chunks = []
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        outputs = bert_model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        chunks.append(outputs.last_hidden_state)
    # Stitch the per-batch results back together in row order.
    return tf.concat(chunks, axis=0)

# Generate the embeddings for the encoded sentence
text_emb = generate_embeddings(bert_model, text_encoded)

# Convert the tokenized inputs to a readable format: every entry of the
# encoding is turned into its NumPy value via `.numpy()`
encoded_str = {key: value.numpy() for key, value in text_encoded.items()}

# Display the encoded inputs with a turquoise background and red text in Jupyter notebook
html_code_encoded = f"""
<div style="background-color: turquoise; color: red; padding: 10px; font-family: monospace;">
<pre>{encoded_str}</pre>
</div>
"""
display(HTML(html_code_encoded))

# Optionally, also display the BERT embeddings: green background, pink text.
# The text colour has to be given through the CSS `color` property; a
# declaration with an unknown property name is ignored by the browser.
embedding_str = str(text_emb.numpy())
html_code_emb = f"""
<div style="background-color: green; color: pink; padding: 10px; font-family: monospace;">
<pre>{embedding_str}</pre>
</div>
"""
display(HTML(html_code_emb))


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w