In [13]:
import time
import logging

import keras
import openai
import dotenv
import bentoml
import numpy as np
import pandas as pd
from keras import layers

# Create a logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a console handler and set the log level
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Create a formatter and set it to the console handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)

# Add the console handler to the logger
logger.addHandler(console_handler)

In [2]:
dotenv.load_dotenv()

# Load Data

In [3]:
raw_df = pd.read_csv("conversations.csv")[["message", "message_type"]]
raw_df.rename(columns={
    "message": "text", 
    "message_type": "label"
}, inplace=True)
raw_df.head()

Unnamed: 0,text,label
0,"Hey guys, I'm having trouble with my linear re...",Question
1,What's your data look like? Are you using any ...,Comment
2,I'm using a dataset with 1000 samples and 10 f...,Answer
3,Have you checked for multicollinearity? Maybe ...,Comment
4,"Yeah, I did check for multicollinearity and re...",Answer


# Clean Data

In [4]:
raw_df['label'].value_counts()

label
Answer          605
Question        547
Comment         261
question         20
answer           19
comment           8
Response          5
Advice            3
Spam              2
Resource          1
Suggestion        1
Appreciation      1
Name: count, dtype: int64

In [5]:
allowed_labels = {'question', 'answer', 'comment'}
raw_df['label'] = raw_df['label'].str.lower()
raw_df = raw_df[raw_df['label'].isin(allowed_labels)]
raw_df['label'].value_counts()

label
answer      624
question    567
comment     269
Name: count, dtype: int64

# Embed Data

In [None]:
vectors = []
embedding_model = "text-embedding-3-large"

client = openai.OpenAI()

number_of_samples = len(raw_df)

for idx, text in enumerate(raw_df['text']):
    print(f"{idx}/{number_of_samples} - {text}")
    try:
        response = client.embeddings.create(
            input=text,
            model=embedding_model,
        )
        vector = response.data[0].embedding
        vectors.append(vector)
    except openai.APIError:
        logger.error(f"Failed to embed text: {text}")
        time.sleep(5)
        continue
    except openai.error.ServiceUnavailableError:
        logger.error("OpenAI Service is unavailable")
        break
    
vectors_arr = np.asarray(vectors, dtype=np.float64)
np.save("vectors.npy", vectors_arr)

# Train-test Split

In [14]:
def label_encoder(label):
    return {
        "question": 0,
        "answer": 1,
        "comment": 2
    }[label]

vectors_arr = np.load('vectors.npy')
df_full = raw_df.copy()
df_full['label'] = raw_df['label']
df_full['label_id'] = df_full['label'].apply(label_encoder)
df_full['vector'] = list(vectors_arr)

train_size = 0.8

df_train = df_full.sample(frac=train_size, random_state=42)
df_test = df_full.drop(df_train.index).reset_index(drop=True)

df_train.head()

Unnamed: 0,text,label,label_id,vector
894,"Hi! Yeah, I'd be happy to help. K-means is a t...",answer,1,"[-0.012134386226534843, -9.189688717015088e-05..."
1109,"Data analysts usually need strong SQL, spreads...",answer,1,"[0.0025824133772403, -0.022944526746869087, -0..."
413,"Awesome, thanks so much! I'll definitely look ...",comment,2,"[0.022549433633685112, 0.03613811731338501, -0..."
524,"Hey everyone, I'm just starting to learn about...",question,0,"[0.0012090475065633655, 0.02343117631971836, -..."
1039,"Awesome, thanks for the tips! I'll make sure t...",question,0,"[0.012299810536205769, 0.01986454799771309, -0..."


# Define Model

In [15]:
embedding_size = len(df_train['vector'].iloc[0])
num_classes = len(df_full['label'].unique())
input_layer = keras.Input((embedding_size, ))
hidden_layer = layers.Dense(embedding_size, activation='relu')(input_layer)
output_layer = layers.Dense(num_classes, activation='sigmoid')(hidden_layer)
classifier = keras.Model(
    inputs=[
        input_layer
    ],
    outputs=output_layer,
)
classifier.summary()

classifier.compile(
    # loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train Model

In [16]:
NUM_EPOCHS = 40
BATCH_SIZE = 32

# Split the x and y components of the train and validation subsets.
y_train = df_train['label_id']
x_train = np.stack(df_train['vector'])
y_test = df_test['label_id']
x_test = np.stack(df_test['vector'])

# Train the model for the desired number of epochs.
callback = keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)

class_counts = df_train['label_id'].value_counts()
total_count = class_counts.sum()
class_weight = {
    label: round(total_count / count, 4)
    for label, count in class_counts.items()
}
print("Class Weights: ", class_weight)

history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[callback],
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    # class_weight=class_weight,
)

# y_val = df_val['Encoded Label']
# x_val = np.stack(df_val['Embeddings'])

# print(classifier.evaluate(x=x_val, y=y_val, return_dict=True))

# y_hat = classifier.predict(x=x_val)
# y_hat = np.argmax(y_hat, axis=1)


Class Weights:  {1: 2.3407, 0: 2.6013, 2: 5.3091}
Epoch 1/40
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 123ms/step - accuracy: 0.7553 - loss: 0.6151 - val_accuracy: 0.9110 - val_loss: 0.2603
Epoch 2/40
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 120ms/step - accuracy: 0.9430 - loss: 0.1549 - val_accuracy: 0.9110 - val_loss: 0.2317
Epoch 3/40
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 125ms/step - accuracy: 0.9708 - loss: 0.0927 - val_accuracy: 0.9178 - val_loss: 0.2155
Epoch 4/40
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 136ms/step - accuracy: 0.9809 - loss: 0.0545 - val_accuracy: 0.9247 - val_loss: 0.2181
Epoch 5/40
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 132ms/step - accuracy: 0.9908 - loss: 0.0318 - val_accuracy: 0.9144 - val_loss: 0.2535
Epoch 6/40
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 116ms/step - accuracy: 0.9971 - loss: 0.0171 - val_accuracy: 0.92

# Manual Evaluation

In [17]:
text = "You need to run pip install"
text = "How do I install a package in Python?"
vector = client.embeddings.create(
    input=text,
    model=embedding_model,
).data[0].embedding

vector = np.asarray(vector, dtype=np.float64)

y_hat = classifier.predict(x=vector.reshape(1, -1))
y_hat_max = np.argmax(y_hat, axis=1)[0]

label = {
    0: "question",
    1: "answer",
    2: "comment",
}[y_hat_max]

print(label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
question


# Export Model

In [26]:
bento_model = bentoml.tensorflow.save_model("qa_classifier", classifier)

INFO:tensorflow:Assets written to: /var/folders/t5/3qb7sc1j15n13hjcgf4d857h0000gn/T/tmpny827sqgbentoml_model_qa_classifier/assets


INFO:tensorflow:Assets written to: /var/folders/t5/3qb7sc1j15n13hjcgf4d857h0000gn/T/tmpny827sqgbentoml_model_qa_classifier/assets


In [27]:
dir(bento_model)

['_Model__fs',
 '__abstractmethods__',
 '__annotations__',
 '__attrs_attrs__',
 '__attrs_init__',
 '__attrs_own_setattr__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_compress',
 '_custom_objects',
 '_export_ext',
 '_export_name',
 '_from_compressed',
 '_fs',
 '_info',
 '_model',
 '_runnable',
 '_tag',
 '_write_custom_objects',
 '_write_info',
 'create',
 'creation_time',
 'custom_objects',
 'enter_cloudpickle_context',
 'exit_cloudpickle_context',
 'export',
 'file_size',
 'flush',
 'from_fs',
 'get_typename',
 'guess_format',
 'import_from',
 'info',
 'load_model',
 'path',
 'path

In [29]:
dir(bento_model.load_model())

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_trackable_child',
 '_add_variable_with_custom_getter',
 '_build_shapes_dict',
 '_checkpoint_dependencies',
 '_copy_trackable_to_cpu',
 '_default_save_signature',
 '_deferred_dependencies',
 '_delete_tracking',
 '_deserialization_dependencies',
 '_deserialize_from_proto',
 '_export_to_saved_model_graph',
 '_gather_saveables_for_checkpoint',
 '_handle_deferred_dependencies',
 '_inbound_nodes',
 '_layers',
 '_lookup_dependency',
 '_losses',
 '_losses_override',
 '_maybe_initialize_trackable',
 '_name_based_attribute_restore',
 '_name_based_restores',
 '_no_dependency',
 '_object_ide

In [30]:
model = bentoml.tensorflow.load_model("qa_classifier")

In [32]:
model.load_model()

AttributeError: '_UserObject' object has no attribute 'load_model'