In [1]:
import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split
import os
import datetime
import flaml
import numpy as np
from sklearn.metrics import accuracy_score



## Embeddings from an OpenAI model

In [21]:
# Credentials come from the environment; a missing variable raises KeyError here
api_key = os.environ['OPENAI_API_KEY']
client = OpenAI(api_key=api_key)

# Location of the serialized dataset (JSON Lines: one JSON object per line).
# Point this at your own file if it lives elsewhere.
file_path = './serialized_dataset/manual_template_serialization.jsonl'

# Parse the JSONL file into a DataFrame, one row per line
df = pd.read_json(path_or_buf=file_path, lines=True)

# Function to get embeddings from OpenAI
def get_embeddings(text, model="text-embedding-ada-002", api_client=None):
    """Return the embedding vector for a single piece of text.

    Parameters
    ----------
    text : str
        Text to embed. It is sent to the API as a one-element batch.
    model : str, optional
        Name of the embedding model to request. Defaults to
        "text-embedding-ada-002", the model this notebook originally used.
    api_client : optional
        Object exposing ``embeddings.create(input=..., model=...)``. When left
        as ``None`` the notebook-level ``client`` is used, so existing calls
        such as ``df['record'].apply(get_embeddings)`` behave as before.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response's ``data``.
    """
    # The global `client` is only looked up when no explicit client is supplied,
    # which lets callers (and tests) inject a stand-in without a live API key.
    active_client = client if api_client is None else api_client
    response = active_client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Binary target: 1 when the 'output' text contains 'Yes', otherwise 0
df['is_fraud'] = df['output'].map(lambda answer: int('Yes' in answer))

# Embed each 'record' string (get_embeddings is invoked once per row)
df['embeddings'] = df['record'].apply(get_embeddings)

In [22]:
# Preview the first five rows, including the new 'is_fraud' and 'embeddings' columns
df.head(n=5)

Unnamed: 0,record,output,is_fraud,embeddings
0,The date and time of transaction is 2019-11-20...,Is there a fraud? Yes,1,"[-0.0054815104231238365, -0.006616673897951841..."
1,The date and time of transaction is 2020-02-02...,Is there a fraud? Yes,1,"[-0.0014854425098747015, 0.0005804102984257042..."
2,The date and time of transaction is 2019-12-04...,Is there a fraud? Yes,1,"[0.004667007829993963, -0.006207969039678574, ..."
3,The date and time of transaction is 2019-03-04...,Is there a fraud? Yes,1,"[0.0035521159879863262, -0.011009876616299152,..."
4,The date and time of transaction is 2019-12-11...,Is there a fraud? Yes,1,"[-0.005899201612919569, 0.007397518027573824, ..."


In [27]:
# Persist the frame (records, labels and embeddings) so the embedding calls
# do not have to be repeated.
embeddings_csv_path = './serialized_dataset/embeddings/manual_template_embeddings.csv'

# to_csv does not create missing parent directories, so make sure the target exists
os.makedirs(os.path.dirname(embeddings_csv_path), exist_ok=True)
df.to_csv(embeddings_csv_path)

In [12]:
# Number of components in the embedding stored at index label 0
len(df.loc[0, 'embeddings'])

1536

In [13]:
# Keep only the model inputs (embeddings) and the target label
embedded_df = df.loc[:, ['embeddings', 'is_fraud']]

## AutoML

In [14]:
# Stack the per-row embeddings into a 2-D feature matrix and pull out the labels
X = np.stack(embedded_df['embeddings'].tolist())
y = embedded_df['is_fraud'].values

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Search configuration passed straight through to AutoML.fit
automl_settings = dict(
    time_budget=600,             # maximum search time, in seconds
    task="binary",               # binary classification
    log_file_name="automl.log",  # file the search progress is logged to
)
automl = flaml.AutoML()

# Run the model / hyperparameter search on the training split
automl.fit(X_train, y_train, **automl_settings)

# Score the selected model on the held-out split
y_pred = automl.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Report what the search settled on: estimator name, its configuration, and its loss
for search_result in (automl.best_estimator, automl.best_config, automl.best_loss):
    print(search_result)

[flaml.automl.logger: 01-09 21:06:10] {1679} INFO - task = binary
[flaml.automl.logger: 01-09 21:06:10] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 01-09 21:06:10] {1788} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 01-09 21:06:10] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 01-09 21:06:10] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-09 21:06:12] {2344} INFO - Estimated sufficient time budget=16882s. Estimated necessary time budget=389s.
[flaml.automl.logger: 01-09 21:06:12] {2391} INFO -  at 1.7s,	estimator lgbm's best error=0.4018,	best estimator lgbm's best error=0.4018
[flaml.automl.logger: 01-09 21:06:12] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-09 21:06:14] {2391} INFO -  at 3.2s,	estimator lgbm's best error=0.4018,	best estimator lgbm's best error=0.4018
[flaml.automl.logger: 01-09 21:06:14] {22