**To get the fastest predictions, enable a GPU for this notebook:**
* Navigate to Edit→Notebook Settings
* Select GPU from the Hardware Accelerator drop-down

(See https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=oM_8ELnJq_wd)

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and local source edits take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
!pip install caafe



In [8]:
from caafe import CAAFEClassifier # Automated Feature Engineering for tabular datasets
from tabpfn import TabPFNClassifier # Fast Automated Machine Learning method for small tabular datasets
from sklearn.ensemble import RandomForestClassifier

import os
import openai
import torch
from caafe import data
from sklearn.metrics import accuracy_score
from tabpfn.scripts import tabular_metrics
from functools import partial

import pandas as pd

In [None]:
# Read the key from the OPENAI_API_KEY environment variable so the real secret
# never has to be saved inside the notebook. The original placeholder is kept
# as the fallback, so the behaviour is unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

In [5]:
# AUC metric function from tabpfn's tabular_metrics module.
# NOTE(review): not referenced by any other cell visible in this notebook excerpt.
metric_used = tabular_metrics.auc_metric
# Load all datasets exposed by caafe's data module; entries whose files are
# missing are reported and skipped (see the cell output).
cc_test_datasets_multiclass = data.load_all_data()

Number of datasets: 10
Loading balance-scale 11 ..
Loading breast-w 15 ..
Loading cmc 23 ..
Loading credit-g 31 ..
Loading diabetes 37 ..
Loading tic-tac-toe 50 ..
Loading eucalyptus 188 ..
Loading pc1 1068 ..
Loading airlines 1169 ..
Loading jungle_chess_2pcs_raw_endgame_complete 41027 ..
health-insurance-lead-prediction-raw-data at datasets_kaggle/health-insurance-lead-prediction-raw-data/Health Insurance Lead Prediction Raw Data.csv not found, skipping...
pharyngitis at datasets_kaggle/pharyngitis/pharyngitis.csv not found, skipping...
spaceship-titanic at datasets_kaggle/spaceship-titanic/train.csv not found, skipping...
playground-series-s3e12 at datasets_kaggle/playground-series-s3e12/train.csv not found, skipping...
Downsampling balance-scale to 20.0% of samples
Downsampling breast-w to 10.0% of samples
Downsampling tic-tac-toe to 10.0% of samples


In [6]:
def save_config(dataset_name,target, task_type, data_out_path):
    """Write a single-dataset YAML config file.

    The file is written to
    ``<data_out_path>/<dataset_name>/<dataset_name>.yaml`` and lists the
    train/test CSV locations, the target column, the task type and a fold
    count of 1.

    Args:
        dataset_name: Name of the dataset; used for the output folder, the
            file name and the CSV paths inside the config.
        target: Name of the target column, written verbatim.
        task_type: Task label written verbatim (the caller in this notebook
            passes "binary", "multiclass" or "regression").
        data_out_path: Root directory under which the per-dataset folder
            lives. The folder is created if it does not exist yet.

    Returns:
        None. The function's only effect is the file it writes.
    """
    # "{{user}}" renders as the literal text "{user}" in the file; it is a
    # placeholder left for whoever consumes the config, not a Python field.
    config_lines = [
        f"- name: {dataset_name}",
        "  dataset:",
        f"    train: '{{user}}/data/{dataset_name}/{dataset_name}_train.csv'",
        f"    test: '{{user}}/data/{dataset_name}/{dataset_name}_test.csv'",
        f"    target: {target}",
        f"    type: {task_type}",
        "  folds: 1",
        "\n",
    ]
    config_str = "\n".join(config_lines)

    dataset_dir = f'{data_out_path}/{dataset_name}'
    # Make the function usable on its own: do not rely on the caller having
    # created the dataset folder beforehand.
    os.makedirs(dataset_dir, exist_ok=True)

    yaml_file_local = f'{dataset_dir}/{dataset_name}.yaml'
    # The context manager guarantees the handle is closed even if a write fails.
    with open(yaml_file_local, 'w') as f_local:
        f_local.write("--- \n \n")
        f_local.write(config_str)

    # yaml_file_benchmark = f'{setting_out_path}/{dataset_name}.yaml'
    # f = open(yaml_file_benchmark, 'w')
    # f.write("--- \n \n")
    # f.write(config_str)
    # f.close() 

In [10]:
# Root folder that receives one sub-directory per exported dataset.
data_path = "../data/"

for d in cc_test_datasets_multiclass:
    # First element of a dataset record is its name.
    ds_name = d[0]
    ds_path = f"{data_path}/{ds_name}"
    os.makedirs(ds_path, exist_ok=True)

    # Split with seed=0; only the first three returned values are used here.
    ds, df_train, df_test, _, _ = data.get_data_split(d, seed=0)

    # The last entry of ds[4] is taken as the target column name.
    target_column_name = ds[4][-1]
    target_col = target_column_name

    # Derive the task type from the number of distinct target values seen in
    # the training split.
    n_classes = df_train[target_col].nunique()
    task_type = (
        "binary" if n_classes == 2
        else "multiclass" if n_classes < 300
        else "regression"
    )

    # Write the YAML config, then the two splits, then the combined frame.
    save_config(ds_name, target_col, task_type, data_path)
    df_train.to_csv(f'{ds_path}/{ds_name}_train.csv', index=False)
    df_test.to_csv(f'{ds_path}/{ds_name}_test.csv', index=False)

    df_all = pd.concat([df_train, df_test])
    df_all.to_csv(f'{ds_path}/{ds_name}.csv', index=False)

    print(f"$CMD {ds_name} {task_type} test")



Using initial description (tried reading data//dataset_descriptions/openml_balance-scale.txt)
$CMD balance-scale multiclass test
Using initial description (tried reading data//dataset_descriptions/openml_breast-w.txt)
$CMD breast-w binary test
Using initial description (tried reading data//dataset_descriptions/openml_cmc.txt)
$CMD cmc multiclass test
Using initial description (tried reading data//dataset_descriptions/openml_credit-g.txt)
$CMD credit-g binary test
Using initial description (tried reading data//dataset_descriptions/openml_diabetes.txt)
$CMD diabetes binary test
Using initial description (tried reading data//dataset_descriptions/openml_tic-tac-toe.txt)
$CMD tic-tac-toe binary test
Using initial description (tried reading data//dataset_descriptions/openml_eucalyptus.txt)
$CMD eucalyptus multiclass test
Using initial description (tried reading data//dataset_descriptions/openml_pc1.txt)
$CMD pc1 binary test
Using initial description (tried reading data//dataset_descriptions/

In [32]:
# Take the first loaded dataset record for the CAAFE demonstration.
ds = cc_test_datasets_multiclass[0]
# Split it with seed=0; `ds` is rebound to the first value returned by
# get_data_split, and the last two returned values are discarded.
ds, df_train, df_test, _, _ = data.get_data_split(ds, seed=0)
# The last entry of ds[4] is used as the target column name.
target_column_name = ds[4][-1]
# The last element of the record is passed to CAAFE as the dataset description.
dataset_description = ds[-1]
# Bare expression so the cell displays ds[0] (the dataset name in the output).
ds[0]

Using initial description (tried reading data//dataset_descriptions/openml_tic-tac-toe.txt)


'tic-tac-toe'

In [None]:
# Pass both splits through caafe's make_datasets_numeric, then separate each
# split into features and target with data.get_X_y.
# NOTE(review): df_train/df_test are overwritten here, so re-running this cell
# feeds the already-converted frames back in — confirm that is harmless.
from caafe.preprocessing import make_datasets_numeric
df_train, df_test = make_datasets_numeric(df_train, df_test, target_column_name)
train_x, train_y = data.get_X_y(df_train, target_column_name)
test_x, test_y = data.get_X_y(df_test, target_column_name)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Accuracy before CAAFE 0.5833333333333334


In [None]:
### Setup Base Classifier

# clf_no_feat_eng = RandomForestClassifier()
# Use the GPU when torch reports one, otherwise fall back to the CPU.
clf_no_feat_eng = TabPFNClassifier(device=('cuda' if torch.cuda.is_available() else 'cpu'), N_ensemble_configurations=4)
# Pre-bind overwrite_warning=True so every later call to .fit(...) on this
# instance (including calls made by CAAFE) passes it automatically.
clf_no_feat_eng.fit = partial(clf_no_feat_eng.fit, overwrite_warning=True)

# Baseline: fit on the unmodified features and score on the held-out split.
clf_no_feat_eng.fit(train_x, train_y)
pred = clf_no_feat_eng.predict(test_x)
# accuracy_score is documented as (y_true, y_pred). Accuracy gives the same
# number either way, but the documented order keeps this line safe to copy
# for metrics that are not symmetric.
acc = accuracy_score(test_y, pred)
print(f'Accuracy before CAAFE {acc}')

In [None]:
### Setup and Run CAAFE - This will be billed to your OpenAI Account!

# Wrap the base classifier; CAAFE asks the given LLM for feature-engineering
# code over the configured number of iterations.
caafe_clf = CAAFEClassifier(base_classifier=clf_no_feat_eng,
                            llm_model="gpt-4",
                            iterations=2)

caafe_clf.fit_pandas(df_train,
                     target_column_name=target_column_name,
                     dataset_description=dataset_description)

pred = caafe_clf.predict(df_test)
# accuracy_score is documented as (y_true, y_pred). Accuracy gives the same
# number either way, but the documented order keeps this line safe to copy
# for metrics that are not symmetric.
acc = accuracy_score(test_y, pred)
print(f'Accuracy after CAAFE {acc}')

*Dataset description:*
 **Tic-Tac-Toe Endgame database**  
This database encodes the complete set of possible board configurations at the end of tic-tac-toe games, where "x" is assumed to have played first.  The target concept is "win for x" (i.e., true when "x" has one of 8 possible ways to create a "three-in-a-row").  


*Iteration 1*
```python

# Feature name: row_win
# Usefulness: This feature checks if there is a winning row for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'top-middle-square': [0.0, 0.0, 1.0], 'top-right-square': [1.0, 0.0, 1.0], ...
df['row_win'] = ((df['top-left-square'] == df['top-middle-square']) & (df['top-middle-square'] == df['top-right-square']) & (df['top-left-square'] == 1.0)) | \
                ((df['middle-left-square'] == df['middle-middle-square']) & (df['middle-middle-square'] == df['middle-right-square']) & (df['middle-left-square'] == 1.0)) | \
                ((df['bottom-left-square'] == df['bottom-middle-square']) & (df['bottom-middle-square'] == df['bottom-right-square']) & (df['bottom-left-square'] == 1.0))

```
Performance before adding features ROC 0.732, ACC 0.596.
Performance after adding features ROC 0.732, ACC 0.691.
Improvement ROC 0.000, ACC 0.094.
The code was executed and changes to ´df´ were kept.




*Iteration 2*
```python

# Feature name: col_win
# Usefulness: This feature checks if there is a winning column for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'middle-left-square': [1.0, 0.0, 2.0], 'bottom-left-square': [2.0, 1.0, 1.0], ...
df['col_win'] = ((df['top-left-square'] == df['middle-left-square']) & (df['middle-left-square'] == df['bottom-left-square']) & (df['top-left-square'] == 1.0)) | \
                ((df['top-middle-square'] == df['middle-middle-square']) & (df['middle-middle-square'] == df['bottom-middle-square']) & (df['top-middle-square'] == 1.0)) | \
                ((df['top-right-square'] == df['middle-right-square']) & (df['middle-right-square'] == df['bottom-right-square']) & (df['top-right-square'] == 1.0))

```
Performance before adding features ROC 0.732, ACC 0.691.
Performance after adding features ROC 0.831, ACC 0.781.
Improvement ROC 0.099, ACC 0.091.
The code was executed and changes to ´df´ were kept.



Accuracy after CAAFE 0.7083333333333334


In [None]:
# Show the feature-engineering code stored on the fitted CAAFE classifier.
print(caafe_clf.code)


# Feature name: row_win
# Usefulness: This feature checks if there is a winning row for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'top-middle-square': [0.0, 0.0, 1.0], 'top-right-square': [1.0, 0.0, 1.0], ...
df['row_win'] = ((df['top-left-square'] == df['top-middle-square']) & (df['top-middle-square'] == df['top-right-square']) & (df['top-left-square'] == 1.0)) | \
                ((df['middle-left-square'] == df['middle-middle-square']) & (df['middle-middle-square'] == df['middle-right-square']) & (df['middle-left-square'] == 1.0)) | \
                ((df['bottom-left-square'] == df['bottom-middle-square']) & (df['bottom-middle-square'] == df['bottom-right-square']) & (df['bottom-left-square'] == 1.0))

# Feature name: col_win
# Usefulness: This feature checks if there is a winning column for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'middle-left-square': [1.0, 0.0, 2.0], 'bottom-left-square': [2.0, 1

### Optional: download Kaggle data

In [16]:
#!ls ~/.kaggle/kaggle.json

# !mkdir ~/.kaggle
# !touch ~/.kaggle/kaggle.json

kaggle_api_token = {"username":"","key":""}

import json
with open('kaggle.json', 'w') as file:
    json.dump(kaggle_api_token, file)

    print("========================")

# !chmod 600 ~/.kaggle/kaggle.json
# !mkdir datasets_kaggle/

from caafe import data

for (name, _, _, user) in data.kaggle_dataset_ids:
    !kaggle datasets download -d {user}/{name}
    !mkdir datasets_kaggle/{name}
    !unzip {name}.zip -d datasets_kaggle/{name}

# Accept rules at https://www.kaggle.com/c/spaceship-titanic/rules
for name in data.kaggle_competition_ids:
    print(name)
    !kaggle competitions download -c {name}
    !mkdir datasets_kaggle/{name}
    !unzip {name}.zip -d datasets_kaggle/{name}

Traceback (most recent call last):
  File "/home/saeed/Documents/Github/CatDB/envCatDB/bin/kaggle", line 33, in <module>
    sys.exit(load_entry_point('kaggle==1.6.6', 'console_scripts', 'kaggle')())
  File "/home/saeed/Documents/Github/CatDB/envCatDB/bin/kaggle", line 25, in importlib_load_entry_point
    return next(matches).load()
  File "/usr/lib/python3.10/importlib/metadata/__init__.py", line 171, in load
    module = import_module(match.group('module'))
  File "/usr/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstra