**To get the fastest predictions, enable a GPU for this notebook:**
* Navigate to Edit→Notebook Settings
* Select GPU from the Hardware Accelerator drop-down
(see https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=oM_8ELnJq_wd for details)

In [1]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload imported modules before each cell executes, so edits to
# source files are picked up without restarting the kernel.
%autoreload 2

In [2]:
!pip install caafe

Collecting caafe
  Downloading caafe-0.1.6-py3-none-any.whl (23 kB)
Collecting openai (from caafe)
  Downloading openai-1.3.7-py3-none-any.whl (221 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 221.4/221.4 kB 3.1 MB/s eta 0:00:00
Collecting openml==0.12.0 (from caafe)
  Downloading openml-0.12.0.tar.gz (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.1/116.1 kB 14.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting tabpfn (from caafe)
  Downloading tabpfn-0.1.9-py3-none-any.whl (156 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 156.6/156.6 kB 15.4 MB/s eta 0:00:00
Collecting liac-arff>=2.4.0 (from openml==0.12.0->caafe)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... done
Collecting xmltodict (from openml==0.12.0->caafe)
  Downloading xmltodict-0.13.0-py2.py3-none

In [None]:
import os
from functools import partial
from getpass import getpass

import openai
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from caafe import CAAFEClassifier # Automated Feature Engineering for tabular datasets
from caafe import data
from caafe.preprocessing import make_datasets_numeric
from tabpfn import TabPFNClassifier # Fast Automated Machine Learning method for small tabular datasets
from tabpfn.scripts import tabular_metrics

  and should_run_async(code)


In [None]:
# Never paste the key into the notebook: it would be saved with the file.
# Use the OPENAI_API_KEY environment variable when it is set; otherwise ask for
# the key interactively (getpass hides the typed value and keeps it out of the output).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
# Keep a handle on the AUC metric function from tabpfn's tabular_metrics module
# (not referenced again in the cells visible here).
metric_used = tabular_metrics.auc_metric
# Load the benchmark datasets; the log printed below lists which ones were
# loaded, which were skipped because their files were missing, and which were downsampled.
cc_test_datasets_multiclass = data.load_all_data()

Number of datasets: 10
Loading balance-scale 11 ..
Loading breast-w 15 ..
Loading cmc 23 ..
Loading credit-g 31 ..
Loading diabetes 37 ..
Loading tic-tac-toe 50 ..
Loading eucalyptus 188 ..
Loading pc1 1068 ..
Loading airlines 1169 ..
Loading jungle_chess_2pcs_raw_endgame_complete 41027 ..
health-insurance-lead-prediction-raw-data at datasets_kaggle/health-insurance-lead-prediction-raw-data/Health Insurance Lead Prediction Raw Data.csv not found, skipping...
pharyngitis at datasets_kaggle/pharyngitis/pharyngitis.csv not found, skipping...
spaceship-titanic at datasets_kaggle/spaceship-titanic/train.csv not found, skipping...
playground-series-s3e12 at datasets_kaggle/playground-series-s3e12/train.csv not found, skipping...
Downsampling balance-scale to 20.0% of samples
Downsampling breast-w to 10.0% of samples
Downsampling tic-tac-toe to 10.0% of samples
Using initial description (tried reading data//dataset_descriptions/openml_airlines.txt)


In [None]:
# Index 5 is 'tic-tac-toe' in the load order printed by the previous cell.
selected_dataset = cc_test_datasets_multiclass[5]

# Fixed seed so the train/test split is the same on every run.
ds, df_train, df_test, _, _ = data.get_data_split(selected_dataset, seed=0)

# ds[4] is presumably the list of column names (TODO confirm); its last entry
# is used as the target column. The last element of ds is the description text.
ds_columns = ds[4]
target_column_name = ds_columns[-1]
dataset_description = ds[-1]

# Display the dataset name as the cell output.
ds[0]

Using initial description (tried reading data//dataset_descriptions/openml_tic-tac-toe.txt)


'tic-tac-toe'

In [None]:
# make_datasets_numeric is imported in the notebook's top import cell.
# Convert both splits to numeric columns, then separate features from the target.
df_train, df_test = make_datasets_numeric(df_train, df_test, target_column_name)
train_x, train_y = data.get_X_y(df_train, target_column_name)
test_x, test_y = data.get_X_y(df_test, target_column_name)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Accuracy before CAAFE 0.5833333333333334


In [None]:
### Setup Base Classifier

# Alternative baseline (uncomment to use a random forest instead of TabPFN):
# clf_no_feat_eng = RandomForestClassifier()

# Run TabPFN on the GPU when one is available, otherwise fall back to the CPU.
clf_no_feat_eng = TabPFNClassifier(
    device=('cuda' if torch.cuda.is_available() else 'cpu'),
    N_ensemble_configurations=4,
)
# Pre-bind overwrite_warning=True so every later call to .fit on this instance
# passes it without the caller having to know about the flag.
clf_no_feat_eng.fit = partial(clf_no_feat_eng.fit, overwrite_warning=True)

# Baseline: fit on the raw features and score on the held-out split.
clf_no_feat_eng.fit(train_x, train_y)
pred = clf_no_feat_eng.predict(test_x)
# accuracy_score takes (y_true, y_pred); keep that order so the call stays
# correct if the metric is later swapped for a non-symmetric one.
acc = accuracy_score(test_y, pred)
print(f'Accuracy before CAAFE {acc}')

In [None]:
### Setup and Run CAAFE - This will be billed to your OpenAI Account!

# Wrap the baseline classifier. Each iteration asks the LLM for one new feature
# and reports whether it was kept (see the per-iteration log in the output).
caafe_clf = CAAFEClassifier(base_classifier=clf_no_feat_eng,
                            llm_model="gpt-4",
                            iterations=2)

# Fit on the training frame; the description text is passed to the LLM as context.
caafe_clf.fit_pandas(df_train,
                     target_column_name=target_column_name,
                     dataset_description=dataset_description)

pred = caafe_clf.predict(df_test)
# accuracy_score takes (y_true, y_pred); keep that order so the call stays
# correct if the metric is later swapped for a non-symmetric one.
acc = accuracy_score(test_y, pred)
print(f'Accuracy after CAAFE {acc}')

*Dataset description:*
 **Tic-Tac-Toe Endgame database**  
This database encodes the complete set of possible board configurations at the end of tic-tac-toe games, where "x" is assumed to have played first.  The target concept is "win for x" (i.e., true when "x" has one of 8 possible ways to create a "three-in-a-row").  


*Iteration 1*
```python

# Feature name: row_win
# Usefulness: This feature checks if there is a winning row for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'top-middle-square': [0.0, 0.0, 1.0], 'top-right-square': [1.0, 0.0, 1.0], ...
df['row_win'] = ((df['top-left-square'] == df['top-middle-square']) & (df['top-middle-square'] == df['top-right-square']) & (df['top-left-square'] == 1.0)) | \
                ((df['middle-left-square'] == df['middle-middle-square']) & (df['middle-middle-square'] == df['middle-right-square']) & (df['middle-left-square'] == 1.0)) | \
                ((df['bottom-left-square'] == df['bottom-middle-square']) & (df['bottom-middle-square'] == df['bottom-right-square']) & (df['bottom-left-square'] == 1.0))

```
Performance before adding features ROC 0.732, ACC 0.596.
Performance after adding features ROC 0.732, ACC 0.691.
Improvement ROC 0.000, ACC 0.094.
The code was executed and changes to ´df´ were kept.




*Iteration 2*
```python

# Feature name: col_win
# Usefulness: This feature checks if there is a winning column for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'middle-left-square': [1.0, 0.0, 2.0], 'bottom-left-square': [2.0, 1.0, 1.0], ...
df['col_win'] = ((df['top-left-square'] == df['middle-left-square']) & (df['middle-left-square'] == df['bottom-left-square']) & (df['top-left-square'] == 1.0)) | \
                ((df['top-middle-square'] == df['middle-middle-square']) & (df['middle-middle-square'] == df['bottom-middle-square']) & (df['top-middle-square'] == 1.0)) | \
                ((df['top-right-square'] == df['middle-right-square']) & (df['middle-right-square'] == df['bottom-right-square']) & (df['top-right-square'] == 1.0))

```
Performance before adding features ROC 0.732, ACC 0.691.
Performance after adding features ROC 0.831, ACC 0.781.
Improvement ROC 0.099, ACC 0.091.
The code was executed and changes to ´df´ were kept.



Accuracy after CAAFE 0.7083333333333334


In [None]:
# Print the feature-engineering code that CAAFE generated and kept during fitting.
print(caafe_clf.code)


# Feature name: row_win
# Usefulness: This feature checks if there is a winning row for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'top-middle-square': [0.0, 0.0, 1.0], 'top-right-square': [1.0, 0.0, 1.0], ...
df['row_win'] = ((df['top-left-square'] == df['top-middle-square']) & (df['top-middle-square'] == df['top-right-square']) & (df['top-left-square'] == 1.0)) | \
                ((df['middle-left-square'] == df['middle-middle-square']) & (df['middle-middle-square'] == df['middle-right-square']) & (df['middle-left-square'] == 1.0)) | \
                ((df['bottom-left-square'] == df['bottom-middle-square']) & (df['bottom-middle-square'] == df['bottom-right-square']) & (df['bottom-left-square'] == 1.0))

# Feature name: col_win
# Usefulness: This feature checks if there is a winning column for "x" in the tic-tac-toe board.
# Input samples: 'top-left-square': [2.0, 2.0, 2.0], 'middle-left-square': [1.0, 0.0, 2.0], 'bottom-left-square': [2.0, 1

### Optional: download Kaggle data

In [None]:
#!ls ~/.kaggle/kaggle.json

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

kaggle_api_token = {"username":"","key":""}

import json
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(kaggle_api_token, file)

!chmod 600 ~/.kaggle/kaggle.json
!mkdir datasets_kaggle/

from caafe import data

for (name, _, _, user) in data.kaggle_dataset_ids:
    !kaggle datasets download -d {user}/{name}
    !mkdir datasets_kaggle/{name}
    !unzip {name}.zip -d datasets_kaggle/{name}

# Accept rules at https://www.kaggle.com/c/spaceship-titanic/rules
for name in data.kaggle_competition_ids:
    print(name)
    !kaggle competitions download -c {name}
    !mkdir datasets_kaggle/{name}
    !unzip {name}.zip -d datasets_kaggle/{name}