In [1]:
from src.api.chatgpt import ChatMessage, ChatBot
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

In [26]:
dataset = datasets.load_iris()
data_train, data_test, target_train, target_test = train_test_split(dataset.data, dataset.target)

In [75]:
llm = ChatBot()
message = ChatMessage('user', f'Generate new features for the classification problem. The problem description '
                    f'\n{dataset.DESCR}'
                    f'\nFirst five examples of data: {dataset.data[0:5, :]}'
                    f'\nFeature names are: {dataset.feature_names}'
                    f'\nTarget classes are {dataset.target_names} with labels {np.unique(dataset.target)}'
                    f'\nGenerate python code for dataset as a numpy array'
                    f'\nTemplate is:'
                    f'\n``` python'
                    f'\ndef func(dataset):'
                    f'\n    # your code here'
                    f'\n    return dataset_with_new_features'
                    f'\n```')
system_message = ChatMessage('system', 'You are an assistant helping with writing code for AutoML application. Your answers should only contain executable python code for feature generation. The incoming data is numpy array')

In [76]:
result = llm.get_completion([system_message, message])

In [121]:
import os
import sys
os.getcwd()
sys.path.insert(1, os.path.join(os.getcwd(), '..', 'generated_code'))
with open('../generated_code/test.py', 'w') as f:
    f.write(result.choices[0].message.content.strip('```python'))
from generated_code.test import add_new_features
add_new_features(data_train)
print(result.choices[0].message.content.strip('```python'))


import numpy as np

def add_new_features(dataset):
    # Get the sepal area (sepal length * sepal width)
    sepal_area = dataset[:, 0] * dataset[:, 1]
    
    # Get the petal area (petal length * petal width)
    petal_area = dataset[:, 2] * dataset[:, 3]
    
    # Get the ratio of petal length to petal width
    petal_ratio = dataset[:, 2] / dataset[:, 3]
    
    # Get the ratio of sepal length to sepal width
    sepal_ratio = dataset[:, 0] / dataset[:, 1]
    
    # Concatenate the new features 


In [15]:
def concat_new_features(dataset, feature_generation_function):
    features = feature_generation_function(dataset)
    features = np.array(features).T
    return np.hstack((dataset, features))

In [19]:
from generated_code.test import add_new_features
data_train_new = concat_new_features(data_train, add_new_features)
data_test_new = concat_new_features(data_test, add_new_features)

Test for catboost

In [27]:
model = CatBoostClassifier()
model.fit(data_train, target_train)
model_new = CatBoostClassifier()
model_new.fit(data_train_new, target_train)
accuracy_score(model.predict(data_test), target_test), accuracy_score(model_new.predict(data_test_new), target_test)

Learning rate set to 0.070767
0:	learn: 1.0144456	total: 16.9ms	remaining: 16.8s
1:	learn: 0.9448284	total: 17.7ms	remaining: 8.85s
2:	learn: 0.8799820	total: 18.7ms	remaining: 6.21s
3:	learn: 0.8236247	total: 19.6ms	remaining: 4.87s
4:	learn: 0.7793497	total: 20.4ms	remaining: 4.05s
5:	learn: 0.7276643	total: 21.1ms	remaining: 3.5s
6:	learn: 0.6833598	total: 21.9ms	remaining: 3.11s
7:	learn: 0.6408072	total: 22.8ms	remaining: 2.82s
8:	learn: 0.5985325	total: 23.3ms	remaining: 2.56s
9:	learn: 0.5653548	total: 24.1ms	remaining: 2.38s
10:	learn: 0.5346229	total: 24.9ms	remaining: 2.24s
11:	learn: 0.5051886	total: 25.8ms	remaining: 2.12s
12:	learn: 0.4802842	total: 26.6ms	remaining: 2.02s
13:	learn: 0.4544520	total: 27.4ms	remaining: 1.93s
14:	learn: 0.4269606	total: 27.8ms	remaining: 1.82s
15:	learn: 0.4048896	total: 28.7ms	remaining: 1.76s
16:	learn: 0.3844647	total: 29.5ms	remaining: 1.71s
17:	learn: 0.3664319	total: 30.4ms	remaining: 1.66s
18:	learn: 0.3543803	total: 31.3ms	remaining:

(0.9210526315789473, 0.8947368421052632)

Fedot with base settings

In [21]:
from fedot.api.main import Fedot
model = Fedot(problem='classification', timeout=10, preset='best_quality')
model.fit(features=data_train, target=target_train)
model_new = Fedot(problem='classification', timeout=10, preset='best_quality')
model_new.fit(features=data_train_new, target=target_train)

  features = features.applymap(lambda x: x.strip() if isinstance(x, str) else x)


2023-10-06 05:00:14,709 - AssumptionsHandler - Memory consumption for fitting of the initial pipeline in main session: current 85.7 MiB, max: 86.1 MiB
2023-10-06 05:00:14,715 - ApiComposer - Initial pipeline was fitted in 1.6 sec.
2023-10-06 05:00:14,775 - ApiComposer - AutoML configured. Parameters tuning: False. Time limit: 10 min. Set of candidate models: ['logit', 'bernb', 'normalization', 'isolation_forest_class', 'pca', 'scaling', 'qda', 'poly_features', 'dt', 'mlp', 'fast_ica', 'resample', 'lgbm', 'rf', 'knn'].
2023-10-06 05:00:14,787 - ApiComposer - Pipeline composition started.


Generations:   0%|          | 1/10000 [00:00<?, ?gen/s]

2023-10-06 05:00:26,969 - MultiprocessingDispatcher - 2 individuals out of 2 in previous population were evaluated successfully.
2023-10-06 05:00:52,610 - MultiprocessingDispatcher - 21 individuals out of 21 in previous population were evaluated successfully.
2023-10-06 05:00:55,069 - MultiprocessingDispatcher - 12 individuals out of 12 in previous population were evaluated successfully.
2023-10-06 05:00:57,381 - MultiprocessingDispatcher - 12 individuals out of 12 in previous population were evaluated successfully.
2023-10-06 05:01:00,280 - MultiprocessingDispatcher - 20 individuals out of 20 in previous population were evaluated successfully.
2023-10-06 05:01:04,375 - MultiprocessingDispatcher - 32 individuals out of 32 in previous population were evaluated successfully.
2023-10-06 05:01:11,197 - MultiprocessingDispatcher - 54 individuals out of 54 in previous population were evaluated successfully.
2023-10-06 05:01:18,563 - MultiprocessingDispatcher - 54 individuals out of 54 in pre

Generations:   0%|          | 1/10000 [06:28<?, ?gen/s]

2023-10-06 05:06:43,874 - ApiComposer - Model generation finished





2023-10-06 05:06:43,940 - FEDOT logger - Final pipeline was fitted
2023-10-06 05:06:43,943 - FEDOT logger - Final pipeline: {'depth': 1, 'length': 1, 'nodes': [logit]}
logit - {}
2023-10-06 05:06:43,945 - MemoryAnalytics - Memory consumption for finish in main session: current 88.7 MiB, max: 89.4 MiB


  features = features.applymap(lambda x: x.strip() if isinstance(x, str) else x)


2023-10-06 05:06:45,286 - AssumptionsHandler - Memory consumption for fitting of the initial pipeline in main session: current 0.3 MiB, max: 0.7 MiB
2023-10-06 05:06:45,288 - ApiComposer - Initial pipeline was fitted in 1.0 sec.
2023-10-06 05:06:45,295 - ApiComposer - AutoML configured. Parameters tuning: False. Time limit: 10 min. Set of candidate models: ['logit', 'bernb', 'normalization', 'isolation_forest_class', 'pca', 'scaling', 'qda', 'poly_features', 'dt', 'mlp', 'fast_ica', 'resample', 'lgbm', 'rf', 'knn'].
2023-10-06 05:06:45,299 - ApiComposer - Pipeline composition started.


Generations:   0%|          | 1/10000 [00:00<?, ?gen/s]

2023-10-06 05:06:46,634 - MultiprocessingDispatcher - 2 individuals out of 2 in previous population were evaluated successfully.
2023-10-06 05:06:52,010 - MultiprocessingDispatcher - 21 individuals out of 21 in previous population were evaluated successfully.
2023-10-06 05:06:54,426 - MultiprocessingDispatcher - 12 individuals out of 12 in previous population were evaluated successfully.
2023-10-06 05:06:56,719 - MultiprocessingDispatcher - 12 individuals out of 12 in previous population were evaluated successfully.
2023-10-06 05:06:58,885 - MultiprocessingDispatcher - 20 individuals out of 20 in previous population were evaluated successfully.
2023-10-06 05:07:03,852 - MultiprocessingDispatcher - 32 individuals out of 32 in previous population were evaluated successfully.
2023-10-06 05:07:10,893 - MultiprocessingDispatcher - 54 individuals out of 54 in previous population were evaluated successfully.
2023-10-06 05:07:24,343 - MultiprocessingDispatcher - 54 individuals out of 54 in pre

Generations:   0%|          | 1/10000 [09:56<?, ?gen/s]


2023-10-06 05:16:42,305 - ApiComposer - Model generation finished
2023-10-06 05:16:45,506 - FEDOT logger - Final pipeline was fitted
2023-10-06 05:16:45,511 - FEDOT logger - Final pipeline: {'depth': 1, 'length': 1, 'nodes': [rf]}
rf - {'n_jobs': 8}
2023-10-06 05:16:45,517 - MemoryAnalytics - Memory consumption for finish in main session: current 2.5 MiB, max: 3.2 MiB


{'depth': 1, 'length': 1, 'nodes': [rf]}

In [25]:
accuracy_score(model.predict(data_test), target_test), accuracy_score(model_new.predict(data_test_new), target_test)

  features = features.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  features = features.applymap(lambda x: x.strip() if isinstance(x, str) else x)


(0.9210526315789473, 0.9210526315789473)