In [1]:
!pip install tpot --quiet

In [2]:
import pandas as pd
import numpy as np
import warnings
from tpot import TPOTClassifier
from sklearn.model_selection import StratifiedShuffleSplit, RepeatedStratifiedKFold

# Silence every Python warning for the rest of the session. Note that this also
# hides diagnostics raised by pandas, scikit-learn and TPOT in the cells below.
warnings.filterwarnings('ignore')

In [3]:
# Read the artists/genres table from the attached input dataset and preview
# its first rows.
artists_n_genres = pd.read_csv(
    filepath_or_buffer='../input/artistspopulargenre/artists_n_genres.csv',
)
artists_n_genres.head()

In [4]:
# Remove the `genres` column and the `Unnamed: 0` column (presumably a leftover
# CSV index -- confirm against the file). The frame is rebound instead of being
# mutated in place, and errors='ignore' keeps the cell re-runnable: executing it
# a second time no longer raises KeyError for the already-removed columns.
artists_n_genres = artists_n_genres.drop(
    columns=['genres', 'Unnamed: 0'],
    errors='ignore',
)
artists_n_genres.head()

In [5]:
# Keep only the rows whose `popular_genre` value occurs more than
# MIN_ROWS_PER_GENRE times in the table.
MIN_ROWS_PER_GENRE = 100

genre_counts = artists_n_genres['popular_genre'].value_counts()
is_frequent_genre = artists_n_genres['popular_genre'].map(genre_counts) > MIN_ROWS_PER_GENRE

# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of the unfiltered table (SettingWithCopyWarning).
artists_n_genres = artists_n_genres[is_frequent_genre].copy()

In [6]:
# Encode each genre label as the integer code of its pandas category and store
# it next to the original label; this code column is used as the target below.
genre_as_category = artists_n_genres['popular_genre'].astype('category')
artists_n_genres['popular_genre_code'] = genre_as_category.cat.codes
artists_n_genres.head()

In [7]:
stratified_split = StratifiedShuffleSplit(
    test_size=.3,
    random_state=333,
    n_splits=2
)

X_train, X_test, y_train, y_test = ([] for i in range(4))

X = artists_n_genres.drop(['popular_genre_code'], axis=1)
y = artists_n_genres['popular_genre_code']


for train_idx, test_idx in stratified_split.split(np.zeros(len(X)), y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [8]:
X_test['popular_genre'].value_counts()

In [9]:
# Search budget for the genetic optimisation. These values were previously
# defined but never handed to TPOT, which silently fell back to its defaults
# (100 generations x 100 individuals).
generations = 3
population_size = 5

# Repeated stratified 5-fold cross-validation; n_repeats is left at the
# scikit-learn default (10), i.e. 50 fits per evaluated pipeline.
cv = RepeatedStratifiedKFold(n_splits=5, random_state=1)

pipeline_optimizer = TPOTClassifier(
    generations=generations,
    population_size=population_size,
    verbosity=2,
    random_state=42,
    memory='auto',
    n_jobs=-1,
    # Plain 'f1' uses binary averaging and raises on a target with more than
    # two classes; macro-averaged F1 is valid for binary and multiclass alike.
    # NOTE(review): assumes the genre target is multiclass -- switch to
    # 'f1_weighted' if class frequencies should weight the score.
    scoring='f1_macro',
    cv=cv,
    template='Transformer-Selector-Classifier',
    config_dict='TPOT light',
)

In [10]:
# Run the TPOT pipeline search on the training partition only; the test
# partition is not passed to fit().
pipeline_optimizer.fit(X_train, y_train)