In [14]:
import pandas as pd
import numpy as np
import re
from random import sample, choice
from sklearn.model_selection import train_test_split
import warnings
import os
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm


In [15]:
def multiple_max(df):
    """Map every dataset id to the positions of its best-scoring columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'dataset' column (at any position). Every other column
        is treated as a score that is compared within its row.

    Returns
    -------
    dict
        ``{dataset_id: [pos, ...]}`` where each ``pos`` is a 0-based column
        position in ``df.drop('dataset', axis=1)`` whose value equals the row
        maximum. Ties produce several positions. Use
        ``df.drop('dataset', axis=1).columns[positions]`` to turn positions
        into column names.

    Notes
    -----
    ``np.amax`` propagates NaN and NaN never compares equal, so a row that
    contains NaN maps to an empty list.
    """
    scores = df.drop('dataset', axis=1)
    # Read the id by column name. A positional lookup (column 0) would silently
    # use a score value as the key whenever 'dataset' is not the first column.
    dataset_ids = df['dataset'].tolist()

    maxes = {}      # {dataset: [max1, max2, ...]}
    for i, dataset_id in enumerate(dataset_ids):
        row = np.array(scores.iloc[i])
        maxes[dataset_id] = np.argwhere(row == np.amax(row)).flatten().tolist()
    return maxes

# Load the results table and turn dataset names of the form "<id>.arff"
# into integer ids.
pre = pd.read_csv("/users/guest/j/jhiggin6/Documents/Thesis/emp_results.csv")
pre['dataset'] = pre['dataset'].apply(lambda name: int(re.split(r'\.arff', name)[0]))

# Split the columns into two groups, keeping the original column order:
#   - names containing "+" go to the optimized group;
#   - names containing "-" (and no "+") go to the default group;
#   - 'dataset' is kept in both so each group carries its ids.
optimized_cols = [c for c in pre.columns if c == 'dataset' or "+" in c]
default_cols = [c for c in pre.columns
                if c == 'dataset' or ("-" in c and "+" not in c)]

optim_maxes = multiple_max(pre[optimized_cols])
default_maxes = multiple_max(pre[default_cols])

# optim_maxes   = {dataset: [max1, max2, ...]}
# default_maxes = {dataset: [max1, max2, ...]}

In [16]:
def train_meta_model(sk_algorithm, iters, X, y, valid_datasets, maxes, random_state=None):
    """Repeatedly split, fit and score an estimator against the sets of best answers.

    Parameters
    ----------
    sk_algorithm : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same object is
        re-fitted in place on every iteration.
    iters : int
        Number of independent train/test splits to evaluate.
    X, y : array-like
        Features and labels, passed straight to ``train_test_split``; ``y`` is
        also used for stratification.
    valid_datasets : sequence
        Dataset id of every row of ``X``, split alongside ``X`` and ``y``.
    maxes : mapping
        ``{dataset_id: collection}``; a prediction counts as correct when it is
        contained in ``maxes[dataset_id]``.
    random_state : int, numpy.random.RandomState or None, optional
        ``None`` (default) leaves the splits unseeded, exactly as before. An
        int seeds a single generator that is shared by all iterations, so the
        whole sequence of splits is repeatable while each iteration still gets
        a different split. A ``RandomState`` instance is used as given.

    Returns
    -------
    list of float
        One accuracy per iteration: the fraction of test rows whose prediction
        is in ``maxes`` for that row's dataset.
    """
    if isinstance(random_state, (int, np.integer)):
        # Build the generator once; reusing the int itself would give the
        # identical split on every iteration.
        split_rng = np.random.RandomState(random_state)
    else:
        split_rng = random_state

    accuracies = []
    for _ in range(iters):
        X_train, X_test, y_train, y_test, _ds_train, ds_test = train_test_split(
            X, y, valid_datasets, test_size=0.33, stratify=y, random_state=split_rng)
        sk_algorithm.fit(X_train, y_train)
        y_pred = sk_algorithm.predict(X_test)
        # A prediction is a hit when it is any of the tied best answers.
        correct = sum(1 for ds, prediction in zip(ds_test, y_pred)
                      if prediction in maxes[ds])
        accuracies.append(correct / len(ds_test))
    return accuracies

In [17]:
# Variables for meta-model
# `models` names the columns of each result row: the meta-dataset file name
# followed by one entry per meta-learner, in the order they are evaluated below.
models = ["Metadataset", "RandomForest", "LogisticRegression", "SVC", "KNeighbors",
          "GaussianNB", "DecisionTree", "AdaBoost", "GradientBoosting",
          "MLP", "Bagging", "ExtraTrees",  "Voting"]
results = []
maxes = optim_maxes
iters = 1000

# Load every meta-dataset once. NaN and both infinities are replaced by 0 here;
# replacing only +inf would leave -inf in the features.
meta_datasets_path = "/users/guest/j/jhiggin6/Documents/Thesis/meta_datasets"
meta_datasets = {}
for meta_dataset in os.listdir(meta_datasets_path):
    meta_dataset_file = os.path.join(meta_datasets_path, meta_dataset)
    if not os.path.isfile(meta_dataset_file):
        # Skip sub-directories; only regular files can be parsed as CSV.
        continue
    meta_datasets[meta_dataset] = pd.read_csv(meta_dataset_file).replace(
        [np.nan, np.inf, -np.inf], 0)

# NOTE(review): no seed is set in this cell for the label draw below, so
# repeated runs can produce different numbers.
for df_name, df in tqdm(meta_datasets.items(), desc="Meta-datasets"):
    print('df_name:', df_name)

    # Keep only the rows whose dataset id has a known set of best answers.
    has_label = np.array([ds in maxes for ds in df['dataset']], dtype=bool)
    labelled = df.loc[has_label]
    valid_datasets = labelled['dataset'].tolist()

    # NOTE(review): features are every column after the first, which assumes
    # 'dataset' is column 0 of the CSV — confirm against the file layout.
    X = labelled.iloc[:, 1:].to_numpy()
    # When several answers tie for best, one of them is drawn at random as the label.
    y = np.array([choice(maxes[ds]) for ds in valid_datasets])

    # Fresh, unfitted estimators for this meta-dataset.
    rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    lr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    svm = SVC(gamma='auto')
    knn = KNeighborsClassifier(n_neighbors=3)
    nb = GaussianNB()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100, random_state=0)
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator` — confirm against the installed version before upgrading.
    bg = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=0)
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
    # The voting ensemble fits its own clones of the members, so it can be
    # built before the members are evaluated individually.
    vc = VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('svm', svm), ('knn', knn), ('nb', nb), ('dt', dt), ('ab', ab), ('gb', gb), ('nn', nn), ('bg', bg), ('et', et)], voting='hard')

    # Evaluate in the same order as models[1:], so each mean lands in the
    # matching column of the results table.
    results_arr = [df_name]
    for meta_learner in (rf, lr, svm, knn, nb, dt, ab, gb, nn, bg, et, vc):
        learner_accuracies = train_meta_model(meta_learner, iters, X, y, valid_datasets, maxes)
        results_arr.append(np.mean(learner_accuracies))

    assert len(results_arr) == len(models), "result row does not match the `models` columns"
    results.append(results_arr)

Meta-datasets:   0%|          | 0/13 [00:00<?, ?it/s]

df_name: concept_metadataset.csv


Meta-datasets:   8%|▊         | 1/13 [17:42<3:32:26, 1062.22s/it]

df_name: clustering_metadataset.csv


Meta-datasets:  15%|█▌        | 2/13 [41:36<3:54:53, 1281.21s/it]

df_name: all_metadataset.csv


Meta-datasets:  23%|██▎       | 3/13 [2:30:12<10:11:54, 3671.49s/it]

df_name: general_metadataset.csv


Meta-datasets:  31%|███       | 4/13 [2:52:26<6:52:16, 2748.53s/it] 

df_name: model-based_metadataset.csv


Meta-datasets:  38%|███▊      | 5/13 [3:23:26<5:23:46, 2428.26s/it]

df_name: landmarking_metadataset.csv


Meta-datasets:  46%|████▌     | 6/13 [3:52:17<4:15:37, 2191.07s/it]

df_name: itemset_metadataset.csv


Meta-datasets:  54%|█████▍    | 7/13 [4:11:55<3:06:00, 1860.04s/it]

df_name: complexity_metadataset.csv


Meta-datasets:  62%|██████▏   | 8/13 [4:46:19<2:40:24, 1924.84s/it]

df_name: info-theory_metadataset.csv


Meta-datasets:  69%|██████▉   | 9/13 [5:15:20<2:04:29, 1867.31s/it]

df_name: d2v_pretrained_metadaset.csv


Meta-datasets:  77%|███████▋  | 10/13 [5:54:10<1:40:30, 2010.10s/it]

df_name: default_metadataset.csv


Meta-datasets:  85%|████████▍ | 11/13 [7:19:08<1:38:30, 2955.21s/it]

df_name: statistical_metadataset.csv


Meta-datasets:  92%|█████████▏| 12/13 [8:05:20<48:19, 2899.63s/it]  

df_name: relative_metadataset.csv


Meta-datasets: 100%|██████████| 13/13 [8:28:28<00:00, 2346.80s/it]


In [18]:
# Column labels for the results table: the meta-dataset file name first, then
# one column per meta-learner.
models = [
    "Metadataset",
    "RandomForest",
    "LogisticRegression",
    "SVC",
    "KNeighbors",
    "GaussianNB",
    "DecisionTree",
    "AdaBoost",
    "GradientBoosting",
    "MLP",
    "Bagging",
    "ExtraTrees",
    "Voting",
]

# One row per entry collected in `results`.
df = pd.DataFrame(data=results, columns=models)
df

Unnamed: 0,Metadataset,RandomForest,LogisticRegression,SVC,KNeighbors,GaussianNB,DecisionTree,AdaBoost,GradientBoosting,MLP,Bagging,ExtraTrees,Voting
0,concept_metadataset.csv,0.351271,0.332763,0.400492,0.286,0.282169,0.317627,0.320475,0.270051,0.371559,0.326915,0.346424,0.359085
1,clustering_metadataset.csv,0.400789,0.359331,0.39493,0.311754,0.204986,0.339563,0.365113,0.318718,0.381683,0.371796,0.384634,0.40607
2,all_metadataset.csv,0.406379,0.270295,0.37372,0.289439,0.242606,0.336886,0.358258,0.303886,0.158152,0.378333,0.42328,0.413955
3,general_metadataset.csv,0.430717,0.388476,0.415041,0.333772,0.198407,0.352834,0.320552,0.352607,0.384131,0.379814,0.390455,0.418228
4,model-based_metadataset.csv,0.411946,0.254585,0.405769,0.274338,0.270223,0.344562,0.373938,0.319038,0.178392,0.382477,0.410738,0.420715
5,landmarking_metadataset.csv,0.383639,0.387674,0.383958,0.366007,0.357069,0.336507,0.36241,0.281299,0.383236,0.376542,0.410146,0.408562
6,itemset_metadataset.csv,0.36816,0.392556,0.385667,0.307354,0.319271,0.332264,0.250271,0.326917,0.388514,0.335701,0.341389,0.370653
7,complexity_metadataset.csv,0.406571,0.250158,0.370444,0.358511,0.223579,0.341977,0.364053,0.358729,0.362436,0.387083,0.412323,0.413917
8,info-theory_metadataset.csv,0.391076,0.379545,0.394234,0.298359,0.272538,0.332697,0.337779,0.340545,0.384876,0.366041,0.386193,0.401083
9,d2v_pretrained_metadaset.csv,0.376028,0.310414,0.373621,0.285517,0.189641,0.290986,0.304124,0.321083,0.164448,0.311028,0.301517,0.350193


In [20]:
# Write the results table (row index included) to results.csv in the working directory.
df.to_csv(path_or_buf='results.csv')

In [None]:
# Display the column labels of the results table.
df.columns