In [None]:
import pandas as pd
import numpy as np
import os
# Root folder that holds the datasets. Set the MFE_DATASETS_DIR environment
# variable to point the notebook at another location; when it is not set the
# original hard-coded folder is used, so existing runs behave as before.
path_to_dataset_folder = os.environ.get('MFE_DATASETS_DIR', 'C:/Users/Dimiexe/Desktop/DATAsets')
# Folder whose CSV files are listed and read as input datasets.
target_test_path = path_to_dataset_folder + '/DATA/Target_feature_test_CSVs'
# Folder under which the computed metafeature vectors are written
# (one sub-folder per method).
manual_meta_feature_path = path_to_dataset_folder + '/DATA/manual_metafeatures_for_test_CSVs'

In [None]:
# Functions for the >>>MANUAL<<< computation of the general and statistical
# metafeatures of a given dataset.

def append_mean_and_std(vec, values):
    """Append the mean of ``values`` and then its standard deviation to ``vec``.

    Both statistics are computed with NumPy (``np.mean`` and ``np.std`` called
    with their default arguments). ``vec`` is extended in place and nothing is
    returned.
    """
    # Mean first, standard deviation second: callers rely on this ordering.
    for summarize in (np.mean, np.std):
        vec.append(summarize(values))

def manual_mfe_extraction(data):
    """Compute the manual metafeature vector of a dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset whose last column is treated as the target; every other
        column is treated as a feature.

    Returns
    -------
    list
        30 values, in this order:

        1-3.   number of instances, number of features, and their ratio
        4-6.   number of categorical (``object`` dtype) features, number of
               remaining features, and their ratio
        7-8.   mean and std of the distinct-value counts of the categorical
               features
        9-30.  mean and std of, in turn: absolute pairwise correlations,
               absolute pairwise covariances, kurtosis, mean absolute
               deviation, maximum, mean, median, minimum, standard deviation,
               skewness and variance of the numeric features

        A ratio whose denominator is zero is reported as ``nan``.
    """
    meta_vector = []
    # Work on the argument itself rather than on a notebook-level variable,
    # so the function does not depend on hidden kernel state.
    x = data.iloc[:, :-1]

    # ------------ General metafeatures ------------
    # 1. number of instances
    nr_inst = x.shape[0]
    meta_vector.append(nr_inst)
    # 2. number of features
    nr_attr = x.shape[1]
    meta_vector.append(nr_attr)
    # 3. ratio between instances and features (undefined without features)
    inst_to_attr = nr_inst / nr_attr if nr_attr else np.nan
    meta_vector.append(inst_to_attr)
    # 4. number of categorical features
    nr_cat = 0
    # 5. number of numeric features
    nr_num = 0

    unique_values_per_feature = x.nunique()
    classes = []
    # Iterate by position: integer keys on a label-indexed Series are
    # ambiguous (and wrong when the column labels are themselves integers).
    for position, dtype in enumerate(x.dtypes):
        if dtype == 'object':
            nr_cat += 1
            classes.append(unique_values_per_feature.iloc[position])
        else:
            nr_num += 1
    meta_vector.append(nr_cat)
    meta_vector.append(nr_num)
    # 6. categorical to numerical ratio (undefined without numeric features)
    cat_to_num = nr_cat / nr_num if nr_num else np.nan
    meta_vector.append(cat_to_num)
    # 7. & 8. number of distinct classes (mean and std)
    if len(classes) == 0:
        classes = [0]
    append_mean_and_std(meta_vector, classes)

    # ------------ Statistical metafeatures ------------
    # Select the numeric columns explicitly: recent pandas versions no longer
    # drop non-numeric columns silently in corr()/cov() and friends.
    numeric = x.select_dtypes(include=['number', 'bool'])
    cor_matrix = numeric.corr(method='pearson').abs()
    cov_matrix = numeric.cov().abs()
    # Strict upper triangle = every unordered pair of distinct features once.
    upper_triangle = np.triu_indices(cor_matrix.shape[0], k=1)
    cor_flat = cor_matrix.to_numpy()[upper_triangle]
    cov_flat = cov_matrix.to_numpy()[upper_triangle]
    if len(cor_flat) == 0:  # fewer than 2 numeric features in the dataset
        cor_flat = [0]
        cov_flat = [0]
    # 9. & 10. absolute pairwise correlations (mean and std)
    append_mean_and_std(meta_vector, cor_flat)
    # 11. & 12. absolute pairwise covariances (mean and std)
    append_mean_and_std(meta_vector, cov_flat)
    # 13. & 14. kurtosis of features (mean and std)
    kurt = numeric.kurtosis(axis=0)
    append_mean_and_std(meta_vector, kurt)
    # 15. & 16. mean absolute deviation of features (mean and std)
    # DataFrame.mad() is gone from current pandas; this is its definition:
    # the mean absolute deviation of each column around that column's mean.
    mad = (numeric - numeric.mean(axis=0)).abs().mean(axis=0)
    append_mean_and_std(meta_vector, mad)
    # 17. & 18. maximum value of features (mean and std)
    maxes = numeric.max(axis=0)
    append_mean_and_std(meta_vector, maxes)
    # 19. & 20. mean value of features (mean and std)
    means = numeric.mean(axis=0)
    append_mean_and_std(meta_vector, means)
    # 21. & 22. median of features (mean and std)
    medians = numeric.median(axis=0)
    append_mean_and_std(meta_vector, medians)
    # 23. & 24. minimum value of features (mean and std)
    mins = numeric.min(axis=0)
    append_mean_and_std(meta_vector, mins)
    # 25. & 26. standard deviation of value of features (mean and std)
    stds = numeric.std(axis=0)
    append_mean_and_std(meta_vector, stds)
    # 27. & 28. skewness of features (mean and std)
    skews = numeric.skew(axis=0)
    append_mean_and_std(meta_vector, skews)
    # 29. & 30. variance of features (mean and std)
    variances = numeric.var(axis=0)
    append_mean_and_std(meta_vector, variances)

    return meta_vector

In [None]:
# Each metafeature vector is written as one comma-separated line to
# "<manual_meta_feature_path>/<method>/<dataset file name>", i.e. under
# "../DATAsets/DATA/manual_metafeatures_for_test_CSVs".
datasets = os.listdir(target_test_path)
extra_ignore = ['COMBO17%e.W462FE.csv', 'SkillCraft1_Dataset%WorkersMade.csv'] #These two only have 1 node in their trees, the root.
for file in sorted(set(datasets) - set(extra_ignore)):
    print('proccessing file: ' + file)
    for method in ['drop','mean','median']:
        output_dir = manual_meta_feature_path + '/' + method
        output_path = output_dir + '/' + file
        if os.path.isfile(output_path):
            continue  # this vector was already computed in an earlier run
        print('method: ' + method)
        # NOTE(review): `method` only selects the output folder here - the
        # same CSV is read and the same vector is computed for all three
        # methods. Confirm whether per-method handling of the data was
        # intended before the extraction.
        df = pd.read_table(target_test_path + '/'+ file,sep=',')
        metafeatures = manual_mfe_extraction(df)

        # Create the per-method folder on first use instead of failing in open().
        os.makedirs(output_dir, exist_ok=True)
        with open(output_path,'w' ) as f:
            f.write(','.join(str(value) for value in metafeatures))