In [2]:
import os
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder

In [3]:
current_directory = os.getcwd()
print("current directory:", current_directory)

current directory: c:\Ronald\uOttawa\CSI 6900\Metallic-main\creating_metafeatures


In [4]:
base_path = " c:/Ronald/uOttawa/CSI 6900/Metallic-main"

In [5]:
relative_path = os.path.relpath(current_directory, base_path)

In [6]:
relative_path

'..\\..\\..\\..\\..'

In [16]:
source_directory = './additional_dataset/'
target_directory = './processed_dataset/'


if not os.path.exists(target_directory):
    os.makedirs(target_directory)


for filename in os.listdir(source_directory):
    if filename.endswith('.arff'):

        file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename.replace('.arff', '.csv'))

 
        data, _ = arff.loadarff(file_path)

        df = pd.DataFrame(data)

        substrings_to_remove = ['year', 'month', 'number', 'id', 'timestamp', 'index', 'text', 'period', 'counter']
        df.drop(columns=[col for col in df.columns if any(substring in col.lower() for substring in substrings_to_remove)], inplace=True, errors='ignore')

        # remove 'b' and ' ' in original dataset
        # df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
        df = df.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

        # missing values
        if df.isnull().values.any():
            print(f"{filename} has missing values")
            for column in df.columns:
                if df[column].dtype == 'object':
                    # mode for categorical variables
                    most_common = df[column].mode()[0]
                    df[column].fillna(most_common, inplace=True)
                else:
                    # mean for numeric variables
                    mean_value = df[column].mean()
                    df[column].fillna(mean_value, inplace=True)
                    df[column] = df[column].round(1)

        # If target column is not the last column, move to the last
        class_column = None
        for col in df.columns:
            if 'class' in col.lower() or col == 'Home/Away' in col:
                class_column = col
                break
        if class_column and df.columns[-1] != class_column:
            class_col = df[class_column]
            df = df.drop(columns=[class_column])
            df['cls'] = class_col
        else:
            df.rename(columns={df.columns[-1]: 'cls'}, inplace=True)
        

        # Assign numeric labels based on class frequency
        class_counts = df['cls'].value_counts(ascending=False)
        class_mapping = {cls: i for i, cls in enumerate(class_counts.index)}
        df['cls'] = df['cls'].map(class_mapping)

        
        label_encoders = {}
        for column in df.columns:
            if df[column].dtype == 'object':
                le = LabelEncoder()
                df[column] = le.fit_transform(df[column])
                label_encoders[column] = le


        df.to_csv(target_file_path, index=False)

        print(f"Processed {filename}")
        print(df['cls'].value_counts())


Processed analcatdata_authorship.arff
cls
0    317
1    296
2    173
3     55
Name: count, dtype: int64
Processed analcatdata_dmft.arff
cls
0    155
1    136
2    132
3    127
4    124
5    123
Name: count, dtype: int64
Processed bodyfat.arff
cls
0    128
1    124
Name: count, dtype: int64
Processed chscase_geyser1.arff
cls
0    134
1     88
Name: count, dtype: int64
Processed cloud.arff
cls
0    76
1    32
Name: count, dtype: int64
Processed confidence.arff
cls
0    60
1    12
Name: count, dtype: int64
Processed dataset_113_primary-tumor.arff
cls
0     84
1     39
2     29
3     28
4     24
5     24
6     20
7     16
8     14
9     14
10    10
11     9
12     7
13     6
14     6
15     2
16     2
17     2
18     1
19     1
20     1
Name: count, dtype: int64
Processed dataset_11_balance-scale.arff
cls
0    288
1    288
2     49
Name: count, dtype: int64
dataset_190_braziltourism.arff has missing values
Processed dataset_190_braziltourism.arff
cls
0    318
1     64
2     16
3      7
4  

In [17]:
target_directory = './processed_dataset/'

rows_list = []
features_list = []
classes_list = []
minority_cases_list = []
imbalance_ratio_list = []


for filename in os.listdir(target_directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(target_directory, filename)


        df = pd.read_csv(file_path)

        num_rows = len(df)
        num_features = len(df.columns) - 1  
        num_classes = len(df.iloc[:, -1].unique())
        class_counts = df.iloc[:, -1].value_counts()
        minority_cases = class_counts.min()
        imbalance_ratio = class_counts.max() / class_counts.min()


        rows_list.append(num_rows)
        features_list.append(num_features)
        classes_list.append(num_classes)
        minority_cases_list.append(minority_cases)
        imbalance_ratio_list.append(imbalance_ratio)


        print(f"Dataset: {filename}")
        print(f"Rows: {num_rows}")
        print(f"Features: {num_features}")
        print(f"Classes: {num_classes}")
        print(f"Minority Cases: {minority_cases}")
        print(f"Imbalance Ratio: {imbalance_ratio:.2f}\n")


def calculate_statistics(data_list):
    return np.min(data_list), np.max(data_list), np.median(data_list), np.mean(data_list), np.std(data_list)

rows_stats = calculate_statistics(rows_list)
features_stats = calculate_statistics(features_list)
classes_stats = calculate_statistics(classes_list)
minority_cases_stats = calculate_statistics(minority_cases_list)
imbalance_ratio_stats = calculate_statistics(imbalance_ratio_list)

print(f"Rows: Min={rows_stats[0]}, Max={rows_stats[1]}, Median={rows_stats[2]}, Mean={rows_stats[3]:.2f}, Std={rows_stats[4]:.2f}")
print(f"Features: Min={features_stats[0]}, Max={features_stats[1]}, Median={features_stats[2]}, Mean={features_stats[3]:.2f}, Std={features_stats[4]:.2f}")
print(f"Classes: Min={classes_stats[0]}, Max={classes_stats[1]}, Median={classes_stats[2]}, Mean={classes_stats[3]:.2f}, Std={classes_stats[4]:.2f}")
print(f"Minority Cases: Min={minority_cases_stats[0]}, Max={minority_cases_stats[1]}, Median={minority_cases_stats[2]}, Mean={minority_cases_stats[3]:.2f}, Std={minority_cases_stats[4]:.2f}")
print(f"Imbalance Ratio: Min={imbalance_ratio_stats[0]:.2f}, Max={imbalance_ratio_stats[1]:.2f}, Median={imbalance_ratio_stats[2]:.2f}, Mean={imbalance_ratio_stats[3]:.2f}, Std={imbalance_ratio_stats[4]:.2f}")

Dataset: analcatdata_authorship.csv
Rows: 841
Features: 69
Classes: 4
Minority Cases: 55
Imbalance Ratio: 5.76

Dataset: analcatdata_dmft.csv
Rows: 797
Features: 4
Classes: 6
Minority Cases: 123
Imbalance Ratio: 1.26

Dataset: Australian.csv
Rows: 690
Features: 14
Classes: 2
Minority Cases: 307
Imbalance Ratio: 1.25

Dataset: autos.csv
Rows: 205
Features: 24
Classes: 6
Minority Cases: 3
Imbalance Ratio: 22.33

Dataset: autoUniv-au6-750.csv
Rows: 750
Features: 40
Classes: 8
Minority Cases: 57
Imbalance Ratio: 2.89

Dataset: balance-scale.csv
Rows: 625
Features: 4
Classes: 3
Minority Cases: 49
Imbalance Ratio: 5.88

Dataset: blood-transfusion-service-center.csv
Rows: 748
Features: 4
Classes: 2
Minority Cases: 178
Imbalance Ratio: 3.20

Dataset: bodyfat.csv
Rows: 252
Features: 14
Classes: 2
Minority Cases: 124
Imbalance Ratio: 1.03

Dataset: braziltourism.csv
Rows: 412
Features: 8
Classes: 7
Minority Cases: 1
Imbalance Ratio: 318.00

Dataset: chscase_geyser1.csv
Rows: 222
Features: 2
Clas

In [7]:
source_directory = './test_dataset/'
target_directory = './test_dataset/'


if not os.path.exists(target_directory):
    os.makedirs(target_directory)


for filename in os.listdir(source_directory):
    if filename.endswith('.arff'):

        file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename.replace('.arff', '.csv'))

 
        data, _ = arff.loadarff(file_path)

        df = pd.DataFrame(data)

        substrings_to_remove = ['year', 'month', 'number', 'id', 'timestamp', 'index', 'text', 'period', 'counter']
        df.drop(columns=[col for col in df.columns if any(substring in col.lower() for substring in substrings_to_remove)], inplace=True, errors='ignore')

        # remove 'b' and ' ' in original dataset
        # df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
        df = df.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

        # missing values
        if df.isnull().values.any():
            print(f"{filename} has missing values")
            for column in df.columns:
                if df[column].dtype == 'object':
                    # mode for categorical variables
                    most_common = df[column].mode()[0]
                    df[column].fillna(most_common, inplace=True)
                else:
                    # mean for numeric variables
                    mean_value = df[column].mean()
                    df[column].fillna(mean_value, inplace=True)
                    df[column] = df[column].round(1)

        # If target column is not the last column, move to the last
        class_column = None
        for col in df.columns:
            if 'class' in col.lower() or col == 'Home/Away' in col:
                class_column = col
                break
        if class_column and df.columns[-1] != class_column:
            class_col = df[class_column]
            df = df.drop(columns=[class_column])
            df['cls'] = class_col
        else:
            df.rename(columns={df.columns[-1]: 'cls'}, inplace=True)
        

        # Assign numeric labels based on class frequency
        class_counts = df['cls'].value_counts(ascending=False)
        class_mapping = {cls: i for i, cls in enumerate(class_counts.index)}
        df['cls'] = df['cls'].map(class_mapping)

        
        label_encoders = {}
        for column in df.columns:
            if df[column].dtype == 'object':
                le = LabelEncoder()
                df[column] = le.fit_transform(df[column])
                label_encoders[column] = le


        df.to_csv(target_file_path, index=False)

        print(f"Processed {filename}")
        print(df['cls'].value_counts())


Processed hill-valley.arff
cls
0    606
1    606
Name: count, dtype: int64
Processed steel-plates-fault.arff
cls
0    1268
1     673
Name: count, dtype: int64
