### Import the necessary libraries

In [1]:
import openml
import pandas as pd
from ucimlrepo import fetch_ucirepo
import pickle
from sklearn.impute import SimpleImputer
import numpy as np
import os

### Collecting all the datasets

#### Collecting the classification datasets

Collect the openml Data

In [2]:
# Names and IDs of the datasets
datasets = ['Diabetes', 'madelon', 'gina', 'kc1', 'Amazon_employee_access', 'arcene', 'labor']
ids = [37, 1485, 41158, 1067, 4135, 1458, 4]

# Dictionaries to store data
Independent_Data = {}
Dependent_Data = {}
Data = {}

# Fetch datasets and split into X and y
for name, id in zip(datasets, ids):
    dataset = openml.datasets.get_dataset(id)
    df, _, _, _ = dataset.get_data(dataset_format="dataframe")
    
    # Special handling for the 'gina' dataset where dependent variable is the first column
    if name in ['gina']:
        y = df.iloc[:, 0]  # First column as the dependent variable
        X = df.iloc[:, 1:]  # Remaining columns as independent variables
    elif name in ['kc1']:
        X = df.drop('defects', axis=1)  # This will create a DataFrame 'X' by dropping the column 'defects'
        y = df['defects']  # This will create a Series 'y' containing the values of the column 'defects'

    else:
        y = df.iloc[:, -1]  # Last column as the dependent variable
        X = df.iloc[:, :-1]  # All columns except the last as independent variables
    
    # Store dataframes in dictionaries with appropriate names
    Independent_Data[name] = X
    Dependent_Data[name] = y
    Data[name] = df

Collect the UCI ML Repository Datasets

In [3]:
# Names and IDs of the datasets
datasets = ['Statlog_German_Credit_Data', 'Ionosphere', 'Connectionist_Bench_Sonar_Mines_vs_Rocks', 
            'Statlog_Australian_Credit_Approval', 'Fertility', 'Spambase', 'Blood_Transfusion_Service_Center', 
            'EEG_Eye_State', 'Iris', 'Tic_Tac_Toe_Endgame', 
            'Balance_Scale', 'Hepatitis', 'Credit_Approval']
ids = [144, 52, 151, 143, 244, 94, 176, 264, 53, 101, 12, 46, 27]

# Fetch datasets and split into X and y
for name, id in zip(datasets, ids):
    # Fetch the dataset using the UCI ML Repository
    dataset = fetch_ucirepo(id=id)
    
    # Get X and y directly from dataset structure
    X = dataset.data.features
    y = dataset.data.targets
    
    # Combine X and y for the full dataframe
    df = pd.concat([X, y], axis=1)
    
    # Store dataframes in dictionaries with appropriate names
    Independent_Data[name] = X
    Dependent_Data[name] = y
    Data[name] = df

# Now we have:
# Independent_Data: dictionary containing all independent data dataframes, keyed by dataset names
# Dependent_Data: dictionary containing all dependent data dataframes, keyed by dataset names


#### Collecting the regression data

OpenML Data

In [4]:
# Names and IDs of the datasets
datasets = ['fri_c3_1000_50', 'fri_c2_1000_25', 'fri_c4_500_50', 'fri_c4_1000_50', 'fri_c1_1000_25', 'fri_c1_500_50', 'fri_c3_1000_25', 'auto93','pyrim','autoPrice',
            'boston']
ids = [618, 589, 616, 607, 620, 637, 586, 569, 217,207,531]

# Fetch datasets and split into X and y
for name, id in zip(datasets, ids):
    dataset = openml.datasets.get_dataset(id)
    df, _, _, _ = dataset.get_data(dataset_format="dataframe")
    
    y = df.iloc[:, -1]  # Last column as the dependent variable
    X = df.iloc[:, :-1]  # All columns except the last as independent variables
    
    # Store dataframes in dictionaries with appropriate names
    Independent_Data[name] = X
    Dependent_Data[name] = y
    Data[name] = df

Collect UCI ML Repo Data

In [5]:
# Names and IDs of the datasets
datasets = ['Concrete_Compressive_Strength', 'Auto_MPG', 'Forest Fires', 'Servo','Airfoil_Self_Noise','Wine_Quality']
ids = [165, 9, 162, 87,291,186]

# Fetch datasets and split into X and y
for name, id in zip(datasets, ids):
    # Fetch the dataset using the UCI ML Repository
    dataset = fetch_ucirepo(id=id)
    
    # Get X and y directly from dataset structure
    X = dataset.data.features
    y = dataset.data.targets
    
    # Combine X and y for the full dataframe
    df = pd.concat([X, y], axis=1)
    
    # Store dataframes in dictionaries with appropriate names
    Independent_Data[name] = X
    Dependent_Data[name] = y
    Data[name] = df

Collect Kaggle Data

In [6]:
# Path to the directory containing the CSV files
directory_path = 'Data/Kaggle Data'

# Loop through each file in the directory
for file in os.listdir(directory_path):
    if file.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory_path, file)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Get the file name without the extension to use as the dictionary key
        key_name = os.path.splitext(file)[0]
        
        # Store all columns except the last in the Independent_Data dictionary
        Independent_Data[key_name] = df.iloc[:, :-1]
        
        # Store the last column in the Dependent_Data dictionary
        Dependent_Data[key_name] = df.iloc[:, -1]

        Data[name] = df

### Imputing Missing Values

Identifying dataframes with missing values

In [7]:
for name, df in Data.items():
    # Check if any element is missing in the dataframe
    if df.isnull().any().any():
        print(f"The dataframe '{name}' has missing values.")

The dataframe 'labor' has missing values.
The dataframe 'Hepatitis' has missing values.
The dataframe 'Credit_Approval' has missing values.
The dataframe 'auto93' has missing values.
The dataframe 'Auto_MPG' has missing values.


Imputing the missing values in the Hepatitis DataFrame

In [8]:
hepatitis_df = Independent_Data['Hepatitis']

# Columns categorized by type
categorical_columns = ['Steroid', 'Fatigue', 'Malaise', 'Anorexia', 'Liver Big', 'Liver Firm',
                       'Spleen Palpable', 'Spiders', 'Ascites', 'Varices', 'Histology', 'Sex']
integer_columns = ['Alk Phosphate', 'Sgot', 'Albumin', 'Protime']
continuous_columns = ['Bilirubin']

# Impute missing values for categorical columns with the mode
for column in categorical_columns:
    if hepatitis_df[column].isnull().any():
        mode_value = hepatitis_df[column].mode()[0]  # Get the mode value for the column
        hepatitis_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode

# Impute missing values for integer columns with the mode
for column in integer_columns:
    if hepatitis_df[column].isnull().any():
        mode_value = hepatitis_df[column].mode()[0]  # Get the mode value for the column
        hepatitis_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode

# Impute missing values for continuous columns with the mean
for column in continuous_columns:
    if hepatitis_df[column].isnull().any():
        mean_value = hepatitis_df[column].mean()  # Get the mean value for the column
        hepatitis_df[column].fillna(mean_value, inplace=True)  # Fill missing values with the mean

# Update the dictionary with the modified dataframe
Independent_Data['Hepatitis'] = hepatitis_df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hepatitis_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hepatitis_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode
The behavior will change in pandas 3.0. This inpla

Imputing the missing values in Credit Approval

In [9]:
credit_approval_df = Independent_Data['Credit_Approval']

# List of categorical and continuous columns
categorical_columns = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']
continuous_columns = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

# Impute missing values for categorical columns with the mode
for column in categorical_columns:
    if credit_approval_df[column].isnull().any():
        mode_value = credit_approval_df[column].mode()[0]  # Get the mode value for the column
        credit_approval_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode

# Impute missing values for continuous columns with the mean
for column in continuous_columns:
    if credit_approval_df[column].isnull().any():
        mean_value = credit_approval_df[column].mean()  # Get the mean value for the column
        credit_approval_df[column].fillna(mean_value, inplace=True)  # Fill missing values with the mean

# Update the dictionary with the modified dataframe
Independent_Data['Credit_Approval'] = credit_approval_df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  credit_approval_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  credit_approval_df[column].fillna(mean_value, inplace=True)  # Fill missing values with the mean


Imputing the missing values in the labor dataset

In [10]:
# Assuming you already have the dictionary containing the 'labor' key with its DataFrame
labor_df = Independent_Data['labor']

# List of categorical and continuous columns
categorical_columns = ['duration', 'cost-of-living-adjustment', 'working-hours', 'pension', 
                       'standby-pay', 'shift-differential', 'education-allowance', 
                       'statutory-holidays', 'vacation', 'longterm-disability-assistance', 
                       'contribution-to-dental-plan', 'bereavement-assistance', 
                       'contribution-to-health-plan']
continuous_columns = ['wage-increase-first-year', 'wage-increase-second-year', 'wage-increase-third-year']

# Impute missing values for categorical columns with the mode
for column in categorical_columns:
    if labor_df[column].isnull().any():  # Check if there are any missing values
        mode_value = labor_df[column].mode()[0]  # Get the mode value for the column
        labor_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode

# Impute missing values for continuous columns with the mean
for column in continuous_columns:
    if labor_df[column].isnull().any():  # Check if there are any missing values
        mean_value = labor_df[column].mean()  # Get the mean value for the column
        labor_df[column].fillna(mean_value, inplace=True)  # Fill missing values with the mean

# Update the dictionary with the modified DataFrame
Independent_Data['labor'] = labor_df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labor_df[column].fillna(mode_value, inplace=True)  # Fill missing values with the mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labor_df[column].fillna(mean_value, inplace=True)  # Fill missing values with the mean


Imputing the missing values of auto93

In [11]:
df = Independent_Data['auto93']

# List of numeric and nominal (categorical) features based on provided information
numeric_features = [
    'City_MPG', 'Highway_MPG', 'Number_of_cylinders',
    'Engine_size', 'Horsepower', 'RPM', 'Engine_revolutions_per_mile',
    'Fuel_tank_capacity', 'Passenger_capacity', 'Length', 'Wheelbase',
    'Width', 'U-turn_space', 'Rear_seat_room', 'Luggage_capacity', 'Weight'
]

nominal_features = [
    'Manufacturer', 'Type', 'Air_Bags_standard', 'Drive_train_type',
    'Manual_transmission_available', 'Domestic'
]

# Converting numeric features to float64
df[numeric_features] = df[numeric_features].astype('float64')

# Converting nominal features to category
df[nominal_features] = df[nominal_features].astype('category')

# Imputing missing values in numeric features with the mean
for column in numeric_features:
    if df[column].isnull().any():
        df[column].fillna(df[column].mean(), inplace=True)

# Imputing missing values in categorical features with the mode
for column in nominal_features:
    if df[column].isnull().any():
        mode_value = df[column].mode()[0]  # Getting the mode value of the column
        df[column].fillna(mode_value, inplace=True)

Independent_Data['auto93'] = df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)


Imputing the missing values of Auto_MPG

In [12]:
# Assuming Independent_Data is your dictionary containing the DataFrame
df = Independent_Data['Auto_MPG']

# Check if there are missing values in the 'horsepower' column
if df['horsepower'].isnull().any():
    # Calculate the mean of the 'horsepower' column, excluding NaN values
    mean_value = df['horsepower'].mean()
    
    # Impute missing values in the 'horsepower' column with the mean
    df['horsepower'].fillna(mean_value, inplace=True)

# Save the modified DataFrame back to the dictionary
Independent_Data['Auto_MPG'] = df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(mean_value, inplace=True)


### Converting series to dataframes

In [13]:
# Convert Series to DataFrame
for key, value in Dependent_Data.items():
    if isinstance(value, pd.Series):
        Dependent_Data[key] = value.to_frame()

### Changing Categorical Column Types to category and Int/Cts to float64

In [14]:
#Statlog German Credit Data
Dependent_Data['Statlog_German_Credit_Data']['class'] = Dependent_Data['Statlog_German_Credit_Data']['class'].astype('category')

#Statlog_Australian_Credit_Approval

# List of categorical and continuous variables for 'Statlog_Australian_Credit_Approval'
categorical_columns = ['A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12']
continuous_columns = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14']

# Convert categorical variables to 'category' type
for column in categorical_columns:
    if column in Independent_Data['Statlog_Australian_Credit_Approval'].columns:
        Independent_Data['Statlog_Australian_Credit_Approval'][column] = Independent_Data['Statlog_Australian_Credit_Approval'][column].astype('category')

# Convert continuous variables to 'float64' type
for column in continuous_columns:
    if column in Independent_Data['Statlog_Australian_Credit_Approval'].columns:
        Independent_Data['Statlog_Australian_Credit_Approval'][column] = Independent_Data['Statlog_Australian_Credit_Approval'][column].astype('float64')

Dependent_Data['Statlog_Australian_Credit_Approval']['A15'] = Dependent_Data['Statlog_Australian_Credit_Approval']['A15'].astype('category')

#Fertility Data

# Define the categorical and numeric variables for 'Fertility'
categorical_columns = ['high_fevers', 'alcohol', 'smoking', 'child_diseases', 'accident', 'surgical_intervention']
numeric_columns = ['season', 'age', 'hrs_sitting']  # These include both continuous and integer variables

# Convert categorical variables to 'category' type
for column in categorical_columns:
    if column in Independent_Data['Fertility'].columns:
        Independent_Data['Fertility'][column] = Independent_Data['Fertility'][column].astype('category')

# Convert numeric variables to 'float64' type
for column in numeric_columns:
    if column in Independent_Data['Fertility'].columns:
        Independent_Data['Fertility'][column] = Independent_Data['Fertility'][column].astype('float64')

#Spambase
Dependent_Data['Spambase']['Class'] = Dependent_Data['Spambase']['Class'].astype('category')

#Blood_Transfusion_Service_Center
Dependent_Data['Blood_Transfusion_Service_Center']['Donated_Blood'] = Dependent_Data['Blood_Transfusion_Service_Center']['Donated_Blood'].astype('category')

#Gina
# Convert all columns in the DataFrame to 'float64' data type
for column in Independent_Data['gina'].columns:
    Independent_Data['gina'][column] = Independent_Data['gina'][column].astype('float64')

#Arcene
# Convert all columns in the DataFrame to 'float64' data type
for column in Independent_Data['arcene'].columns:
    Independent_Data['arcene'][column] = Independent_Data['arcene'][column].astype('float64')

#EEG_Eye_State
Dependent_Data['EEG_Eye_State']['eyeDetection'] = Dependent_Data['EEG_Eye_State']['eyeDetection'].astype('category')

#kc1
# Convert all columns in the DataFrame to 'float64' data type
for column in Independent_Data['kc1'].columns:
    Independent_Data['kc1'][column] = Independent_Data['kc1'][column].astype('float64')


Dependent_Data['kc1']['defects'] = Dependent_Data['kc1']['defects'].astype('category')

#Airfoil_Self_Noise

# Convert 'attack-angle' to category
Independent_Data['Airfoil_Self_Noise']['attack-angle'] = Independent_Data['Airfoil_Self_Noise']['attack-angle'].astype('category')

# Convert all other columns to float64
for column in Independent_Data['Airfoil_Self_Noise'].columns:
    if column != 'attack-angle':  # Skip the 'attack-angle' column
        Independent_Data['Airfoil_Self_Noise'][column] = Independent_Data['Airfoil_Self_Noise'][column].astype('float64')


Dependent_Data['Airfoil_Self_Noise']['scaled-sound-pressure'] = Dependent_Data['Airfoil_Self_Noise']['scaled-sound-pressure'].astype('float64')

#Balance_Scale
# Convert all columns in the DataFrame to 'category' data type
for column in Independent_Data['Balance_Scale'].columns:
    Independent_Data['Balance_Scale'][column] = Independent_Data['Balance_Scale'][column].astype('category')

# Define categorical and continuous variables for 'Hepatitis'
categorical_variables = [
    'Sex', 'Steroid', 'Antivirals', 'Fatigue', 'Malaise', 
    'Anorexia', 'Liver Big', 'Liver Firm', 'Spleen Palpable', 
    'Spiders', 'Ascites', 'Varices'
]
continuous_variables = [
    'Bilirubin', 'Age', 'Alk Phosphate', 'Sgot', 'Albumin', 'Protime', 'Histology'
]

# Convert categorical variables to 'category'
for column in categorical_variables:
    if column in Independent_Data['Hepatitis'].columns:
        Independent_Data['Hepatitis'][column] = Independent_Data['Hepatitis'][column].astype('category')

# Convert continuous variables to 'float64'
for column in continuous_variables:
    if column in Independent_Data['Hepatitis'].columns:
        Independent_Data['Hepatitis'][column] = Independent_Data['Hepatitis'][column].astype('float64')

Dependent_Data['Hepatitis']['Class'] = Dependent_Data['Hepatitis']['Class'].astype('category')

#pyrim
Independent_Data['pyrim']['p3_pi_acceptor'] = Independent_Data['pyrim']['p3_pi_acceptor'].astype('float64')

#Auto_MPG
Independent_Data['Auto_MPG']['weight'] = Independent_Data['Auto_MPG']['weight'].astype('float64')

#autoPrice
Independent_Data['autoPrice']['horsepower'] = Independent_Data['autoPrice']['horsepower'].astype('float64')
Independent_Data['autoPrice']['city-mpg'] = Independent_Data['autoPrice']['city-mpg'].astype('float64')
Independent_Data['autoPrice']['highway-mpg'] = Independent_Data['autoPrice']['highway-mpg'].astype('float64')


### Checking for values with infinity

In [15]:
# Assuming the dictionary 'Independent_Data' is defined and contains dataframes
dataframes_with_infinities = {}

# Check each dataframe for infinity or negative infinity values in numeric columns
for name, df in Independent_Data.items():
    # Select columns that are not of type 'object' or 'category'
    numeric_df = df.select_dtypes(include=[np.number])
    if np.isinf(numeric_df.values).any():  # Check if any element is +/- infinity
        dataframes_with_infinities[name] = df

# Output the names of dataframes containing infinity values
print("Dataframes with infinity or negative infinity values:", list(dataframes_with_infinities.keys()))


Dataframes with infinity or negative infinity values: []


Checking unique data types of regression datasets

In [16]:
regression_datasets = [
    'fri_c3_1000_50', 'fri_c2_1000_25', 'fri_c4_500_50', 'fri_c4_1000_50', 
    'fri_c1_1000_25', 'fri_c1_500_50', 'fri_c3_1000_25', 'auto93', 'pyrim', 
    'autoPrice', 'boston', 'Concrete_Compressive_Strength', 'Auto_MPG', 
    'Forest Fires', 'Servo', 'Airfoil_Self_Noise', 'Wine_Quality', 
    'BodyFat', 'California_Housing', 'Quake'
]

for name in regression_datasets:
    if name in Independent_Data:
        df = Independent_Data[name]
        unique_dtypes = df.dtypes.unique()
        print(f"{name}: {unique_dtypes}")
    else:
        print(f"{name}: Not found in Independent_Data")


fri_c3_1000_50: [dtype('float64')]
fri_c2_1000_25: [dtype('float64')]
fri_c4_500_50: [dtype('float64')]
fri_c4_1000_50: [dtype('float64')]
fri_c1_1000_25: [dtype('float64')]
fri_c1_500_50: [dtype('float64')]
fri_c3_1000_25: [dtype('float64')]
auto93: [CategoricalDtype(categories=['Acura', 'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet',
                   'Chrysler', 'Dodge', 'Eagle', 'Ford', 'Geo', 'Honda',
                   'Hyundai', 'Infiniti', 'Lexus', 'Lincoln', 'Mazda',
                   'Mercedes-Benz', 'Mercury', 'Mitsubishi', 'Nissan',
                   'Oldsmobile', 'Plymouth', 'Pontiac', 'Saab', 'Saturn',
                   'Subaru', 'Suzuki', 'Toyota', 'Volkswagen', 'Volvo'],
 , ordered=True, categories_dtype=object)
 CategoricalDtype(categories=['Small', 'Midsize', 'Compact', 'Large', 'Sporty', 'Van'], ordered=True, categories_dtype=object)
 dtype('float64')
 CategoricalDtype(categories=['0', '2', '1'], ordered=True, categories_dtype=object)
 CategoricalDtype(categorie

### Exporting the data for use

In [17]:
# Serialize the dictionaries themselves
pickle.dump(Independent_Data, open('Data/Independent_Data_dictionary.pkl', 'wb'))
pickle.dump(Dependent_Data, open('Data/Dependent_Data_dictionary.pkl', 'wb'))