In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

# MAKING TRAIN/TEST/VAL SPLIT

In [13]:
 # fetch dataset 
# id=2 selects which dataset fetch_ucirepo retrieves
adult = fetch_ucirepo(id=2)

# Features and targets come back as separate pandas objects
X, y = adult.data.features, adult.data.targets

# Join them column-wise so each row carries its own target
df = pd.concat((X, y), axis="columns")

# Fixing the typos in the target: drop the stray '.' characters from the labels.
# regex=False makes '.' a literal period. Interpreted as a regular expression,
# '.' matches ANY character, so regex=True on pandas >= 2.0 would delete every
# character and leave all income labels as empty strings.
df['income'] = df['income'].str.replace('.', '', regex=False)

# Partition the column names by dtype: numeric/bool columns vs. everything else
numeric_kinds = ['number', 'bool']
numeric_features = df.select_dtypes(include=numeric_kinds).columns
non_numeric_features = df.select_dtypes(exclude=numeric_kinds).columns

# Report both groups, non-numeric first
for label, columns in (
    ("non-numeric features", non_numeric_features),
    ("numeric features", numeric_features),
):
    print(label, columns)

# One LabelEncoder per non-numeric column, keyed by column name so the fitted
# encoders stay available after this cell
label_encoders = {feature: LabelEncoder() for feature in non_numeric_features}

# Fit each encoder on its column and overwrite the column with the integer codes
for feature, encoder in label_encoders.items():
    df[feature] = encoder.fit_transform(df[feature])

# Two-stage split into train / validation / test = 0.70 / 0.12 / 0.18:
# first keep 70% for training, then give 40% of the remaining 30% to validation
# (0.3 * 0.4 = 0.12) and the rest to test (0.3 * 0.6 = 0.18).
df_train, df_temp = train_test_split(df, random_state=42, train_size=0.7)
df_val, df_test = train_test_split(df_temp, random_state=42, train_size=0.4)

# Row count of each split, in train / validation / test order
for part in (df_train, df_val, df_test):
    print(len(part))

non-numeric features Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')
numeric features Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
34189
5861
8792


In [6]:
# Save the three splits as CSVs in a single dataset folder.
# OUTPUT_DIR is machine-specific: it is the one line to edit when running this
# notebook elsewhere, instead of a path repeated in every to_csv call.
OUTPUT_DIR = Path(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\income')

# Create the folder (and any missing parents) so the export does not fail when
# it does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> split; index=False keeps the pandas row index out of the files.
for file_stem, split_frame in (
    ('train', df_train),
    ('validation', df_val),
    ('test', df_test),
):
    split_frame.to_csv(OUTPUT_DIR / f'{file_stem}.csv', index=False)