In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
from ucimlrepo import fetch_ucirepo  # install with: pip install ucimlrepo

# Download the dataset registered under id 2 in the UCI ML repository.
adult = fetch_ucirepo(id=2)

# Features and targets (as pandas dataframes).
X, y = adult.data.features, adult.data.targets

# Put the features and the target side by side in a single dataframe.
df = pd.concat([X, y], axis="columns")

# Inspect the raw label distribution. The printed counts show each class under
# two spellings: with and without a trailing period ('<=50K' / '<=50K.' and
# '>50K' / '>50K.').
print(df['income'].value_counts())

# Normalise the labels so every class has exactly one spelling.
# Whitespace is stripped first so that a period followed by stray whitespace is
# still removed, and once more afterwards in case whitespace preceded the period.
# The earlier `.replace({'<=50K.': '<=50K', '>50K.': '>50K'})` step was a no-op
# once the periods had been stripped, so it has been dropped.
df['income'] = df['income'].str.strip().str.strip('.').str.strip()

# Check the updated counts: only the two merged classes should remain.
value_counts = df['income'].value_counts()
print(value_counts)

# Fail loudly if the cleanup left any spelling other than the two expected ones.
unexpected_labels = set(df['income'].dropna().unique()) - {'<=50K', '>50K'}
assert not unexpected_labels, f"unexpected income labels after cleanup: {unexpected_labels}"


# Columns holding categorical values (the target 'income' included) that are
# converted to integer codes below.
cat_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income',
]

# Fitted encoders keyed by "<column>_ENCODER", kept so the integer codes can be
# mapped back to the original labels later.
labelencoders = {}

for feature in cat_columns:
    encoder = LabelEncoder()
    # Learn the classes of this column and replace it with its integer codes.
    df[feature] = encoder.fit_transform(df[feature])
    labelencoders[f"{feature}_ENCODER"] = encoder


# Split sizes based on the FT-Transformer paper: 26048 train / 6513 validation / 16281 test.
N_TRAIN, N_VAL, N_TEST = 26048, 6513, 16281

# First carve off the training rows, then divide what is left into validation and test.
df_train, df_temp = train_test_split(df, train_size=N_TRAIN, random_state=42)
df_val, df_test = train_test_split(df_temp, train_size=N_VAL, random_state=42)

# Sanity-check that every partition has exactly the expected number of rows.
assert len(df_train) == N_TRAIN
assert len(df_val) == N_VAL
assert len(df_test) == N_TEST

# # Taking out the target (note: the rest of this cell is commented out and does not run)
# train_target = df_train['income']
# val_target = df_val['income']
# test_target = df_test['income']

# df_train_new = df_train.drop(columns = ['income'])
# df_val_new = df_val.drop(columns=['income'])
# df_test_new = df_test.drop(columns=['income'])

# # stand = StandardScaler()  # NOTE: scaling must be done in the experiments, because FT-Transformer handles categorical columns specially
# # stand.fit(df_train)
# # df_train_scaled = stand.transform(df_train)
# # df_val_scaled = stand.transform(df_val)
# # df_test_scaled = stand.transform(df_test)

# df_train = pd.DataFrame(df_train_new, columns=df_train.columns)
# df_val = pd.DataFrame(df_val_new, columns=df_val.columns)
# df_test = pd.DataFrame(df_test_new, columns=df_test.columns)

# train_target=train_target.reset_index(drop=True)
# val_target=val_target.reset_index(drop=True)
# test_target=test_target.reset_index(drop=True)

# df_train['income'] = train_target
# df_val['income'] = val_target
# df_test['income'] = test_target

# train_target

income
<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: count, dtype: int64
income
<=50K    37155
>50K     11687
Name: count, dtype: int64


In [3]:
# Show the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
29172,27,4,153475,9,13,4,12,1,4,0,0,0,40,39,0
28650,38,4,122076,15,10,2,14,0,4,1,7298,0,43,39,1
48319,39,5,206362,8,11,2,3,0,4,1,0,0,40,39,0
12232,32,4,137076,15,10,2,4,0,4,1,15024,0,60,39,1
47727,46,4,198660,11,9,0,8,1,4,1,0,0,40,39,0


In [4]:
# Show the first five rows of the validation split.
df_val.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
15017,48,4,175221,11,9,2,7,0,4,1,0,0,40,39,0
30947,69,0,121136,11,9,2,0,0,4,1,0,0,13,39,0
42109,43,4,91316,12,14,4,10,1,4,1,0,0,60,39,1
23875,19,4,316797,5,4,2,6,3,4,1,0,0,45,26,0
48833,31,4,440129,11,9,2,3,0,4,1,0,0,40,39,0


In [5]:
# Show the first five rows of the test split.
df_test.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
20050,32,2,177566,15,10,3,10,1,4,1,0,0,50,11,0
1166,65,4,350498,9,13,2,1,0,4,1,10605,0,20,39,1
23024,23,4,197613,15,10,4,1,1,4,0,0,0,40,39,0
18173,45,6,123088,11,9,2,3,0,4,1,0,0,50,39,0
43997,21,4,116657,11,9,4,4,3,4,1,0,0,40,39,0


In [6]:
# Save each split as a CSV file (without the index column) in one output folder.
# TODO (from the original note): change this path to a dataset folder so the
# generated files do not get pushed.
OUTPUT_DIR = Path('/home/cscadmin/CyberResearch/CAT-Transformer/datasets/income')

# Create the folder (and any missing parents) so the writes below do not fail
# when it does not exist yet; an existing folder is left untouched.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for csv_name, split_frame in (
    ('train.csv', df_train),
    ('validation.csv', df_val),
    ('test.csv', df_test),
):
    split_frame.to_csv(OUTPUT_DIR / csv_name, index=False)