In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt

In [10]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets

#merging them back into a single dataframe
df = pd.concat([X, y], axis=1)

print(df['income'].value_counts())
# Combine similar labels
df['income'] = df['income'].str.strip('.').str.strip()  # Remove trailing periods and whitespace
df['income'] = df['income'].replace({'<=50K.': '<=50K', '>50K.': '>50K'})

# Check the updated counts - REALLY WEIRD
value_counts = df['income'].value_counts()
print(value_counts)


cat_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']

#dictionary of the encoders
labelencoders = {}

for feature in cat_columns:
    temp = LabelEncoder()
    labelencoders[str(feature)+"_ENCODER"] = temp.fit(df[feature])
    df[feature] = temp.transform(df[feature])


#Split based of ft-transformer paper: 26048 6513 16281
df_train, df_temp = train_test_split(df, train_size=26048, random_state=42)
df_val, df_test = train_test_split(df_temp, train_size=6513, random_state=42)

assert(df_train.shape[0] == 26048)
assert(df_val.shape[0] == 6513)
assert(df_test.shape[0] == 16281)

#taking out the target
train_target = df_train['income']
val_target = df_val['income']
test_target = df_test['income']

df_train.drop(columns = ['income'], inplace = True)
df_val.drop(columns=['income'], inplace = True)
df_test.drop(columns=['income'], inplace=True)

stand = StandardScaler()
stand.fit(df_train)
df_train_scaled = stand.transform(df_train)
df_val_scaled = stand.transform(df_val)
df_test_scaled = stand.transform(df_test)

df_train = pd.DataFrame(df_train_scaled, columns=df_train.columns)
df_val = pd.DataFrame(df_val_scaled, columns=df_val.columns)
df_test = pd.DataFrame(df_test_scaled, columns=df_test.columns)

train_target=train_target.reset_index(drop=True)
val_target=val_target.reset_index(drop=True)
test_target=test_target.reset_index(drop=True)

df_train['income'] = train_target
df_val['income'] = val_target
df_test['income'] = test_target

<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: income, dtype: int64
<=50K    37155
>50K     11687
Name: income, dtype: int64


In [11]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.848889,-0.031239,-0.347969,-0.328625,1.14264,0.92141,1.195326,-0.277281,0.391832,-1.418827,-0.146864,-0.218806,-0.027944,0.275885,0
1,-0.048962,-0.031239,-0.643616,1.219254,-0.028229,-0.40909,1.662127,-0.8996,0.391832,0.704807,0.864927,-0.218806,0.214252,0.275885,1
2,0.023758,0.629238,0.150005,-0.586605,0.362061,-0.40909,-0.905278,-0.8996,0.391832,0.704807,-0.146864,-0.218806,-0.027944,0.275885,0
3,-0.485286,-0.031239,-0.502379,1.219254,-0.028229,-0.40909,-0.671878,-0.8996,0.391832,0.704807,1.936056,-0.218806,1.586693,0.275885,1
4,0.532803,-0.031239,0.077485,0.187335,-0.418518,-1.73959,0.261724,-0.277281,0.391832,0.704807,-0.146864,-0.218806,-0.027944,0.275885,0


In [12]:
df_val.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.678244,-0.031239,-0.143213,0.187335,-0.418518,-0.40909,0.028324,-0.8996,0.391832,0.704807,-0.146864,-0.218806,-0.027944,0.275885,0
1,2.205377,-2.673145,-0.652467,0.187335,-0.418518,-0.40909,-1.60548,-0.8996,0.391832,0.704807,-0.146864,-0.218806,-2.207704,0.275885,0
2,0.314641,-0.031239,-0.933246,0.445314,1.532929,0.92141,0.728525,-0.277281,0.391832,0.704807,-0.146864,-0.218806,1.586693,0.275885,1
3,-1.430654,-0.031239,1.18984,-1.360545,-2.369966,-0.40909,-0.205077,0.967355,0.391832,0.704807,-0.146864,-0.218806,0.375716,-1.521679,0
4,-0.558007,-0.031239,2.351111,0.187335,-0.418518,-0.40909,-0.905278,-0.8996,0.391832,0.704807,-0.146864,-0.218806,-0.027944,0.275885,0


In [13]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.485286,-1.352192,-0.121132,1.219254,-0.028229,0.25616,0.728525,-0.277281,0.391832,0.704807,-0.146864,-0.218806,0.779375,-3.59579,0
1,1.914495,-0.031239,1.507162,-0.328625,1.14264,-0.40909,-1.372079,-0.8996,0.391832,0.704807,1.323408,-0.218806,-1.642581,0.275885,1
2,-1.139772,-0.031239,0.067626,1.219254,-0.028229,0.92141,-1.372079,-0.277281,0.391832,-1.418827,-0.146864,-0.218806,-0.027944,0.275885,0
3,0.460082,1.289714,-0.634087,0.187335,-0.418518,-0.40909,-0.905278,-0.8996,0.391832,0.704807,-0.146864,-0.218806,0.779375,0.275885,0
4,-1.285213,-0.031239,-0.69464,0.187335,-0.418518,0.92141,-0.671878,0.967355,0.391832,0.704807,-0.146864,-0.218806,-0.027944,0.275885,0


In [14]:
#save csvs
df_train.to_csv('train.csv', index=False)
df_val.to_csv('validation.csv', index=False)
df_test.to_csv('test.csv', index=False)