In [70]:
import pandas as pd
import numpy as np

# Read the dataset
df = pd.read_csv('./adult/adult.data', skipinitialspace=True)


In [71]:
df['sex']
df['sex'] = df['sex'].map({'Female': 0, 'Male': 1})
df['sex']


0        1
1        1
2        1
3        1
4        0
        ..
32556    0
32557    1
32558    0
32559    1
32560    0
Name: sex, Length: 32561, dtype: int64

In [72]:

# Define which columns are numeric, categorical, and binary
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
binary_features = ['sex', 'income']

# Convert binary features to 0 and 1
print(df['sex'])
df['sex'] = df['sex'].map({'Female': 0, 'Male': 1})
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

df['age'] = df['age'] / df['age'].max()
df['fnlwgt'] = df['fnlwgt'] / df['fnlwgt'].max()
df['education-num'] = df['education-num'] / df['education-num'].max()
df['capital-gain'] = df['capital-gain'] / df['capital-gain'].max()
df['capital-loss'] = df['capital-loss'] / df['capital-loss'].max()
df['hours-per-week'] = df['hours-per-week'] / df['hours-per-week'].max()

# Each category is now represented by 0 to length of category - 1
cat_workclass = pd.get_dummies(df['workclass']).columns
cat_education = pd.get_dummies(df['education']).columns
cat_marital_status = pd.get_dummies(df['marital-status']).columns
cat_occupation = pd.get_dummies(df['occupation']).columns
cat_relationship = pd.get_dummies(df['relationship']).columns
cat_race = pd.get_dummies(df['race']).columns
cat_nativecountry = pd.get_dummies(df['native-country']).columns

df['workclass'] = pd.Categorical(df['workclass'], categories=cat_workclass).codes
df['education'] = pd.Categorical(df['education'], categories=cat_education).codes
df['marital-status'] = pd.Categorical(df['marital-status'], categories=cat_marital_status).codes
df['occupation'] = pd.Categorical(df['occupation'], categories=cat_occupation).codes
df['relationship'] = pd.Categorical(df['relationship'], categories=cat_relationship).codes
df['race'] = pd.Categorical(df['race'], categories=cat_race).codes
df['native-country'] = pd.Categorical(df['native-country'], categories=cat_nativecountry).codes

# Normalize dividing by the max
df['workclass'] = df['workclass'] / df['workclass'].max()
df['education'] = df['education'] / df['education'].max()
df['marital-status'] = df['marital-status'] / df['marital-status'].max()
df['occupation'] = df['occupation'] / df['occupation'].max()
df['relationship'] = df['relationship'] / df['relationship'].max()
df['race'] = df['race'] / df['race'].max()
df['native-country'] = df['native-country'] / df['native-country'].max()

# Convert all numbrers to 4 decimal places
df = df.round(4)

df


0        1
1        1
2        1
3        1
4        0
        ..
32556    0
32557    1
32558    0
32559    1
32560    0
Name: sex, Length: 32561, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.4333,0.875,0.0522,0.6000,0.8125,0.6667,0.0714,0.2,1.0,,0.0217,0.0,0.4040,0.9512,0
1,0.5556,0.750,0.0561,0.6000,0.8125,0.3333,0.2857,0.0,1.0,,0.0000,0.0,0.1313,0.9512,0
2,0.4222,0.500,0.1452,0.7333,0.5625,0.0000,0.4286,0.2,1.0,,0.0000,0.0,0.4040,0.9512,0
3,0.5889,0.500,0.1581,0.0667,0.4375,0.3333,0.4286,0.0,0.5,,0.0000,0.0,0.4040,0.9512,0
4,0.3111,0.500,0.2279,0.6000,0.8125,0.3333,0.7143,1.0,0.5,,0.0000,0.0,0.4040,0.1220,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.3000,0.500,0.1733,0.4667,0.7500,0.3333,0.9286,1.0,1.0,,0.0000,0.0,0.3838,0.9512,0
32557,0.4444,0.500,0.1040,0.7333,0.5625,0.3333,0.5000,0.0,1.0,,0.0000,0.0,0.4040,0.9512,1
32558,0.6444,0.500,0.1023,0.7333,0.5625,1.0000,0.0714,0.8,1.0,,0.0000,0.0,0.4040,0.9512,0
32559,0.2444,0.500,0.1357,0.7333,0.5625,0.6667,0.0714,0.6,1.0,,0.0000,0.0,0.2020,0.9512,0


In [79]:


# Save the normalized dataset
cf = int(df.shape[0] * 0.8)
df[:cf].to_csv('train.csv', index=False)
df[cf:].to_csv('test.csv', index=False)