Binarization

In [1]:
# Cell 1 - Import Libraries and Load Data
import pandas as pd

# The raw census file carries no header row, so the fifteen column labels are
# supplied here. 'census.data' must already be present under ../Data/.
census_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income',
]
data = pd.read_csv(
    '../Data/census.data',
    names=census_columns,
    header=None,
    index_col=False,
)

# Sanity check: the loader should have handed back a DataFrame.
print("Data Type:", type(data))


Data Type: <class 'pandas.core.frame.DataFrame'>


In [2]:
# Cell 2 - Select Subset of Variables
# Keep only the four columns used in the rest of the notebook:
# 'age', 'workclass', 'gender' and 'income'.
my_data = data[['age', 'workclass', 'gender', 'income']]

# Print the label and the preview as two separate calls. Passing both to a
# single print() puts the separator space right after the "\n", which shifts
# the header row one column out of line with the data rows beneath it.
print("Selected Data Subset:")
print(my_data.head())


Selected Data Subset:
    age          workclass   gender  income
0   39          State-gov     Male   <=50K
1   50   Self-emp-not-inc     Male   <=50K
2   38            Private     Male   <=50K
3   53            Private     Male   <=50K
4   28            Private   Female   <=50K


In [4]:
# Cell 3 - Check Data Types of the Selected Variables
# info() writes its report (index range, per-column non-null counts, dtypes)
# directly to stdout, so it is called bare rather than wrapped in print().
info_heading = "Data Types of the Selected Variables:"
print(info_heading)
my_data.info()


Data Types of the Selected Variables:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        32561 non-null  int64 
 1   workclass  32561 non-null  object
 2   gender     32561 non-null  object
 3   income     32561 non-null  object
dtypes: int64(1), object(3)
memory usage: 1017.7+ KB


In [None]:
# Cell 4 - Display Summary Statistics for Numeric Variables
# Heading first, then the default describe() table for the numeric columns.
print("Summary Statistics for Numeric Variables:")
numeric_summary = my_data.describe()
print(numeric_summary)


In [None]:
# Cell 5 - Display Summary Including Categorical Variables
# include='all' extends the summary table to every column, not just the
# numeric ones.
print("Summary Statistics Including Categorical Variables:")
full_summary = my_data.describe(include='all')
print(full_summary)


In [3]:
# Cell 6 - Display Unique Values in Categorical Variables
# List the distinct values of each categorical column, one line per column,
# in the order 'workclass', 'gender', 'income'.
for column in ('workclass', 'gender', 'income'):
    print(f"Unique values in '{column}':", my_data[column].unique())


Unique values in 'workclass': [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
Unique values in 'gender': [' Male' ' Female']
Unique values in 'income': [' <=50K' ' >50K']


In [5]:
# Cell 7 - Binarize (One-Hot Encode) Categorical Variables
# Pipe the four-column subset through pd.get_dummies() to obtain the
# binarized frame.
my_data_num = my_data.pipe(pd.get_dummies)

# Show the column layout of the binarized frame.
print("Transformed Data Frame with Dummy Variables:")
my_data_num.info()


Transformed Data Frame with Dummy Variables:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   age                          32561 non-null  int64
 1   workclass_ ?                 32561 non-null  bool 
 2   workclass_ Federal-gov       32561 non-null  bool 
 3   workclass_ Local-gov         32561 non-null  bool 
 4   workclass_ Never-worked      32561 non-null  bool 
 5   workclass_ Private           32561 non-null  bool 
 6   workclass_ Self-emp-inc      32561 non-null  bool 
 7   workclass_ Self-emp-not-inc  32561 non-null  bool 
 8   workclass_ State-gov         32561 non-null  bool 
 9   workclass_ Without-pay       32561 non-null  bool 
 10  gender_ Female               32561 non-null  bool 
 11  gender_ Male                 32561 non-null  bool 
 12  income_ <=50K                32561 non-null  bool 
 13  income_ >50K                 32561 non-null  bool 
dtypes: bool(13), int64(1)

In [None]:
# Cell 8 - Double-Check Transformation Results
# First the binarized frame's column layout ...
print("Inspection of Transformed Data Frame:")
my_data_num.info()

# ... then the original subset's summary and distinct categorical values,
# so the two can be compared.
print("Summary Including Categorical Variables (Original):")
print(my_data.describe(include='all'))
for column in ('workclass', 'gender', 'income'):
    print(f"Unique values in '{column}' (Original):", my_data[column].unique())
