# Clean census data

## Import the data

In [1]:
import pandas as pd

# Load the census CSV from the project's data directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/census.csv')

## Gather basic information

In [2]:
# Print a summary of the frame: column labels, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlgt           32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [3]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Show the first row's value for every column. Each value is wrapped in a
# list so that print() renders its repr: strings appear quoted, which makes
# any leading/trailing whitespace visible.
for col in data.columns:
    print([data.loc[0, col]])

[39]
[' State-gov']
[77516]
[' Bachelors']
[13]
[' Never-married']
[' Adm-clerical']
[' Not-in-family']
[' White']
[' Male']
[2174]
[0]
[40]
[' United-States']
[' <=50K']


It turns out that the strings inside the DataFrame have leading whitespace. This includes the column names, so let's fix both!

## Fixing whitespace

In [5]:
# String columns are stored with the generic `object` dtype.
categorical_cols = data.select_dtypes(include='object').columns.tolist()

# Strip leading whitespace from every value using the vectorised string
# accessor. Unlike a per-element `entry.lstrip()`, this leaves missing
# values (NaN) untouched instead of raising AttributeError.
for col in categorical_cols:
    data[col] = data[col].str.lstrip()

# Column labels carry the same leading whitespace; compute the cleaned labels
# here so they can be applied to the frame afterwards.
cols_fixed = [label.lstrip() for label in data.columns]

In [6]:
# Pair every current column label with its whitespace-stripped replacement
# and rename the columns of the existing frame in place.
column_mapping = {old: new for old, new in zip(data.columns.tolist(), cols_fixed)}
data.rename(columns=column_mapping, inplace=True)

## Check results

In [7]:
# Re-inspect the first row after cleaning. Wrapping each value in a list
# makes print() show the repr, so quoted strings reveal whether any leading
# whitespace is left.
for col in data.columns:
    print([data.loc[0, col]])

[39]
['State-gov']
[77516]
['Bachelors']
[13]
['Never-married']
['Adm-clerical']
['Not-in-family']
['White']
['Male']
[2174]
[0]
[40]
['United-States']
['<=50K']


In [8]:
# Print the frame summary again to check the column labels after the rename.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Save the file

In [9]:
# NOTE: this writes to the same path the notebook reads its input from, so
# the raw file is overwritten with the cleaned data.
# index=False keeps the RangeIndex out of the file; with the default
# (index=True) the next read_csv of this file would pick the index up as an
# extra "Unnamed: 0" column, so re-running the notebook would not reproduce
# the same frame.
data.to_csv('data/census.csv', index=False)