In [4]:
import pandas as pd


# Load the raw census extract from the working directory into a DataFrame.
frame = pd.read_csv(filepath_or_buffer='census.csv')

In [7]:
# Normalise the column labels: swap '-' for '_' and trim surrounding whitespace.
frame.columns = frame.columns.map(lambda label: label.replace('-', '_').strip())
# Summarise column dtypes and non-null counts after the rename.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [15]:
def remove_dash(frame:pd.DataFrame, col_name:str) -> pd.DataFrame:
    """
    Replace every '-' with '_' in the values of a single text column.

    The input frame is left untouched; the replacement is applied to a copy.

    Input:
        frame: (pd.DataFrame) source frame
        col_name: (str) Name of the text column whose values are rewritten
    Output:
        (pd.DataFrame) copy of ``frame`` with the dashes in ``col_name`` replaced
    """
    # Build the replaced values from the input column first ...
    without_dashes = frame[col_name].str.replace('-', '_')
    # ... then write them into a copy so the caller's frame is not modified.
    result = frame.copy()
    result[col_name] = without_dashes
    return result


def clean_spaces(frame:pd.DataFrame, col_name:str) -> pd.DataFrame:
    """
    Strip leading and trailing whitespace from the values of one column.

    Non-missing values are converted with ``str()`` and stripped. Missing
    values (None / NaN) are left as missing, so they are not turned into the
    literal strings 'None' or 'nan'.

    Input:
        frame: (pd.DataFrame) source frame; it is not modified
        col_name: (str) Column whose values will be stripped
    Output:
        frame_copy: (pd.DataFrame) copy of ``frame`` with the cleaned column
    """
    frame_copy = frame.copy()
    # na_action='ignore' skips missing entries instead of passing them to str().
    frame_copy[col_name] = frame_copy[col_name].map(
        lambda value: str(value).strip(), na_action='ignore'
    )
    return frame_copy


# Clean every object-dtype column: dashes become underscores, then the values
# are stripped of surrounding whitespace. Each helper returns a new frame, so
# `frame` is rebound on every pass.
for coluna in frame.select_dtypes(include='object').columns:
    frame = (
        frame
        .pipe(remove_dash, col_name=coluna)
        .pipe(clean_spaces, col_name=coluna)
    )
frame

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State_gov,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,<=50K
1,50,Self_emp_not_inc,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,<=50K
2,38,Private,215646,HS_grad,9,Divorced,Handlers_cleaners,Not_in_family,White,Male,0,0,40,United_States,<=50K
3,53,Private,234721,11th,7,Married_civ_spouse,Handlers_cleaners,Husband,Black,Male,0,0,40,United_States,<=50K
4,28,Private,338409,Bachelors,13,Married_civ_spouse,Prof_specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc_acdm,12,Married_civ_spouse,Tech_support,Wife,White,Female,0,0,38,United_States,<=50K
32557,40,Private,154374,HS_grad,9,Married_civ_spouse,Machine_op_inspct,Husband,White,Male,0,0,40,United_States,>50K
32558,58,Private,151910,HS_grad,9,Widowed,Adm_clerical,Unmarried,White,Female,0,0,40,United_States,<=50K
32559,22,Private,201490,HS_grad,9,Never_married,Adm_clerical,Own_child,White,Male,0,0,20,United_States,<=50K


In [None]:
# Write the cleaned frame to disk as a ';'-separated file, without the index column.
frame.to_csv(
    path_or_buf='cleaned_census.csv',
    sep=';',
    index=False,
)