In [1]:
import pandas as pd


# Read census.csv (resolved against the current working directory) into `frame`.
frame = pd.read_csv('census.csv')

In [2]:
# Normalise the header: dashes become underscores, then surrounding whitespace is trimmed.
frame.columns = list(
    map(lambda name: name.replace('-', '_').strip(), frame.columns.tolist())
)
# Print column names, non-null counts and dtypes.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [3]:
def remove_dash(frame:pd.DataFrame, col_name:str) -> pd.DataFrame:
    """
    Replace every '-' character with '_' in one text column.

    Input:
        frame: (pd.DataFrame) Source frame; it is copied, never modified
        col_name: (str) Name of the string column whose values are rewritten
    Output:
        frame_copy: (pd.DataFrame) Copy of `frame` with the column replaced
    """
    frame_copy = frame.copy()
    # regex=False makes '-' a literal match whatever the installed pandas uses
    # as its default for `regex`.
    frame_copy[col_name] = frame_copy[col_name].str.replace('-', '_', regex=False)
    return frame_copy


def clean_spaces(frame:pd.DataFrame, col_name:str) -> pd.DataFrame:
    """
    Strip leading and trailing whitespace from the values of one column.

    Non-missing values are converted with str() before stripping. Missing
    values (NaN / None) are kept as they are instead of being turned into
    the literal strings 'nan' / 'None'.

    Input:
        frame: (pd.DataFrame) Source frame; it is copied, never modified
        col_name: (str) Name of the column whose values are stripped
    Output:
        frame_copy: (pd.DataFrame) Copy of `frame` with the column replaced
    """
    frame_copy = frame.copy()
    column = frame_copy[col_name]
    stripped = column.map(lambda value: str(value).strip())
    # Where the original entry was missing, restore it rather than keep its str() form.
    frame_copy[col_name] = stripped.where(column.notna(), column)
    return frame_copy


# Run both cleaners over every object-dtype column, rebinding `frame` to the
# cleaned copy after each column.
text_columns = frame.select_dtypes('object').columns.tolist()
for text_column in text_columns:
    frame = clean_spaces(
        frame = remove_dash(frame = frame, col_name = text_column),
        col_name = text_column,
    )
frame

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State_gov,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,<=50K
1,50,Self_emp_not_inc,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,<=50K
2,38,Private,215646,HS_grad,9,Divorced,Handlers_cleaners,Not_in_family,White,Male,0,0,40,United_States,<=50K
3,53,Private,234721,11th,7,Married_civ_spouse,Handlers_cleaners,Husband,Black,Male,0,0,40,United_States,<=50K
4,28,Private,338409,Bachelors,13,Married_civ_spouse,Prof_specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc_acdm,12,Married_civ_spouse,Tech_support,Wife,White,Female,0,0,38,United_States,<=50K
32557,40,Private,154374,HS_grad,9,Married_civ_spouse,Machine_op_inspct,Husband,White,Male,0,0,40,United_States,>50K
32558,58,Private,151910,HS_grad,9,Widowed,Adm_clerical,Unmarried,White,Female,0,0,40,United_States,<=50K
32559,22,Private,201490,HS_grad,9,Never_married,Adm_clerical,Own_child,White,Male,0,0,20,United_States,<=50K


In [4]:
#frame.to_csv('cleaned_census.csv', sep = ';', index = False)

In [13]:
# Print column names, non-null counts and dtypes of the current `frame`.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# List the names of the object-dtype columns.
frame.select_dtypes('object').columns.tolist()

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'salary']

In [14]:
# Map each column to its list of distinct values (object-dtype columns) or to
# the `int` type (every other column), keeping the frame's column order.
# The set of object columns is computed once instead of once per column.
object_columns = set(frame.select_dtypes('object').columns)

dicionario = {
    column: frame[column].unique().tolist() if column in object_columns else int
    for column in frame.columns.tolist()
}


dicionario

{'age': int,
 'workclass': ['State_gov',
  'Self_emp_not_inc',
  'Private',
  'Federal_gov',
  'Local_gov',
  '?',
  'Self_emp_inc',
  'Without_pay',
  'Never_worked'],
 'fnlgt': int,
 'education': ['Bachelors',
  'HS_grad',
  '11th',
  'Masters',
  '9th',
  'Some_college',
  'Assoc_acdm',
  'Assoc_voc',
  '7th_8th',
  'Doctorate',
  'Prof_school',
  '5th_6th',
  '10th',
  '1st_4th',
  'Preschool',
  '12th'],
 'education_num': int,
 'marital_status': ['Never_married',
  'Married_civ_spouse',
  'Divorced',
  'Married_spouse_absent',
  'Separated',
  'Married_AF_spouse',
  'Widowed'],
 'occupation': ['Adm_clerical',
  'Exec_managerial',
  'Handlers_cleaners',
  'Prof_specialty',
  'Other_service',
  'Sales',
  'Craft_repair',
  'Transport_moving',
  'Farming_fishing',
  'Machine_op_inspct',
  'Tech_support',
  '?',
  'Protective_serv',
  'Armed_Forces',
  'Priv_house_serv'],
 'relationship': ['Not_in_family',
  'Husband',
  'Wife',
  'Own_child',
  'Unmarried',
  'Other_relative'],
 'rac

In [16]:
# Net capital movement per row: gain minus loss.
frame.loc[:, 'capital_balance'] = frame['capital_gain'].sub(frame['capital_loss'])
# Show only the rows that report a gain or a loss.
moved_capital = frame['capital_gain'].gt(0) | frame['capital_loss'].gt(0)
frame.loc[moved_capital]

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary,capital_balance
0,39,State_gov,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,<=50K,2174
8,31,Private,45781,Masters,14,Never_married,Prof_specialty,Not_in_family,White,Female,14084,0,50,United_States,>50K,14084
9,42,Private,159449,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,5178,0,40,United_States,>50K,5178
23,43,Private,117037,11th,7,Married_civ_spouse,Transport_moving,Husband,White,Male,0,2042,40,United_States,<=50K,-2042
32,45,Private,386940,Bachelors,13,Divorced,Exec_managerial,Own_child,White,Male,0,1408,40,United_States,<=50K,-1408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32515,66,Federal_gov,47358,10th,6,Married_civ_spouse,Craft_repair,Husband,White,Male,3471,0,40,United_States,<=50K,3471
32518,57,Local_gov,110417,HS_grad,9,Married_civ_spouse,Craft_repair,Husband,White,Male,99999,0,40,United_States,>50K,99999
32538,38,Private,139180,Bachelors,13,Divorced,Prof_specialty,Unmarried,Black,Female,15020,0,45,United_States,>50K,15020
32548,65,Self_emp_not_inc,99359,Prof_school,15,Never_married,Prof_specialty,Not_in_family,White,Male,1086,0,60,United_States,<=50K,1086


In [21]:
# Rows reporting both a capital gain and a capital loss; the displayed result has no rows.
frame.loc[(frame['capital_gain']> 0) & (frame['capital_loss'] > 0)]

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary,capital_balance


In [24]:
def remove_unwanted(frame:pd.DataFrame, value:str = '?') -> pd.DataFrame:
    """
    Drop every row in which any object-dtype column equals `value`.

    Input:
        frame: (pd.DataFrame) Frame to filter; it is not modified
        value: (str) Placeholder that marks a row as unwanted, '?' by default
    Output:
        (pd.DataFrame) Rows of `frame` without the placeholder; the original
        index labels are kept (the index is not reset)
    """
    text_columns = frame.select_dtypes('object').columns
    if len(text_columns) == 0:
        # Nothing to compare against: hand the frame back unchanged.
        return frame

    # A row is kept only when none of its text columns holds the placeholder.
    keep_row = frame[text_columns].ne(value).all(axis=1)
    return frame.loc[keep_row]

# Rebind `frame` to the filtered result, using the default placeholder '?'.
frame = remove_unwanted(frame)

In [26]:
# Rebuild the column -> distinct values (object-dtype columns) / `int` (every
# other column) mapping from the current `frame`, keeping its column order.
# The set of object columns is computed once instead of once per column.
object_columns = set(frame.select_dtypes('object').columns)

dicionario = {
    column: frame[column].unique().tolist() if column in object_columns else int
    for column in frame.columns.tolist()
}


dicionario

{'age': int,
 'workclass': ['State_gov',
  'Self_emp_not_inc',
  'Private',
  'Federal_gov',
  'Local_gov',
  'Self_emp_inc',
  'Without_pay'],
 'fnlgt': int,
 'education': ['Bachelors',
  'HS_grad',
  '11th',
  'Masters',
  '9th',
  'Some_college',
  'Assoc_acdm',
  '7th_8th',
  'Doctorate',
  'Assoc_voc',
  'Prof_school',
  '5th_6th',
  '10th',
  'Preschool',
  '12th',
  '1st_4th'],
 'education_num': int,
 'marital_status': ['Never_married',
  'Married_civ_spouse',
  'Divorced',
  'Married_spouse_absent',
  'Separated',
  'Married_AF_spouse',
  'Widowed'],
 'occupation': ['Adm_clerical',
  'Exec_managerial',
  'Handlers_cleaners',
  'Prof_specialty',
  'Other_service',
  'Sales',
  'Transport_moving',
  'Farming_fishing',
  'Machine_op_inspct',
  'Tech_support',
  'Craft_repair',
  'Protective_serv',
  'Armed_Forces',
  'Priv_house_serv'],
 'relationship': ['Not_in_family',
  'Husband',
  'Wife',
  'Own_child',
  'Unmarried',
  'Other_relative'],
 'race': ['White',
  'Black',
  'Asia

In [3]:
import pandas as pd


# Read inference_data.csv with a comma separator and display it; the result is not assigned.
pd.read_csv('inference_data.csv', sep = ',')

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,31,Private,210008,HS_grad,9,Never_married,Sales,Own_child,White,Female,0,0,40,United_States,<=50K


In [10]:
# Old-style % formatting: each %.2f renders one float with two decimals.
values = (3.13, 0.14)
test = "Aloha %.2f (%.2f)" % values
test

'Aloha 3.13 (0.14)'

In [1]:
# Display the type of None.
type(None)

NoneType