## Data Import

In [1]:
import pandas as pd
import numpy as np
import pandas_profiling


[remove white spaces](https://stackoverflow.com/questions/43332057/pandas-strip-white-space)

In [2]:
# skipinitialspace=True tells the parser to skip spaces that follow a delimiter,
# so string fields are loaded without leading whitespace.
df = pd.read_csv('../data/census.csv', skipinitialspace=True)
# info() prints its summary itself and returns None; wrapping it in print()
# would only append a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Data Profiling
A bird's-eye view of the available data: column ranges, summary statistics, and histograms.

In [3]:
# Build a profiling report for the frame as loaded and render it as notebook widgets.
profile = pandas_profiling.ProfileReport(df)
profile.to_widgets()

# Data Cleaning

* drop duplicate rows
* replace `?` placeholders, where present, with NaN (missing values, to be imputed later)
* drop "capital-loss" and "capital-gain": more than 90% of their values are zero, which makes them a potential source of bias

In [8]:
# Build the cleaned frame as one chain of non-mutating steps. Every step returns
# a new frame, so `df` is left untouched and re-running this cell always gives
# the same result.
df_clean = (
    df
    .drop_duplicates()                        # remove exact duplicate rows
    .replace({'?': np.nan})                   # '?' becomes NaN; missing values are to be imputed later
    .drop(columns=[
        "capital-gain",                       # dropped together with capital-loss (see notes above)
        "capital-loss",
        "education-num",                      # numeric encoding of `education`
        "fnlgt",                              # final weight: treated as an irrelevant feature
    ])
    # Prepare the target for prediction: '<=50K' -> 0, '>50K' -> 1.
    .assign(salary=lambda frame: frame["salary"].replace({'<=50K': 0, '>50K': 1}))
)

# info() prints its summary itself and returns None; wrapping it in print()
# would only append a stray "None" line to the output.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32537 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   workclass       30701 non-null  object
 2   education       32537 non-null  object
 3   marital-status  32537 non-null  object
 4   occupation      30694 non-null  object
 5   relationship    32537 non-null  object
 6   race            32537 non-null  object
 7   sex             32537 non-null  object
 8   hours-per-week  32537 non-null  int64 
 9   native-country  31955 non-null  object
 10  salary          32537 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.0+ MB
None


In [9]:
# Preview the first rows of the cleaned frame.
df_clean.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [26]:
# Build a profiling report for the cleaned frame and render it as notebook widgets.
clean_profile = pandas_profiling.ProfileReport(df_clean)
clean_profile.to_widgets()

Summarize dataset: 100%|██████████| 25/25 [00:10<00:00,  2.38it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.61s/it]
Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  nu

In [13]:
# Write the cleaned frame into the same directory the raw file was read from
# ('../data/'); the previous './data/' target pointed at a different directory
# relative to the notebook. index=False keeps the row index out of the file.
df_clean.to_csv('../data/clean_census_data.csv', index=False)

VBox(children=(Label(value=' 0.08MB of 0.08MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…