# Common Preprocessing
This file shows common preprossing steps for our tasks.
## Collect and Explore the Data
The first thing is to convert the data we collected to a format we can easily manipulate. Then we check the the size and type of data.

In [10]:
import pandas as pd

with open("adult.test", "r") as f:
    lines = f.readlines()
    lines = [line.replace('.','') for line in lines[1:]]
    data_test = pd.DataFrame([line.strip().split(", ") for line in lines])
    
# print(data_test)

data_train = pd.read_csv('adult.data', header = None)
data_train = data_train.replace(' ', '', regex=True)

print(data_train.shape)
print(data_test.shape)

print(data_train.head())

(32561, 15)
(16282, 15)
   0                 1       2          3   4                   5   \
0  39         State-gov   77516  Bachelors  13       Never-married   
1  50  Self-emp-not-inc   83311  Bachelors  13  Married-civ-spouse   
2  38           Private  215646    HS-grad   9            Divorced   
3  53           Private  234721       11th   7  Married-civ-spouse   
4  28           Private  338409  Bachelors  13  Married-civ-spouse   

                  6              7      8       9     10  11  12  \
0       Adm-clerical  Not-in-family  White    Male  2174   0  40   
1    Exec-managerial        Husband  White    Male     0   0  13   
2  Handlers-cleaners  Not-in-family  White    Male     0   0  40   
3  Handlers-cleaners        Husband  Black    Male     0   0  40   
4     Prof-specialty           Wife  Black  Female     0   0  40   

              13     14  
0  United-States  <=50K  
1  United-States  <=50K  
2  United-States  <=50K  
3  United-States  <=50K  
4           Cuba

In [11]:
data_train.columns = ['age','workclass','fnlwgt','education','educational_num','marital-status','occupation','relationship','race','gender','capital-gain','capital-loss', 'hours-per-week','native-country','income']
data_test.columns = ['age','workclass','fnlwgt','education','educational_num','marital-status','occupation','relationship','race','gender','capital-gain','capital-loss', 'hours-per-week','native-country','income']

## Data Cleaning 

### Data Transformation 
We convert the target atrribute (i.e., "<=50K" and >"50K") to binary values (i.e., 0 and 1) that we can easily manipulate.

In [12]:
data_train['income'].replace({"<=50K":0, ">50K":1}, inplace = True)
data_test['income'].replace({"<=50K":0, ">50K":1}, inplace = True)
data_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


### Missing Value

In [13]:
data_train.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational_num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

According to the results, there is no null value in the dataset. 

However, there are question marks (i.e., "?") to represent missing values. So we decided to replace them with the most frequent element (mode) in every colum.

In [14]:
count_before_tr = 0
count_before_te = 0

for i in range(len(data_train)):
    for j in range(len(data_train.columns)):
        if data_train.iloc[i][j] == "?":
            count_before_tr += 1

for i in range(len(data_test)):
    for j in range(len(data_test.columns)):
        if data_test.iloc[i][j] == "?":
            count_before_te += 1
        
print(count_before_tr)
print(count_before_te)



4262
2203


In [15]:
for col in data_train:
    mode = data_train[col].mode().values[0]
    data_train[col].replace("?", mode, inplace = True)

In [16]:
for col in data_test:
    mode = data_train[col].mode().values[0]
    data_test[col].replace("?", mode, inplace = True)

In [17]:
count_after_tr = 0
count_after_te = 0

for i in range(len(data_train)):
    for j in range(len(data_train.columns)):
        if data_train.iloc[i][j] == "?":
            count_after_tr +=1

for i in range(len(data_test)):
    for j in range(len(data_test.columns)):
        if data_train.iloc[i][j] == "?":
            count_after_tr +=1
        
print(count_after_tr)
print(count_after_te)


0
0


In [18]:
print(data_train.shape)
print(data_test.shape)

(32561, 15)
(16282, 15)


## Export Processed Data

In [19]:
data_train.to_csv('data_train.csv', index=False)
data_test.to_csv('data_test.csv', index=False)