In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Names given to the eleven raw fields that are kept (one per `usecols` entry).
COLUMNS = [
    "age",
    "workclass",
    "edu_level",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
    "income",
]

# Parser settings common to both downloads.
# - `usecols` selects fields by position in the raw files.
# - `sep` is a regex (comma with optional surrounding whitespace), which is why
#   the python engine is requested.
# - A bare "?" is read as a missing value.
_READ_OPTIONS = dict(
    names=COLUMNS,
    usecols=[0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)

train_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    **_READ_OPTIONS,
)

# The test file's first line is skipped before parsing.
test_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    skiprows=[0],
    **_READ_OPTIONS,
)


In [3]:
# Discard every record that has at least one missing field, in both splits.
train_df, test_df = (
    frame.dropna(axis=0, how="any") for frame in (train_df, test_df)
)

# To reduce complexity, binarize every attribute: each column is mapped to 0 or 1.


# (field, predicate) pairs, applied in this order; a field becomes 1 when its
# predicate holds for the current value and 0 otherwise.
_BINARY_RULES = (
    ('age', lambda value: value > 37),
    ('workclass', lambda value: value == 'Private'),
    ('edu_level', lambda value: value > 9),
    ('marital_status', lambda value: value == "Married-civ-spouse"),
    ('occupation', lambda value: value == "Craft-repair"),
    # Inverted on purpose: "Not-in-family" is the 0 class, everything else is 1.
    ('relationship', lambda value: not (value == "Not-in-family")),
    ('race', lambda value: value == "White"),
    ('sex', lambda value: value == "Male"),
    ('hours_per_week', lambda value: value > 40),
    ('native_country', lambda value: value == "United-States"),
    # Both spellings of the positive label are accepted (with and without a
    # trailing period).
    ('income', lambda value: value in ('>50K', '>50K.')),
)


def mapping(tuple):
    """Binarize one record in place and return it.

    Args:
        tuple: A row-like object supporting item get/set by field name (for
            example the Series handed over by ``DataFrame.apply(axis=1)``, or
            a plain dict). It must contain every field listed in
            ``_BINARY_RULES``.

    Returns:
        The same object that was passed in, with each listed field replaced
        by 0 or 1.
    """
    for field, is_positive in _BINARY_RULES:
        tuple[field] = 1 if is_positive(tuple[field]) else 0
    return tuple


# Binarize both splits, one row at a time, with `mapping`.
# NOTE: the raw frames are overwritten here, so this cell must not be re-run
# without reloading the data first (a second pass would binarize values that
# are already 0/1).
train_df, test_df = (
    frame.apply(mapping, axis="columns") for frame in (train_df, test_df)
)

In [4]:
# Show the binarized training frame.
train_df

Unnamed: 0,age,workclass,edu_level,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income
0,1,0,1,0,0,0,1,1,0,1,0
1,1,0,1,1,0,1,1,1,0,1,0
2,1,1,0,0,0,0,1,1,0,1,0
3,1,1,0,1,0,1,0,1,0,1,0
4,0,1,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
32556,0,1,1,1,0,1,1,0,0,1,0
32557,1,1,0,1,0,1,1,1,0,1,1
32558,1,1,0,0,0,1,1,0,0,1,0
32559,0,1,0,0,0,1,1,1,0,1,0


In [5]:
# First 30 rows (by position) of the binarized occupation column.
train_df["occupation"].iloc[:30]

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    1
28    0
29    1
30    0
31    0
Name: occupation, dtype: int64

In [20]:
# Wrap each split's values in a tensor and report the resulting shapes.
train_data = torch.from_numpy(train_df.to_numpy())
test_data = torch.from_numpy(test_df.to_numpy())
print(train_data.shape)
print(test_data.shape)

# Stack the train rows on top of the test rows to form a single tensor.
dataset = torch.cat([train_data, test_data], dim=0)
print(dataset.shape)

torch.Size([30162, 11])
torch.Size([15060, 11])
torch.Size([45222, 11])


In [25]:
# Number of rows in the combined tensor.
print(dataset.shape[0])

45222


In [38]:
from torch.utils.data import Dataset,DataLoader
class AdultDataset(Dataset):
    """Map-style dataset that serves the rows of an already-built tensor.

    Attributes:
        x: The tensor passed to the constructor (kept by reference).
        len: Size of ``x`` along its first dimension, captured at construction.
    """

    def __init__(self, data_set):
        """Store ``data_set`` and remember how many rows it has."""
        self.len = data_set.size(0)
        self.x = data_set

    def __len__(self):
        """Return the row count recorded by the constructor."""
        return self.len

    def __getitem__(self, index):
        """Return ``x[index]``."""
        return self.x[index]
# Seed for the loader's shuffling RNG. Without it, shuffle=True draws a
# different batch order on every kernel restart, so the batches shown further
# down could not be reproduced with Restart & Run All.
SHUFFLE_SEED = 0

adultDataset = AdultDataset(dataset)
dataLoader = DataLoader(
    dataset=adultDataset,
    batch_size=128,
    shuffle=True,
    # Dedicated generator: makes the shuffle order deterministic without
    # touching torch's global RNG state.
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)



In [39]:
# Draw a single batch from the loader so it can be inspected.
dataIter = iter(dataLoader)
n = next(dataIter)

In [41]:
# Show the batch pulled from dataLoader.
n

tensor([[0, 0, 0,  ..., 0, 1, 0],
        [1, 0, 1,  ..., 0, 1, 1],
        [0, 1, 0,  ..., 0, 1, 0],
        ...,
        [1, 1, 1,  ..., 0, 1, 0],
        [0, 1, 1,  ..., 1, 1, 1],
        [1, 1, 0,  ..., 0, 1, 0]])