In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import glob

### Clean Adult Dataset
https://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
## Load the raw Adult CSV into a DataFrame
## NOTE(review): no `header=` is passed, so pandas consumes the first line of the file
##   as the header row. Confirm the file really has one; otherwise the first record is lost.
df_adult = pd.read_csv(filepath_or_buffer='datasets/original/adult_train.csv')

In [3]:
## Name the 15 columns: 14 attributes followed by the 'salary' target
df_adult.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]

## NOTE: rows containing unknown ('?') values are not removed here; the one-hot
##   encoding step below drops the '<attribute>_?' indicator columns instead.

In [4]:
## One-hot encode every categorical attribute (the 'salary' target included)
categorical_attributes = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                          'native-country', 'salary']

for categorical_attribute in categorical_attributes:

    ## Strip surrounding white space from the values while encoding
    one_hot = pd.get_dummies(df_adult[categorical_attribute].str.strip(), prefix=categorical_attribute)

    ## Drop the indicator column that encodes unknown ('?') values, if there is one.
    ## The column is tested for explicitly rather than by catching exceptions, so an
    ## unrelated failure is never silently reported as "does not exist".
    unknown_column = categorical_attribute + '_?'
    if unknown_column in one_hot.columns:
        one_hot = one_hot.drop(columns=[unknown_column])
    else:
        print(categorical_attribute + '_? does not exist.')

    ## Replace the original column with its encoded columns (appended on the right)
    df_adult = pd.concat([df_adult.drop(columns=[categorical_attribute]), one_hot], axis=1)

education_? does not exist.
marital-status_? does not exist.
relationship_? does not exist.
race_? does not exist.
sex_? does not exist.
salary_? does not exist.


In [5]:
## 'salary_>50K' alone carries the binary target, so its complement 'salary_<=50K' is dropped
df_adult = df_adult.drop(columns=['salary_<=50K'])

In [6]:
## Min-max scale every column (MinMaxScaler's default feature range), then rebuild
## the DataFrame under its original column names
attributes = df_adult.columns.values
min_max_scaler = preprocessing.MinMaxScaler()
x = df_adult.values
x_scaled = min_max_scaler.fit_transform(x)
df_adult = pd.DataFrame(x_scaled, columns=attributes)

In [7]:
## Write the cleaned Adult data to CSV, leaving out the row index
df_adult.to_csv(path_or_buf='datasets/cleaned/adult_cleaned.csv', index=False)

### Clean Occupancy Dataset
https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+

In [8]:
## Load the three occupancy CSVs, then stack them row-wise into a single frame
df_occupancy_train, df_occupancy_test1, df_occupancy_test2 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/original/occupancy/occupancy_train.csv',
        'datasets/original/occupancy/occupancy_test1.csv',
        'datasets/original/occupancy/occupancy_test2.csv',
    )
]
## pd.concat stacks along axis 0 (rows) by default
df_occupancy = pd.concat([df_occupancy_train, df_occupancy_test1, df_occupancy_test2])

In [9]:
## Drop the 'date' column
df_occupancy = df_occupancy.drop(columns=['date'])

In [10]:
## Min-max scale every column (MinMaxScaler's default feature range), then rebuild
## the DataFrame under its original column names
attributes = df_occupancy.columns.values
min_max_scaler = preprocessing.MinMaxScaler()
x = df_occupancy.values
x_scaled = min_max_scaler.fit_transform(x)
df_occupancy = pd.DataFrame(x_scaled, columns=attributes)

In [11]:
## Write the cleaned Occupancy data to CSV, leaving out the row index
df_occupancy.to_csv(path_or_buf='datasets/cleaned/occupancy_cleaned.csv', index=False)

### Clean HTRU2 Dataset
https://archive.ics.uci.edu/ml/datasets/HTRU2

In [12]:
## Load the raw HTRU2 CSV into a DataFrame
## NOTE(review): no `header=` is passed, so pandas consumes the first line of the file
##   as the header row. Confirm the file really has one; otherwise the first record is lost.
df_HTRU2 = pd.read_csv(filepath_or_buffer='datasets/original/HTRU2_train.csv')

In [13]:
## Column naming: the 'ip_' prefix stands for integrated profile, 'ds_' for DM-SMR curve;
## the last column, 'is_pulsar', is the label
df_HTRU2.columns = [
    'ip_mean', 'ip_standard_deviation', 'ip_excess_kurtosis', 'ip_skewness',
    'ds_mean', 'ds_standard_deviation', 'ds_excess_kurtosis', 'ds_skewness',
    'is_pulsar',
]

In [14]:
## Min-max scale every column (MinMaxScaler's default feature range), then rebuild
## the DataFrame under its original column names
attributes = df_HTRU2.columns.values
min_max_scaler = preprocessing.MinMaxScaler()
x = df_HTRU2.values
x_scaled = min_max_scaler.fit_transform(x)
df_HTRU2 = pd.DataFrame(x_scaled, columns=attributes)

In [15]:
## Write the cleaned HTRU2 data to CSV, leaving out the row index
df_HTRU2.to_csv(path_or_buf='datasets/cleaned/HTRU2_cleaned.csv', index=False)

### Clean Activity Dataset
https://archive.ics.uci.edu/ml/datasets/Activity+recognition+with+healthy+older+people+using+a+batteryless+wearable+sensor

In [16]:
## Read dataset: every CSV in the S1_Dataset folder is loaded and stacked into one frame

## Attribute encoding
#  time -> Time in seconds
#  acc_f -> Acceleration reading in G for frontal axis
#  acc_v -> Acceleration reading in G for vertical axis
#  acc_l -> Acceleration reading in G for lateral axis
#  sensor_id -> Id of antenna reading sensor
#  rssi -> Received signal strength indicator (RSSI)
#  phase -> Phase
#  frequency -> Frequency
#  activity -> Label of activity, 1: sit on bed, 2: sit on chair, 3: lying, 4: ambulating

activity_dataset_path = 'datasets/original/activity/S1_Dataset'

## Sort the matches: the order returned by glob.glob depends on the file system, which
## would make the row order of the combined frame differ between machines
activity_dataset_files = sorted(glob.glob(activity_dataset_path + "/*.csv"))

## Fail with a clear message rather than pd.concat's generic "No objects to concatenate"
if not activity_dataset_files:
    raise FileNotFoundError('No CSV files found in ' + activity_dataset_path)

## header=None: the first line of each file is read as data, not as column names
dfs_activity = [pd.read_csv(activity_dataset_file, header=None)
                for activity_dataset_file in activity_dataset_files]

## Stack all files row-wise with a fresh 0..n-1 index, then name the nine columns
df_activity = pd.concat(dfs_activity, axis=0, ignore_index=True)
df_activity.columns = ['time', 'acc_f', 'acc_v', 'acc_l', 'sensor_id', 'rssi', 'phase', 'frequency', 'activity']

In [17]:
## Remove the 'time' column
df_activity = df_activity.drop(columns=['time'])

In [18]:
## One-hot encode the categorical attributes in a single call. With `columns=`,
## pd.get_dummies removes each listed column, appends its indicator columns on the
## right (in list order) and prefixes them with the source column's name.
categorical_attributes = ['sensor_id', 'activity']
df_activity = pd.get_dummies(df_activity, columns=categorical_attributes)

In [19]:
## Turn this into a binary classification dataset. Since we are more interested in
##   whether the person is moving, only 'activity_4' is kept as the label (renamed to
##   'is_ambulating'); 'activity_1', 'activity_2' and 'activity_3' are dropped.
df_activity = (
    df_activity
    .drop(columns=['activity_1', 'activity_2', 'activity_3'])
    .rename(columns={'activity_4': 'is_ambulating'})
)

In [20]:
## Min-max scale every column (MinMaxScaler's default feature range), then rebuild
## the DataFrame under its original column names
attributes = df_activity.columns.values
min_max_scaler = preprocessing.MinMaxScaler()
x = df_activity.values
x_scaled = min_max_scaler.fit_transform(x)
df_activity = pd.DataFrame(x_scaled, columns=attributes)

In [21]:
## Write the cleaned Activity data to CSV, leaving out the row index
df_activity.to_csv(path_or_buf='datasets/cleaned/activity_cleaned.csv', index=False)