In [2]:
from data import Data
import numpy as np
# Directory holding the CSV files. The trailing slash matters: the cells below
# build file paths by plain string concatenation (e.g. DATA_DIR + 'train.csv').
DATA_DIR = 'data/'

### Loading the data:

You can construct a `Data` object in two ways: pass an already-loaded NumPy array via `data=`, or pass the path to a file via `fpath=`.

In [3]:
# Read the training CSV as an array of strings, then wrap it in a Data object.
data = np.loadtxt(
    DATA_DIR + 'train.csv',
    dtype=str,
    delimiter=',',
)
data_obj = Data(data=data)

In [4]:
# Second way to build a Data object: hand it a file path through `fpath`.
# Note this loads test.csv, not the train.csv used for data_obj above.
data_obj2 = Data(fpath = DATA_DIR + 'test.csv')

### Attribute information

Each attribute is represented by a custom `Attribute` object with the following properties:

- Column name
- Column Index
- Possible values


#### Get the list of attributes:

`attributes` is a dictionary that maps each attribute name to its corresponding `Attribute` object.

In [5]:
# Display the name -> Attribute mapping of the Data object built from train.csv.
data_obj.attributes

{'cap-shape': <data.Attribute at 0x1e8c342bfd0>,
 'cap-surface': <data.Attribute at 0x1e8c342bed0>,
 'cap-color': <data.Attribute at 0x1e8c342be50>,
 'bruises': <data.Attribute at 0x1e8c342be10>,
 'gill-attachment': <data.Attribute at 0x1e8c342bdd0>,
 'gill-spacing': <data.Attribute at 0x1e8c342bd90>,
 'gill-size': <data.Attribute at 0x1e8c342bd50>,
 'gill-color': <data.Attribute at 0x1e8c342bd10>,
 'stalk-shape': <data.Attribute at 0x1e8c342bcd0>,
 'stalk-root': <data.Attribute at 0x1e8c342bc90>,
 'stalk-surface-above-ring': <data.Attribute at 0x1e8c342bc50>,
 'stalk-surface-below-ring': <data.Attribute at 0x1e8c342bc10>,
 'stalk-color-above-ring': <data.Attribute at 0x1e8c342bbd0>,
 'stalk-color-below-ring': <data.Attribute at 0x1e8c342ba50>,
 'veil-type': <data.Attribute at 0x1e8c342b510>,
 'veil-color': <data.Attribute at 0x1e8c342bb10>,
 'ring-number': <data.Attribute at 0x1e8c342bad0>,
 'ring-type': <data.Attribute at 0x1e8c342ba90>,
 'spore-print-color': <data.Attribute at 0x1e8

#### Fetch information for a specific attribute:

##### Column index:

In [6]:
# Column index recorded on the 'cap-color' Attribute object.
data_obj.attributes['cap-color'].index

2

##### Possible values:

In [7]:
# Possible values recorded on the 'cap-color' Attribute object.
data_obj.attributes['cap-color'].possible_vals

array(['b', 'c', 'e', 'g', 'n', 'p', 'r', 'u', 'w', 'y'], dtype='<U24')

### Data selection:

#### Select rows:

Selects all rows in which the given attribute has the given value. Returns a new `Data` object (a copy) containing only the selected rows.

In [8]:
# Keep only the rows whose 'cap-color' value is 'b'; the result is bound to a
# separate name, so data_obj itself is not rebound.
data_subset = data_obj.get_row_subset('cap-color', 'b')

In [9]:
# Bare expression: shows the repr, confirming the subset is itself a Data instance.
data_subset

<data.Data at 0x1e8b4fb3e50>

In [10]:
# Underlying string array held by the subset (NumPy elides the middle rows and
# columns with '...' when displaying it).
data_subset.raw_data

array([['e', 'x', 'y', ..., 'w', 'c', 'w'],
       ['p', 'f', 's', ..., 'h', 's', 'g'],
       ['p', 'x', 's', ..., 'h', 's', 'u'],
       ...,
       ['p', 'f', 's', ..., 'h', 's', 'g'],
       ['p', 'b', 'y', ..., 'r', 'v', 'g'],
       ['e', 'k', 'y', ..., 'w', 'c', 'w']], dtype='<U24')

#### Select Columns:

Selects the column that corresponds to the given attribute name. We can use it to verify that the row-subset call above returned the right rows: every value in the `cap-color` column of the subset should be `b`.

In [11]:
# Sanity check on the row filter above: every entry is expected to be 'b'.
data_subset.get_column('cap-color')

array(['b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b'], dtype='<U24')

We can also select multiple columns at once by passing a list of attribute names:

In [14]:
# Pass a list of attribute names to request several columns in one call.
# NOTE(review): the recorded output below is a dict of Attribute objects keyed
# by 'b' and 'e', not column values, and the execution count jumps from 11 to
# 14 -- the output may be stale; re-run the notebook top to bottom to confirm.
data_subset.get_column(['cap-color', 'label'])

{'b': <data.Attribute at 0x1e8c3485750>,
 'e': <data.Attribute at 0x1e8b4fb5bd0>}