In [4]:
%run "./obj/Array.ipynb"
%run "./obj/Index.ipynb"
%run "./obj/Series.ipynb"

# DataFrame Basics
A DataFrame is a collection of an index (a single one in this class) and one or more series.  
The series are ordered, which makes them the `columns` of the dataframe.  

In [5]:
class DF():
    """A minimal dataframe: one shared index plus an ordered collection of named series.

    Attributes:
        columns: column names, in the iteration order of the `data` mapping.
        data: dict mapping each column name to an `Srs` built on `index`.
        index: the row index shared by every column.
        shape: `(n_rows, n_columns)` as computed by `_validate`.
    """

    def __init__(self, data, index=None):
        """Build the frame from a mapping of column name -> values.

        Args:
            data: mapping of column name to a sequence of values or an `Srs`.
                For an `Srs` only its `.values` are used; every column is
                re-wrapped in a new `Srs` that carries this frame's index.
            index: the row index. When None, `Idx(range(n_rows))` is used.

        Raises:
            AssertionError: if `data` has no columns, the columns differ in
                length, or the index is empty or does not have exactly one
                label per row.
        """
        self.data = data
        self.index = index
        self._validate()
        self.columns = []  # Like data.keys(), but we own (and can set) the order
        self.data = {}
        for name, vals in data.items():
            self.columns.append(name)
            if isinstance(vals, Srs):
                vals = vals.values
            self.data[name] = Srs(vals, name=name, index=self.index)

    def _validate(self):
        """Check the raw input, set `self.shape` and fill in a default index."""
        cols = list(self.data.values())
        assert cols, "A dataframe needs at least one column"
        n_rows = len(cols[0])
        self.shape = (n_rows, len(cols))
        for col in cols:
            assert len(col) == n_rows, f"Not all columns are of same length {self.shape}, {len(col)}"
        # Compare with None explicitly: the truth value of an index object is
        # not a reliable way to tell whether the caller supplied one.
        if self.index is None:
            self.index = Idx(range(n_rows))  # In real code, this will not be in validate.
        assert len(self.index), "Index must not be empty"
        # Every row needs exactly one label; otherwise rows and labels silently misalign.
        assert len(self.index) == n_rows, f"Index has {len(self.index)} labels for {n_rows} rows"

    def __len__(self):
        """Number of rows."""
        return self.shape[0]

In [6]:
# Sample input: three equally long columns, keyed by column name.
data = {
    'flights': [10, 12, 50, 40],
    'visited': [6, 1, 2, 0],
    'country': ['Israel', 'Georgia', 'US', 'US'],
}
# One row label for each entry of the columns above.
index = Idx(['tlv', 'tbs', 'jfk', 'ewr'])

In [7]:
# Build the frame from the sample `data` and `index` of the previous cell.
df = DF(data, index)

In [8]:
# DF defines no __str__/__repr__ yet, so print() shows the default object repr.
print(df)
# Each column is stored as an Srs under its name; the bare last expression displays it.
df.data['flights']

<__main__.DF object at 0x10d1172b0>


flights: 
 tlv    10  
 tbs    12  
 jfk    50  
 ewr    40  

# See the Data

In [9]:
def print_df(df):
    """Render `df` as a plain-text table and return it as a string.

    The first line holds an empty top-left corner followed by the column
    names; each following line holds an index label followed by that row's
    values. Every cell is centred in a column of the same width.

    The width is the longest rendered cell (header, index label or value)
    plus 2, with a minimum of 8. Measuring values as well as headers keeps
    long values from running into the neighbouring column.

    Args:
        df: object exposing `columns`, `index.values.data` and
            `data[col].values`, as `DF` does.

    Returns:
        The table as one string; every line ends with a newline.
    """
    header = [''] + [str(col_name) for col_name in df.columns]  # '' is the top-left corner
    collection = [df.index.values.data] + [df.data[col].values for col in df.columns]
    body = [[str(cell) for cell in row] for row in zip(*collection)]
    lines = [header] + body
    widest = max(len(cell) for line in lines for cell in line)
    maxlen = max(widest + 2, 8)  # +2 makes for nice spaces between columns
    return ''.join(
        ''.join(cell.center(maxlen) for cell in line) + '\n'
        for line in lines
    )

In [10]:
# print_df only builds the string; print() renders its newlines as table rows.
print(print_df(df))

          flights  visited  country 
   tlv       10       6      Israel 
   tbs       12       1     Georgia 
   jfk       50       2        US   
   ewr       40       0        US   



## Put in class

In [11]:
def spaced_row(row, maxlen):
    """Format one table line.

    Each cell of `row` is converted with `str()` and centred in a field of
    `maxlen` characters; the fields are concatenated and a newline is added.
    """
    fields = [str(cell).center(maxlen) for cell in row]
    return ''.join(fields) + '\n'

In [12]:
class DF(DF):
    """Extends `DF` with row iteration and a plain-text table rendering."""

    def __iter__(self):
        """Iterate over rows as tuples: the index label, then one value per column."""
        collection = [self.index.values.data] + [self.data[col].values for col in self.columns]
        return zip(*collection)

    def __str__(self):
        """Return the frame as a text table with equally wide, centred columns.

        The column width is the longest rendered cell (header, index label or
        value) plus 2, with a minimum of 8. Measuring the values as well as
        the headers keeps long values from running into the next column.
        """
        header = [''] + list(self.columns)  # '' is the top-left corner
        rows = list(self)  # materialise: the rows are measured first, then rendered
        widest = max(len(str(cell)) for line in [header] + rows for cell in line)
        maxlen = max(widest + 2, 8)  # +2 makes for nice spaces
        ret = spaced_row(header, maxlen)
        for row in rows:
            ret += spaced_row(row, maxlen)
        return ret
    __repr__ = __str__

In [13]:
# Re-create df: the name `DF` now refers to the new subclass, while the earlier instance still belongs to the old class.
df = DF(data, index)

In [14]:
# print() now goes through DF.__str__ and shows the text table.
print(df)

          flights  visited  country 
   tlv       10       6      Israel 
   tbs       12       1     Georgia 
   jfk       50       2        US   
   ewr       40       0        US   



## Exercise: Make Nice Table for Notebook
The [`_repr_markdown_`](../Supplemental%20-%20Minimals.ipynb#Markdown-Repr) method is a special method for notebooks. If a class defines it, the notebook renders the markdown that it returns.

Creating a table in Markdown goes like this:
```
| Name  | Age | Gender |
|-------|-----|--------| # At least the `-`
| Dean  | 33  | M      |
| Inbal | 33  | F      |
| Idan  | 14  | M      |
```
| Name  | Age | Gender |
|-------|-----|--------|
| Dean  | 33  | M      |
| Inbal | 33  | F      |
| Idan  | 14  | M      |

In [15]:
# Ex
def markdown_row(row):
    """Format `row` as one line of a Markdown table.

    Every cell is converted with `str()` and prefixed with `'| '`; the line
    is closed with `'|'` and a newline. A literal `|` inside a cell is
    escaped as `\\|` so that it is not read as a column separator.
    """
    # BOE
    ret = ''
    for cell in row:
        ret += '| ' + str(cell).replace('|', '\\|')
    ret += '|\n'
    return ret
    #EOE

class DF(DF):
    """Extends `DF` with a Markdown table representation for notebook display."""

    def _repr_markdown_(self):
        """Return the frame as a Markdown table: header, `---` separator, one line per row."""
        ret = '|'  # Top-left corner: an empty header cell above the index labels
        ret += markdown_row(self.columns)
        # The --- row: one cell per column plus one for the index. It is sized
        # from the columns actually rendered, so it always matches the header.
        ret += markdown_row(['---'] * (len(self.columns) + 1))
        for row in self:
            ret += markdown_row(row)
        return ret

In [16]:
# Re-create df so that it is an instance of the class that has _repr_markdown_.
df = DF(data, index)
# A bare last expression lets the notebook display df through its rich repr.
df

|| flights| visited| country|
| ---| ---| ---| ---|
| tlv| 10| 6| Israel|
| tbs| 12| 1| Georgia|
| jfk| 50| 2| US|
| ewr| 40| 0| US|


# New column with `setitem`
By now we are used to `getitem`. `setitem` tells the object what to do when we set an item, as in `df['my_col'] = [1,2,3,4]`.

In [17]:
class DF(DF):
    """Extends `DF` with column assignment: `df[name] = values`."""

    def __setitem__(self, key, value):
        """Add the column `key`, or replace it when it already exists.

        Args:
            key: the column name.
            value: a sequence of values or an `Srs` with one entry per row.
                For an `Srs` only its `.values` are used; the column is
                stored as a new `Srs` on this frame's index.

        Raises:
            AssertionError: if `value` does not have `len(self)` entries.
        """
        assert len(value) == len(self), 'new series is not of same length as df'
        if isinstance(value, Srs):  # We did this in __init__. Remember?
            value = value.values
        # Build the series before touching `columns`, so a failure here does
        # not leave a column name registered without any data behind it.
        self.data[key] = Srs(value, name=key, index=self.index)
        if key not in self.columns:  # If it is in the columns, it just replaces the current
            self.columns.append(key)
            # Update this instance's shape, not that of some global `df`.
            self.shape = (len(self), len(self.columns))
    
        

In [18]:
# Re-create df so that it is an instance of the class that has __setitem__.
df = DF(data, index)
# Item assignment is routed to DF.__setitem__, which adds the new column.
df['mayor'] = ['Huldai','Kaladze', 'De Blasio', 'Baraka']
df

|| flights| visited| country| mayor|
| ---| ---| ---| ---| ---|
| tlv| 10| 6| Israel| Huldai|
| tbs| 12| 1| Georgia| Kaladze|
| jfk| 50| 2| US| De Blasio|
| ewr| 40| 0| US| Baraka|


# Back to Slicing

In [19]:
class DF(DF):
    """Extends `DF` with selection: `df[col]`, `df[rows, col]` and `df[rows, cols]`."""

    def __getitem__(self, items):
        """Select by column, or by rows and columns.

        Args:
            items: either a single column selector (all rows are taken), or a
                `(rowidx, colidx)` pair. `rowidx` is passed unchanged to each
                column's own `__getitem__`. `colidx` may be `:` (all columns),
                one column name, or a collection of column names.

        Returns:
            For a single column name: `self.data[colidx][rowidx]`.
            Otherwise a new `DF` holding the selected columns, indexed by the
            index of the first selected (row-sliced) column.

        Raises:
            KeyError: if `colidx` is one unknown column name, or a collection
                in which none of the names is a column of this frame.

        Note:
            With a collection, the columns are returned in this frame's own
            order, and names that are not columns are ignored.
        """
        if not isinstance(items, tuple):
            items = (items,)
        if len(items) == 1:
            # Only columns were given: bring all rows
            return self[:, items[0]]
        rowidx, colidx = items
        if colidx == slice(None):
            columns = self.columns
        elif isinstance(colidx, str):
            # Just return the series
            s = self.data[colidx]
            return s[rowidx]
        else:
            # Take relevant columns
            columns = [col for col in self.columns if col in colidx]
            if not columns:
                # Fail with a clear message instead of an IndexError on columns[0] below.
                raise KeyError(f"None of {colidx!r} are columns of this dataframe")
        data = {col: self.data[col][rowidx] for col in columns}
        index = data[columns[0]].index  # every column was sliced by the same rowidx
        ret = DF(data, index=index)
        return ret

In [20]:
# Re-create df so that it is an instance of the class that has __getitem__.
df = DF(data, index)

In [21]:
# A single string key selects that column for all rows and returns the series itself.
df['visited']

visited: 
 tlv    6   
 tbs    1   
 jfk    2   
 ewr    0   

In [22]:
# Row labels first, then `:` for every column; the result is a new DF.
df[['jfk','ewr'],:]

|| flights| visited| country|
| ---| ---| ---| ---|
| jfk| 50| 2| US|
| ewr| 40| 0| US|


In [24]:

# Both axes given as lists. Columns come back in the frame's own order, not the order requested.
df[['tlv','tbs'],['visited','flights']]

|| flights| visited|
| ---| ---| ---|
| tlv| 10| 6|
| tbs| 12| 1|


# attr access
In pandas we can access a column as if it were an attribute. We implement this with `__getattr__`.

In [25]:
class DF(DF):
    """Extends `DF` with attribute-style column access: `df.flights`."""

    def __getattr__(self, item):
        """Return the column named `item`.

        Python calls `__getattr__` only after normal attribute lookup has
        failed, so real attributes and methods take precedence over a column
        with the same name.

        Raises:
            AttributeError: if `item` is not a column of this frame.
        """
        # Read `columns` from the instance dict. Writing `self.columns` here
        # would call __getattr__ again whenever `columns` is not set yet
        # (for example on an instance created without __init__) and recurse
        # without end.
        columns = self.__dict__.get('columns', ())
        if item in columns:
            return self[:, item]
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {item!r}")

In [26]:
# Re-create df so that it is an instance of the class that has __getattr__.
df = DF(data,index)
# `flights` is not a real attribute, so the lookup falls through to __getattr__ and returns the column.
df.flights

flights: 
 tlv    10  
 tbs    12  
 jfk    50  
 ewr    40  