In [1]:
import pandas as pd
import numpy as np
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
df=pd.read_csv(url)
city_mpg = df.city08
highway_mpg = df.highway08
make = df.make
url = 'https://github.com/mattharrison/datasets/raw/master/data/alta-noaa-1980-2019.csv'
alta_df = pd.read_csv(url)
dates = pd.to_datetime(alta_df.DATE)
snow = (alta_df.SNOW.rename(dates))

  df=pd.read_csv(url)


# Dataframes
### 16.1 Database and Spreadsheet Analogues
If you think of a datafframe as row-oriented, the interface will feel wrong.
Many tabular data structures are row-orientated. Perhaps this is due to spreadsheets and CSV files dealt with on a row by row basis. 

A `DataFrame` is often used for analytical purposes and is better understood when thought of as column-oriented, where each column is a `Series`. 

### 16.2 A simple Python Version
To create a tabular Python data structure that is column-oriented. It has a 0-based integer index, but not required, the index can also be string based:

In [4]:
df = {
    'index':[0,1,2],
    'cols':[
        {'name':'growth',
         'data':[.5,.7,1.2]},
         {'name':'Name',
          'data':['Paul','George','Ringo']}
    ]
}
df

{'index': [0, 1, 2],
 'cols': [{'name': 'growth', 'data': [0.5, 0.7, 1.2]},
  {'name': 'Name', 'data': ['Paul', 'George', 'Ringo']}]}

In [6]:
def get_row(df, idx):
    results = []
    value_idx = df['index'].index(idx)
    for col in df['cols']:
        results.append(col['data'][value_idx])
    return results
get_row(df, 1)

[0.7, 'George']

In [10]:
def get_col(df, name):
    for col in df['cols']:
        if col['name'] ==name:
            return col['data']
get_col(df, 'growth'), get_col(df, 'Name')

([0.5, 0.7, 1.2], ['Paul', 'George', 'Ringo'])

### 16.3 Dataframes
With pandas:


In [11]:
df = pd.DataFrame({
    'growth':[.5, .7, 1.2],
    'Name': ['Paul', 'George', 'Ringo']
})
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


In [12]:
df.iloc[2]

growth      1.2
Name      Ringo
Name: 2, dtype: object

In [14]:
df['Name']

0      Paul
1    George
2     Ringo
Name: Name, dtype: object

### 16.4 Construction

dataframes can be created from many types of input
- columns (dicts of lists)
- rows (list of dicts)
- CSV files (pd.read_csv)
- NumPy ndarrays
- other: SQL, HDF5, arrow, etc.

last df was made from columns, this one is made from rows:

In [16]:
pd.DataFrame([
    {'growth':.5, 'Name':'Paul'},
    {'growth':.7, 'Name':'George'},
    {'growth':1.2, 'Name':'Ringo'},
])

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


In [31]:
from io import StringIO
csv_file=StringIO("""growth,Name
.5,Paul
.7,George
1.2,Ringo""")
df=pd.read_csv(csv_file)

In [20]:
np.random.seed(42)
pd.DataFrame(np.random.randn(10,3),
             columns=['a', 'b', 'c'])

Unnamed: 0,a,b,c
0,0.496714,-0.138264,0.647689
1,1.52303,-0.234153,-0.234137
2,1.579213,0.767435,-0.469474
3,0.54256,-0.463418,-0.46573
4,0.241962,-1.91328,-1.724918
5,-0.562288,-1.012831,0.314247
6,-0.908024,-1.412304,1.465649
7,-0.225776,0.067528,-1.424748
8,-0.544383,0.110923,-1.150994
9,0.375698,-0.600639,-0.291694


### 16.5 Dataframe Axis
- dataframe has two axis
- referred as axis 0 and 1, or as the "index"  (or 'rows') and the "columns" axis:


In [22]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['growth', 'Name'], dtype='object')]

In [33]:
df.sum(axis=0)

growth                2.4
Name      PaulGeorgeRingo
dtype: object

In [32]:
df.sum(axis='columns')

TypeError: unsupported operand type(s) for +: 'float' and 'str'