# Data frame operations

In [1]:
#import libraries
import pandas as pd

In [32]:
#Read the gage records into a dataframe:
# - site_no is kept as text so its leading zero is not dropped
# - datetime is parsed into real timestamps
df = pd.read_csv(
    'GageData.csv',
    parse_dates=['datetime'],
    dtype={'site_no': 'str'},
)
df.dtypes

agency_cd               object
site_no                 object
datetime        datetime64[ns]
MeanFlow_cfs           float64
Confidence              object
dtype: object

## Subsetting data: rows

#### `iloc`
One way is to select rows using their integer position, i.e. their row number. *(Recall that Python indexing starts at zero, not 1!)*

In [36]:
#Show the first 4 rows (positions 0 thru 3; the stop value, 4, is not included)
df.iloc[0:4]

Unnamed: 0,agency_cd,site_no,datetime,MeanFlow_cfs,Confidence
0,USGS,2087500,1930-10-01,347.0,A
1,USGS,2087500,1930-10-02,173.0,A
2,USGS,2087500,1930-10-03,132.0,A
3,USGS,2087500,1930-10-04,125.0,A


In [39]:
#Show rows 100 thru 105; note the last number is not included in the slice
df.iloc[100:106]

Unnamed: 0,agency_cd,site_no,datetime,MeanFlow_cfs,Confidence
100,USGS,2087500,1931-01-09,860.0,A
101,USGS,2087500,1931-01-10,645.0,A
102,USGS,2087500,1931-01-11,595.0,A
103,USGS,2087500,1931-01-12,750.0,A
104,USGS,2087500,1931-01-13,1680.0,A
105,USGS,2087500,1931-01-14,1610.0,A


In [41]:
#Show the last 5 rows; a negative start counts back from the end of the dataframe
df.iloc[-5:]

Unnamed: 0,agency_cd,site_no,datetime,MeanFlow_cfs,Confidence
31745,USGS,2087500,2017-08-30,278.0,A
31746,USGS,2087500,2017-08-31,314.0,A
31747,USGS,2087500,2017-09-01,812.0,A
31748,USGS,2087500,2017-09-02,1770.0,A
31749,USGS,2087500,2017-09-03,672.0,A


---
#### `loc`
Another way to select rows is to use explicit index values. Let's first examine this using the auto-generated index created when we imported the CSV into a dataframe. Inspecting the dataframe's `index` attribute reveals that our initial index was assigned a sequential range of integers.

In [46]:
#What does our index look like? (index is an attribute, so no parentheses)
df.index

RangeIndex(start=0, stop=31750, step=1)

In [43]:
#Show the rows corresponding to index values 6 thru 10
#(a loc slice selects by label and includes the end label, so row 10 is returned)
df.loc[6:10]

Unnamed: 0,agency_cd,site_no,datetime,MeanFlow_cfs,Confidence
6,USGS,2087500,1930-10-07,87.0,A
7,USGS,2087500,1930-10-08,87.0,A
8,USGS,2087500,1930-10-09,118.0,A
9,USGS,2087500,1930-10-10,118.0,A
10,USGS,2087500,1930-10-11,118.0,A


Now, let's change our index to values in the `datetime` column.

In [47]:
#Change the index to be values in the datetime column and display them.
#The guard makes this cell safe to re-run: once 'datetime' has become the index
#it is no longer a column, and calling set_index('datetime') again would raise
#a KeyError.
if 'datetime' in df.columns:
    df = df.set_index('datetime')
df.index

DatetimeIndex(['1930-10-01', '1930-10-02', '1930-10-03', '1930-10-04',
               '1930-10-05', '1930-10-06', '1930-10-07', '1930-10-08',
               '1930-10-09', '1930-10-10',
               ...
               '2017-08-25', '2017-08-26', '2017-08-27', '2017-08-28',
               '2017-08-29', '2017-08-30', '2017-08-31', '2017-09-01',
               '2017-09-02', '2017-09-03'],
              dtype='datetime64[ns]', name='datetime', length=31750, freq=None)

In [50]:
#Show the row with the index matching Jan 1st, 1975
#(here the one matching row comes back as a Series named after its timestamp)
df.loc['1975-01-01']

agency_cd           USGS
site_no         02087500
MeanFlow_cfs        1170
Confidence             A
Name: 1975-01-01 00:00:00, dtype: object

In [55]:
#Show the rows for September 10 thru 15, 1998 (both end dates are included)
df.loc['1998-09-10':'1998-09-15']

Unnamed: 0_level_0,agency_cd,site_no,MeanFlow_cfs,Confidence
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-09-10,USGS,2087500,364.0,A
1998-09-11,USGS,2087500,334.0,A
1998-09-12,USGS,2087500,308.0,A
1998-09-13,USGS,2087500,298.0,A
1998-09-14,USGS,2087500,288.0,A
1998-09-15,USGS,2087500,285.0,A


### Querying data
Moving away from indices, we can query records matching criteria

In [60]:
#Select rows where the mean flow was less than 50 cfs;
#the condition is passed as a string that refers to the column by name
df.query('MeanFlow_cfs < 50')

Unnamed: 0_level_0,agency_cd,site_no,MeanFlow_cfs,Confidence
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1932-09-11,USGS,2087500,45.0,A
1932-09-15,USGS,2087500,48.0,A
1932-09-20,USGS,2087500,46.0,A
1933-10-08,USGS,2087500,49.0,A
1933-10-09,USGS,2087500,48.0,A


### Using masks to query data
Masks are Boolean columns of data, meaning every value is either `True` or `False`. Indexing a dataframe with a mask keeps only the rows where the mask is `True`.

In [76]:
#Build a mask that is True for each row whose mean flow is below 53 cfs
maskTinyFlow = df['MeanFlow_cfs'] < 53

In [77]:
#Apply the mask: keep only the rows where maskTinyFlow is True
df.loc[maskTinyFlow]

Unnamed: 0_level_0,agency_cd,site_no,MeanFlow_cfs,Confidence
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1932-09-11,USGS,2087500,45.0,A
1932-09-12,USGS,2087500,50.0,A
1932-09-15,USGS,2087500,48.0,A
1932-09-20,USGS,2087500,46.0,A
1933-10-06,USGS,2087500,52.0,A
1933-10-07,USGS,2087500,52.0,A
1933-10-08,USGS,2087500,49.0,A
1933-10-09,USGS,2087500,48.0,A
1933-10-10,USGS,2087500,51.0,A
1954-10-11,USGS,2087500,51.0,A
