# Object creation

In [1]:
import pandas as pd
import numpy as np

##### Basic series; default integer index

In [2]:
# the np.nan entry makes pandas store the whole Series as float64
my_series = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
my_series

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

##### datetime index

In [3]:
# six consecutive days starting 2016-01-01; daily frequency spelled out explicitly
my_dates_index = pd.date_range(start='20160101', periods=6, freq='D')
my_dates_index

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

##### sample NumPy data

In [4]:
# np.arange already returns an ndarray, so wrapping it in np.array() only makes
# a needless copy; reshape lays the 24 integers out as 6 rows x 4 columns
sample_numpy_data = np.arange(24).reshape(6, 4)
sample_numpy_data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

##### sample data frame, with column headers; uses our dates_index

In [5]:
# rows are labelled by the dates built earlier, columns by the letters A-D
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=my_dates_index,
    columns=list('ABCD'),
)
sample_df

Unnamed: 0,A,B,C,D
2016-01-01,0,1,2,3
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11
2016-01-04,12,13,14,15
2016-01-05,16,17,18,19
2016-01-06,20,21,22,23


##### data frame from a Python dictionary

In [6]:
# each dict key becomes a column; scalar values are repeated on every row,
# while the length-4 containers supply one value per row (rows labelled 0..3)
df_from_dictionary = pd.DataFrame({
    'float': 1.,
    'time': pd.Timestamp('20160825'),
    'series': pd.Series(1, index=list(range(4)), dtype='float32'),
    'array': np.full(4, 3, dtype='int32'),
    'categories': pd.Categorical(["test", "train", "taxes", "tools"]),
    'dull': 'boring data',
})
df_from_dictionary

Unnamed: 0,array,categories,dull,float,series,time
0,3,test,boring data,1.0,1.0,2016-08-25
1,3,train,boring data,1.0,1.0,2016-08-25
2,3,taxes,boring data,1.0,1.0,2016-08-25
3,3,tools,boring data,1.0,1.0,2016-08-25


##### pandas retains data type for each column

In [7]:
# one dtype per column; the result is itself a Series (hence its own "dtype: object")
df_from_dictionary.dtypes

array                  int32
categories          category
dull                  object
float                float64
series               float32
time          datetime64[ns]
dtype: object

##### head and tail; default is 5 rows

In [8]:
# no argument given, so the first 5 rows are returned
sample_df.head()

Unnamed: 0,A,B,C,D
2016-01-01,0,1,2,3
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11
2016-01-04,12,13,14,15
2016-01-05,16,17,18,19


In [9]:
# the argument sets the row count: the last 2 rows
sample_df.tail(2)

Unnamed: 0,A,B,C,D
2016-01-05,16,17,18,19
2016-01-06,20,21,22,23


##### underlying data: values, index and columns

In [10]:
# the frame's data as a plain NumPy array, without row or column labels
sample_df.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [11]:
# row labels: the DatetimeIndex the frame was built with
sample_df.index

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# column labels, held in an Index object
sample_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

##### describe(): a quick statistical summary
- notice: integer data summarized with floating point numbers

In [13]:
# count, mean, std, min, quartiles and max for every column
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.483315,7.483315,7.483315,7.483315
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


##### control precision of floating point numbers

In [14]:
# display option: floats are rendered with 2 decimal places from here on
pd.set_option('display.precision', 2)

In [15]:
# same summary again; std now prints as 7.48 instead of 7.483315
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.48,7.48,7.48,7.48
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


##### transpose rows and columns

In [16]:
# .T swaps the axes: columns A-D become rows, the dates become columns
sample_df.T

Unnamed: 0,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00
A,0,4,8,12,16,20
B,1,5,9,13,17,21
C,2,6,10,14,18,22
D,3,7,11,15,19,23


##### sort by axis

In [17]:
# axis=1 sorts the column labels; ascending=False yields D, C, B, A
sample_df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-01-01,3,2,1,0
2016-01-02,7,6,5,4
2016-01-03,11,10,9,8
2016-01-04,15,14,13,12
2016-01-05,19,18,17,16
2016-01-06,23,22,21,20


##### sort by data within a column (our data was already sorted in ascending order, so we sort descending to see a change)

In [18]:
# order the rows by the values in column B, largest first
sample_df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2016-01-06,20,21,22,23
2016-01-05,16,17,18,19
2016-01-04,12,13,14,15
2016-01-03,8,9,10,11
2016-01-02,4,5,6,7
2016-01-01,0,1,2,3


# Selection

In [19]:
import pandas as pd
import numpy as np

In [20]:
# rebuild the 6x4 sample frame used throughout the Selection section;
# np.arange already returns an ndarray, so no extra np.array() copy is needed
sample_numpy_data = np.arange(24).reshape(6, 4)
dates_index = pd.date_range('20160101', periods=6)
sample_df = pd.DataFrame(sample_numpy_data, index=dates_index, columns=list('ABCD'))
sample_df

Unnamed: 0,A,B,C,D
2016-01-01,0,1,2,3
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11
2016-01-04,12,13,14,15
2016-01-05,16,17,18,19
2016-01-06,20,21,22,23


##### selection using column name

In [21]:
# a single column label returns that column as a Series
sample_df['C']

2016-01-01     2
2016-01-02     6
2016-01-03    10
2016-01-04    14
2016-01-05    18
2016-01-06    22
Freq: D, Name: C, dtype: int64

##### selection using slice
- remember: up to, but not including second index

In [22]:
# an integer slice selects rows by position: rows 1, 2 and 3
sample_df[1:4]

Unnamed: 0,A,B,C,D
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11
2016-01-04,12,13,14,15


##### selection using date time index
- note: last index is included

In [24]:
# slicing with date strings selects rows by label; the end label is part of the result
sample_df['2016-01-01':'2016-01-04']

Unnamed: 0,A,B,C,D
2016-01-01,0,1,2,3
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11
2016-01-04,12,13,14,15


### Selection by label

In [25]:
# dates_index[1:3] holds 2016-01-02 and 2016-01-03; .loc returns the rows carrying those labels
sample_df.loc[dates_index[1:3]]

Unnamed: 0,A,B,C,D
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11


##### Selecting using multi-axis by label

In [27]:
# ':' keeps every row; the list picks columns A and B
sample_df.loc[:,['A','B']]

Unnamed: 0,A,B
2016-01-01,0,1
2016-01-02,4,5
2016-01-03,8,9
2016-01-04,12,13
2016-01-05,16,17
2016-01-06,20,21


##### Label slicing, both endpoints are included

In [28]:
# row label slice (both ends included) combined with a list of column labels
sample_df.loc['2016-01-01':'2016-01-03',['A','B']]

Unnamed: 0,A,B
2016-01-01,0,1
2016-01-02,4,5
2016-01-03,8,9


##### Reduce number of dimensions for returned object
- notice that the result keeps the requested order: 'D' comes before 'B'

In [29]:
# one row label plus a list of columns returns a Series instead of a DataFrame
sample_df.loc['2016-01-03',['D','B']]

D    11
B     9
Name: 2016-01-03 00:00:00, dtype: int64

##### using the result: take its first element and multiply it by 4

In [30]:
# .iloc[0] takes the first element by position ('D' -> 11). A bare [0] on this
# Series, whose labels are 'D' and 'B', only works through the integer-as-position
# fallback that newer pandas versions deprecate.
sample_df.loc['2016-01-03', ['D', 'B']].iloc[0] * 4

44

##### select a scalar

In [31]:
# one row label plus one column label returns a single value
sample_df.loc[dates_index[2], 'C']

10

### Selection by Position

In [32]:
# plain NumPy indexing for comparison: the row at position 3
sample_numpy_data[3]

array([12, 13, 14, 15])

In [33]:
# the same row selected by position from the frame, returned as a labelled Series
sample_df.iloc[3]

A    12
B    13
C    14
D    15
Name: 2016-01-04 00:00:00, dtype: int64

##### integer slices

In [34]:
# rows at positions 1-2 and columns at positions 2-3; slice end positions are excluded
sample_df.iloc[1:3, 2:4]

Unnamed: 0,C,D
2016-01-02,6,7
2016-01-03,10,11


##### lists of integers

In [35]:
# explicit position lists: rows 0, 1, 3 and columns 0, 2 (A and C)
sample_df.iloc[[0,1,3], [0,2]]

Unnamed: 0,A,C
2016-01-01,0,2
2016-01-02,4,6
2016-01-04,12,14


##### slicing rows explicitly
the bare `:` in the column position selects all columns

In [36]:
# rows at positions 1 and 2; ':' keeps every column
sample_df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2016-01-02,4,5,6,7
2016-01-03,8,9,10,11


##### slicing columns explicitly
the bare `:` in the row position selects all rows

In [37]:
# ':' keeps every row; columns at positions 1 and 2 (B and C)
sample_df.iloc[:, 1:3]

Unnamed: 0,B,C
2016-01-01,1,2
2016-01-02,5,6
2016-01-03,9,10
2016-01-04,13,14
2016-01-05,17,18
2016-01-06,21,22


### Boolean Indexing
##### test based upon one column's data

In [38]:
# element-wise comparison on column C gives a boolean Series, one flag per row
sample_df['C'] >= 14

2016-01-01    False
2016-01-02    False
2016-01-03    False
2016-01-04     True
2016-01-05     True
2016-01-06     True
Freq: D, Name: C, dtype: bool

##### test based upon entire data set

In [39]:
# cells that fail the test come back as NaN; the remaining values are shown as floats
sample_df[sample_df >= 11]

Unnamed: 0,A,B,C,D
2016-01-01,,,,
2016-01-02,,,,
2016-01-03,,,,11.0
2016-01-04,12.0,13.0,14.0,15.0
2016-01-05,16.0,17.0,18.0,19.0
2016-01-06,20.0,21.0,22.0,23.0


##### isin() method
Returns a boolean Series showing whether each element in the Series is exactly contained in the passed sequence of values.

In [40]:
# assign() returns a new frame with the extra column, leaving sample_df untouched
sample_df_2 = sample_df.assign(
    Fruits=['apple', 'orange', 'banana', 'strawberry', 'blueberry', 'pineapple']
)
sample_df_2

Unnamed: 0,A,B,C,D,Fruits
2016-01-01,0,1,2,3,apple
2016-01-02,4,5,6,7,orange
2016-01-03,8,9,10,11,banana
2016-01-04,12,13,14,15,strawberry
2016-01-05,16,17,18,19,blueberry
2016-01-06,20,21,22,23,pineapple


Select rows where the 'Fruits' column is either 'banana' or 'pineapple'. Notice that 'smoothy', which does not appear in the column, simply matches nothing.

In [41]:
# isin() builds a boolean mask over the Fruits column, which then filters the rows
sample_df_2[sample_df_2['Fruits'].isin(['banana','pineapple', 'smoothy'])]

Unnamed: 0,A,B,C,D,Fruits
2016-01-03,8,9,10,11,banana
2016-01-06,20,21,22,23,pineapple


# Assignment statements

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
# sample frame for the Assignment section: 6 days starting 2016-07-01, columns A-D;
# np.arange already returns an ndarray, so no extra np.array() copy is needed
starting_date = '20160701'
sample_numpy_data = np.arange(24).reshape(6, 4)
dates_index = pd.date_range(starting_date, periods=6)
sample_df = pd.DataFrame(sample_numpy_data, index=dates_index, columns=list('ABCD'))
sample_df

Unnamed: 0,A,B,C,D
2016-07-01,0,1,2,3
2016-07-02,4,5,6,7
2016-07-03,8,9,10,11
2016-07-04,12,13,14,15
2016-07-05,16,17,18,19
2016-07-06,20,21,22,23


In [44]:
# work on a copy so the later assignments do not modify sample_df
sample_df_2 = sample_df.copy()
# a plain list is assigned row by row, so it has one entry per row (6 here)
sample_df_2['Fruits'] = ['apple', 'orange','banana','strawberry','blueberry','pineapple']
sample_df_2

Unnamed: 0,A,B,C,D,Fruits
2016-07-01,0,1,2,3,apple
2016-07-02,4,5,6,7,orange
2016-07-03,8,9,10,11,banana
2016-07-04,12,13,14,15,strawberry
2016-07-05,16,17,18,19,blueberry
2016-07-06,20,21,22,23,pineapple


Assigning a Series as a new column automatically aligns its values with the DataFrame's index.

In [46]:
# the same six dates the frame uses as its index
pd.date_range(starting_date, periods=6)

DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
               '2016-07-05', '2016-07-06'],
              dtype='datetime64[ns]', freq='D')

In [47]:
# a Series of 1..6 labelled with those dates
pd.Series([1,2,3,4,5,6], index=pd.date_range(starting_date, periods=6))

2016-07-01    1
2016-07-02    2
2016-07-03    3
2016-07-04    4
2016-07-05    5
2016-07-06    6
Freq: D, dtype: int64

In [48]:
# the Series carries the same six dates as the frame's index
sample_series = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range(starting_date, periods=6),
)
# the arithmetic is applied element-wise before the result is stored as a column
sample_df_2['Extra Data'] = 3 * sample_series + 1
sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data
2016-07-01,0,1,2,3,apple,4
2016-07-02,4,5,6,7,orange,7
2016-07-03,8,9,10,11,banana,10
2016-07-04,12,13,14,15,strawberry,13
2016-07-05,16,17,18,19,blueberry,16
2016-07-06,20,21,22,23,pineapple,19


##### Setting values by label

In [49]:
# .at addresses one cell by row label and column label; row 2016-07-04 becomes 'pear'
sample_df_2.at[dates_index[3],'Fruits'] = 'pear'
sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data
2016-07-01,0,1,2,3,apple,4
2016-07-02,4,5,6,7,orange,7
2016-07-03,8,9,10,11,banana,10
2016-07-04,12,13,14,15,pear,13
2016-07-05,16,17,18,19,blueberry,16
2016-07-06,20,21,22,23,pineapple,19


##### Setting values by position
`iat` provides integer-based (positional) lookups

In [51]:
# row position 3, column position 2: the cell in column C on 2016-07-04
sample_df_2.iat[3,2] = 4444
sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data
2016-07-01,0,1,2,3,apple,4
2016-07-02,4,5,6,7,orange,7
2016-07-03,8,9,10,11,banana,10
2016-07-04,12,13,4444,15,pear,13
2016-07-05,16,17,18,19,blueberry,16
2016-07-06,20,21,22,23,pineapple,19


##### Setting by assigning with a numpy array

In [52]:
# one value per row of sample_df_2: 7, 107, 207, ...
# np.arange already returns an ndarray, so no extra np.array() copy is needed
second_numpy_array = np.arange(len(sample_df_2)) * 100 + 7
second_numpy_array

array([  7, 107, 207, 307, 407, 507])

In [53]:
# the array has one entry per row and is stored as the new column G
sample_df_2['G'] = second_numpy_array
sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data,G
2016-07-01,0,1,2,3,apple,4,7
2016-07-02,4,5,6,7,orange,7,107
2016-07-03,8,9,10,11,banana,10,207
2016-07-04,12,13,4444,15,pear,13,307
2016-07-05,16,17,18,19,blueberry,16,407
2016-07-06,20,21,22,23,pineapple,19,507
