In [2]:
import pandas as pd
from pandas_datareader import data

 - Series - 1D Data structure
 - Dataframe - 2D Data structure
 - Panel - 3D Data structure
   - A container of dataframes
   - Need to know 3 things to get a specific value: the dataframe in the panel to get data from, then the row and the column of the point within that dataframe
 

In [4]:
# Ticker symbols to download daily price data for
companies = ["MSFT", "GOOG", "AAPL", "YHOO", "AMZN"]

# Requesting several tickers at once returns a panel object
# NOTE(review): the requested end date is 2016-12-31, but the displayed panel
#  stops at 2016-12-07 - presumably the day the notebook was run; confirm
p = data.DataReader(
    companies,
    data_source="google",
    start="2010-01-01",
    end="2016-12-31",
)
p

<class 'pandas.core.panel.Panel'>
Dimensions: 5 (items) x 1745 (major_axis) x 5 (minor_axis)
Items axis: Open to Volume
Major_axis axis: 2010-01-04 00:00:00 to 2016-12-07 00:00:00
Minor_axis axis: AAPL to YHOO

# Section 11; Part 147
The axes of a panel object

 - `Dimensions` - Tells how many dataframes are in the panel (5 in the panel above). It is **NOT** each of the 5 tech companies. Each dataframe has 1745 rows and 5 columns (the columns, in this case, are the companies).
 - `Items Axis` - Outer layer of panel
 - `Major Axis` - The rows in each dataframe
 - `Minor Axis` - The columns of each dataframe (in this case, the companies)
 - `axes` - All of the axes in a single result

In [5]:
p.items

Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object')

In [6]:
p.major_axis

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2016-11-23', '2016-11-25', '2016-11-28', '2016-11-29',
               '2016-11-30', '2016-12-01', '2016-12-02', '2016-12-05',
               '2016-12-06', '2016-12-07'],
              dtype='datetime64[ns]', name=u'Date', length=1745, freq=None)

In [7]:
p.minor_axis

Index([u'AAPL', u'AMZN', u'GOOG', u'MSFT', u'YHOO'], dtype='object')

In [8]:
p.axes

[Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object'),
 DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
                '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
                '2010-01-14', '2010-01-15',
                ...
                '2016-11-23', '2016-11-25', '2016-11-28', '2016-11-29',
                '2016-11-30', '2016-12-01', '2016-12-02', '2016-12-05',
                '2016-12-06', '2016-12-07'],
               dtype='datetime64[ns]', name=u'Date', length=1745, freq=None),
 Index([u'AAPL', u'AMZN', u'GOOG', u'MSFT', u'YHOO'], dtype='object')]

# Section 11; Part 148
Panel Attributes

In [10]:
# ndim - Number of dimensions (panel is 3)
p.ndim

3

In [11]:
# dtypes - Series holding the data type of each dataframe in the panel
#  (its index labels are the values of the .items attribute)
p.dtypes

Open      float64
High      float64
Low       float64
Close     float64
Volume    float64
dtype: object

In [13]:
# shape - Tuple of the measurements of the panel
# (number of dataframes, number of rows, number of columns)
#  i.e. the lengths of the items, major and minor axes, in that order
p.shape

(5, 1745, 5)

In [14]:
# size - Total number of values in the panel
p.size

43625

In [15]:
# Can prove the .size value by multiplying the three values in the .shape tuple
#  (dataframes * rows * columns)
p.shape[0] * p.shape[1] * p.shape[2]

43625

In [16]:
# values - Array of all the values in the panel
#  Nested structure: one outer block per dataframe (item), one inner row per
#  major axis label, one value per minor axis label
p.values

array([[[  3.04900000e+01,   1.36250000e+02,   3.13160000e+02,
           3.06200000e+01,   1.69400000e+01],
        [  3.06600000e+01,   1.33430000e+02,   3.13280000e+02,
           3.08500000e+01,   1.72200000e+01],
        [  3.06300000e+01,   1.34600000e+02,   3.12620000e+02,
           3.08800000e+01,   1.71700000e+01],
        ..., 
        [  1.10000000e+02,   7.45000000e+02,   7.57710000e+02,
           5.97000000e+01,   4.00200000e+01],
        [  1.09500000e+02,   7.63990000e+02,   7.64730000e+02,
           6.04300000e+01,   4.03100000e+01],
        [  1.09260000e+02,   7.64550000e+02,   7.61000000e+02,
           6.00100000e+01,   3.99800000e+01]],

       [[  3.06400000e+01,   1.36610000e+02,   3.14440000e+02,
           3.11000000e+01,   1.72000000e+01],
        [  3.08000000e+01,   1.35480000e+02,   3.13610000e+02,
           3.11000000e+01,   1.72300000e+01],
        [  3.07500000e+01,   1.34730000e+02,   3.12620000e+02,
           3.10800000e+01,   1.73000000e+01],
   

# Section 11; Part 149
Use bracket notation to extract a dataframe (for subsetting)

 - Brackets are used to extract the next smallest data structure
   - In Series, it extracts a value
   - In Dataframe, it extracts a column (which is a Series)
   - In Panel, it extracts a dataframe
 - To extract a dataframe, need to pass an item in the items axis via the bracket notation

In [17]:
# Get available values on the items axis
p.items

Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object')

In [18]:
# Pull the Open dataframe
p['Open']

Unnamed: 0_level_0,AAPL,AMZN,GOOG,MSFT,YHOO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.49,136.25,313.16,30.62,16.94
2010-01-05,30.66,133.43,313.28,30.85,17.22
2010-01-06,30.63,134.60,312.62,30.88,17.17
2010-01-07,30.25,132.01,304.40,30.63,16.81
2010-01-08,30.04,130.56,295.70,30.28,16.68
2010-01-11,30.40,132.62,301.93,30.71,16.77
2010-01-12,29.88,128.99,298.74,30.15,16.65
2010-01-13,29.70,127.90,287.96,30.26,16.88
2010-01-14,30.02,129.14,291.66,30.31,16.81
2010-01-15,30.13,129.18,295.50,31.08,17.25


In [19]:
# Pull the Volume dataframe
p['Volume']

Unnamed: 0_level_0,AAPL,AMZN,GOOG,MSFT,YHOO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,123432050.0,7600543.0,,38414185.0,16588957.0
2010-01-05,150476004.0,8856456.0,,49758862.0,11718126.0
2010-01-06,138039594.0,7180977.0,,58182332.0,16421960.0
2010-01-07,119282324.0,11030124.0,,50564285.0,31816301.0
2010-01-08,111969081.0,9833829.0,,51201289.0,15471074.0
2010-01-11,115557365.0,8786668.0,,68754648.0,16185267.0
2010-01-12,148614774.0,9098190.0,,65913228.0,15674461.0
2010-01-13,151472335.0,10727856.0,,51863463.0,16961731.0
2010-01-14,108288411.0,9788435.0,,63244767.0,16717061.0
2010-01-15,148584065.0,15382763.0,,79915648.0,18415868.0


Alternate syntax is to use the dot notation, but it has the same drawbacks as other places it is available (namely, spaces in the items will break it).

In [20]:
p.Volume

Unnamed: 0_level_0,AAPL,AMZN,GOOG,MSFT,YHOO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,123432050.0,7600543.0,,38414185.0,16588957.0
2010-01-05,150476004.0,8856456.0,,49758862.0,11718126.0
2010-01-06,138039594.0,7180977.0,,58182332.0,16421960.0
2010-01-07,119282324.0,11030124.0,,50564285.0,31816301.0
2010-01-08,111969081.0,9833829.0,,51201289.0,15471074.0
2010-01-11,115557365.0,8786668.0,,68754648.0,16185267.0
2010-01-12,148614774.0,9098190.0,,65913228.0,15674461.0
2010-01-13,151472335.0,10727856.0,,51863463.0,16961731.0
2010-01-14,108288411.0,9788435.0,,63244767.0,16717061.0
2010-01-15,148584065.0,15382763.0,,79915648.0,18415868.0


# Section 11; Part 150
Extracting with `loc`, `iloc` and `ix` Methods

In [21]:
# Extract using index labels (loc)
# Extracting close dataframe
p.loc["Close"]

Unnamed: 0_level_0,AAPL,AMZN,GOOG,MSFT,YHOO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.57,133.90,313.06,30.95,17.10
2010-01-05,30.63,134.69,311.68,30.96,17.23
2010-01-06,30.14,132.25,303.83,30.77,17.17
2010-01-07,30.08,130.00,296.75,30.45,16.70
2010-01-08,30.28,133.52,300.71,30.66,16.70
2010-01-11,30.02,130.31,300.25,30.27,16.74
2010-01-12,29.67,127.35,294.94,30.07,16.68
2010-01-13,30.09,129.11,293.25,30.35,16.90
2010-01-14,29.92,127.35,294.63,30.96,17.12
2010-01-15,29.42,127.14,289.71,30.86,16.82


In [22]:
# Get Close values for a specific date
p.loc["Close", '2010-01-04']

AAPL     30.57
AMZN    133.90
GOOG    313.06
MSFT     30.95
YHOO     17.10
Name: 2010-01-04 00:00:00, dtype: float64

In [23]:
# Get close value for MSFT on 2010-01-04
p.loc["Close", '2010-01-04', "MSFT"]

30.949999999999999

In [24]:
# Extract using index position (iloc)
#  First list the items axis to see which position each dataframe occupies
p.items

Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object')

In [25]:
# Get the Close dataframe (index position 3 on the items axis)
p.iloc[3]

Unnamed: 0_level_0,AAPL,AMZN,GOOG,MSFT,YHOO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.57,133.90,313.06,30.95,17.10
2010-01-05,30.63,134.69,311.68,30.96,17.23
2010-01-06,30.14,132.25,303.83,30.77,17.17
2010-01-07,30.08,130.00,296.75,30.45,16.70
2010-01-08,30.28,133.52,300.71,30.66,16.70
2010-01-11,30.02,130.31,300.25,30.27,16.74
2010-01-12,29.67,127.35,294.94,30.07,16.68
2010-01-13,30.09,129.11,293.25,30.35,16.90
2010-01-14,29.92,127.35,294.63,30.96,17.12
2010-01-15,29.42,127.14,289.71,30.86,16.82


In [26]:
# Get the 2010-01-04 row (index position 0 on the major axis) of the Close dataframe
p.iloc[3, 0]

AAPL     30.57
AMZN    133.90
GOOG    313.06
MSFT     30.95
YHOO     17.10
Name: 2010-01-04 00:00:00, dtype: float64

In [27]:
# Get the Close value (items position 3) on 2010-01-04 (major axis position 0)
#  for MSFT (minor axis position 3)
p.iloc[3, 0, 3]

30.949999999999999

In [28]:
# Mix/match position and labels using .ix
#  Here: label "Close" (items axis), position 0 (major axis, i.e. 2010-01-04), label "GOOG" (minor axis)
# NOTE(review): .ix is deprecated in later pandas releases in favour of .loc/.iloc - confirm against the pandas version in use
p.ix["Close", 0, "GOOG"]

313.06

# Section 11; Part 151
Convert panel into a multiindex dataframe (and reverse)

In [31]:
# Convert panel to multi-index dataframe
df = p.to_frame()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Date,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,AAPL,30.49,30.64,30.34,30.57,123432050.0
2010-01-04,AMZN,136.25,136.61,133.14,133.9,7600543.0
2010-01-04,MSFT,30.62,31.1,30.59,30.95,38414185.0
2010-01-04,YHOO,16.94,17.2,16.88,17.1,16588957.0
2010-01-05,AAPL,30.66,30.8,30.46,30.63,150476004.0


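 - Items axis became column headers
 - Major axis became outer index
 - Minor axis became inner index
 - By default (`filter_observations=True`), date/company pairs with a missing value in any item are dropped, which is why GOOG (no Volume on 2010-01-04) does not appear in the rows above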

In [33]:
# Convert multi-index dataframe to a panel
p2 = df.to_panel()
p2

<class 'pandas.core.panel.Panel'>
Dimensions: 5 (items) x 1745 (major_axis) x 5 (minor_axis)
Items axis: Open to Volume
Major_axis axis: 2010-01-04 00:00:00 to 2016-12-07 00:00:00
Minor_axis axis: AAPL to YHOO

# Section 11; Part 152
The `major_xs()` method

 - Will remove the major_axis and return a dataframe for that specific major axis value

In [34]:
# Have the major_axis *attribute* - Returns values that are the row labels in the panel
p.major_axis

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2016-11-23', '2016-11-25', '2016-11-28', '2016-11-29',
               '2016-11-30', '2016-12-01', '2016-12-02', '2016-12-05',
               '2016-12-06', '2016-12-07'],
              dtype='datetime64[ns]', name=u'Date', length=1745, freq=None)

In [36]:
# major_xs() is a *method* - Returns a dataframe where the minor axis is the Index and the Items axis are the columns

#  Get data for 2010-01-04
p.major_xs("2010-01-04")

Unnamed: 0,Open,High,Low,Close,Volume
AAPL,30.49,30.64,30.34,30.57,123432050.0
AMZN,136.25,136.61,133.14,133.9,7600543.0
GOOG,313.16,314.44,311.81,313.06,
MSFT,30.62,31.1,30.59,30.95,38414185.0
YHOO,16.94,17.2,16.88,17.1,16588957.0


# Section 11; Part 153
The `minor_xs()` method

 - Remove the minor_axis and return a dataframe for that specific minor axis value

In [37]:
p.minor_axis

Index([u'AAPL', u'AMZN', u'GOOG', u'MSFT', u'YHOO'], dtype='object')

In [39]:
# Get all of AAPL's stock values
p.minor_xs('AAPL')

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.49,30.64,30.34,30.57,123432050.0
2010-01-05,30.66,30.80,30.46,30.63,150476004.0
2010-01-06,30.63,30.75,30.11,30.14,138039594.0
2010-01-07,30.25,30.29,29.86,30.08,119282324.0
2010-01-08,30.04,30.29,29.87,30.28,111969081.0
2010-01-11,30.40,30.43,29.78,30.02,115557365.0
2010-01-12,29.88,29.97,29.49,29.67,148614774.0
2010-01-13,29.70,30.13,29.16,30.09,151472335.0
2010-01-14,30.02,30.07,29.86,29.92,108288411.0
2010-01-15,30.13,30.23,29.41,29.42,148584065.0


# Section 11; Part 154
Transpose a panel with `.transpose()`

 - Swap the axes that make up a panel object

In [40]:
p.axes

[Index([u'Open', u'High', u'Low', u'Close', u'Volume'], dtype='object'),
 DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
                '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
                '2010-01-14', '2010-01-15',
                ...
                '2016-11-23', '2016-11-25', '2016-11-28', '2016-11-29',
                '2016-11-30', '2016-12-01', '2016-12-02', '2016-12-05',
                '2016-12-06', '2016-12-07'],
               dtype='datetime64[ns]', name=u'Date', length=1745, freq=None),
 Index([u'AAPL', u'AMZN', u'GOOG', u'MSFT', u'YHOO'], dtype='object')]

In [42]:
# Exchange the stock information (Open/High/Low/Close/Volume) with the company
#   information, i.e. swap the items axis and the minor axis.
# transpose() is given the axes of the new panel, in the order we want them,
#   each identified by its current index position:
#     0 - Items
#     1 - Major
#     2 - Minor
# NOTE: this rebinds p2, which previously held the result of df.to_panel()
p2 = p.transpose(2, 1, 0)
p2

<class 'pandas.core.panel.Panel'>
Dimensions: 5 (items) x 1745 (major_axis) x 5 (minor_axis)
Items axis: AAPL to YHOO
Major_axis axis: 2010-01-04 00:00:00 to 2016-12-07 00:00:00
Minor_axis axis: Open to Volume

In [43]:
# Now can get company information like this, instead of using the .minor_xs() method
p2['AAPL']

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.49,30.64,30.34,30.57,123432050.0
2010-01-05,30.66,30.80,30.46,30.63,150476004.0
2010-01-06,30.63,30.75,30.11,30.14,138039594.0
2010-01-07,30.25,30.29,29.86,30.08,119282324.0
2010-01-08,30.04,30.29,29.87,30.28,111969081.0
2010-01-11,30.40,30.43,29.78,30.02,115557365.0
2010-01-12,29.88,29.97,29.49,29.67,148614774.0
2010-01-13,29.70,30.13,29.16,30.09,151472335.0
2010-01-14,30.02,30.07,29.86,29.92,108288411.0
2010-01-15,30.13,30.23,29.41,29.42,148584065.0


In [44]:
# Using major_xs() and minor_xs() still works
# Get information for all on a specific day
p2.major_xs('2010-01-04')

Unnamed: 0,AAPL,AMZN,GOOG,MSFT,YHOO
Open,30.49,136.25,313.16,30.62,16.94
High,30.64,136.61,314.44,31.1,17.2
Low,30.34,133.14,311.81,30.59,16.88
Close,30.57,133.9,313.06,30.95,17.1
Volume,123432000.0,7600543.0,,38414185.0,16588957.0


In [45]:
# Get all volumes
p2.minor_xs("Volume")

Unnamed: 0_level_0,AAPL,AMZN,GOOG,MSFT,YHOO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,123432050.0,7600543.0,,38414185.0,16588957.0
2010-01-05,150476004.0,8856456.0,,49758862.0,11718126.0
2010-01-06,138039594.0,7180977.0,,58182332.0,16421960.0
2010-01-07,119282324.0,11030124.0,,50564285.0,31816301.0
2010-01-08,111969081.0,9833829.0,,51201289.0,15471074.0
2010-01-11,115557365.0,8786668.0,,68754648.0,16185267.0
2010-01-12,148614774.0,9098190.0,,65913228.0,15674461.0
2010-01-13,151472335.0,10727856.0,,51863463.0,16961731.0
2010-01-14,108288411.0,9788435.0,,63244767.0,16717061.0
2010-01-15,148584065.0,15382763.0,,79915648.0,18415868.0


# Section 11; Part 155
The `swapaxes()` Method

 - Alternative to .transpose() without needing to know the index position of each axis

In [47]:
# Swap Stock information (Open/Close/High/Low/Volume) with the Company information
p3 = p.swapaxes("items", "minor")
p3

<class 'pandas.core.panel.Panel'>
Dimensions: 5 (items) x 1745 (major_axis) x 5 (minor_axis)
Items axis: AAPL to YHOO
Major_axis axis: 2010-01-04 00:00:00 to 2016-12-07 00:00:00
Minor_axis axis: Open to Volume

In [48]:
# Companies are now the items axis, so bracket notation pulls MSFT's dataframe directly
p3['MSFT']

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,30.62,31.10,30.59,30.95,38414185.0
2010-01-05,30.85,31.10,30.64,30.96,49758862.0
2010-01-06,30.88,31.08,30.52,30.77,58182332.0
2010-01-07,30.63,30.70,30.19,30.45,50564285.0
2010-01-08,30.28,30.88,30.24,30.66,51201289.0
2010-01-11,30.71,30.76,30.12,30.27,68754648.0
2010-01-12,30.15,30.40,29.91,30.07,65913228.0
2010-01-13,30.26,30.52,30.01,30.35,51863463.0
2010-01-14,30.31,31.10,30.26,30.96,63244767.0
2010-01-15,31.08,31.24,30.71,30.86,79915648.0
