In [None]:
"""
Introduction To Pandas





Main objects: Series & DataFrame

Description: like relational database tables, but in Python

Details:
    1) built on top of NumPy
    2) The data does not all need to be of the same type!
"""

In [None]:
"""
Outline




A) Series

B) DataFrame

C) Upload Data

D) View Data

E) Indexing

F) Iterating

G) Reshaping & Sorting

H) Join & Group
"""

In [1]:
import pandas as pd
import numpy as np

In [15]:
"""

A) Series

"""

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int32

In [None]:
# A Series can be built from a list, tuple, or array; when no index is given,
# pandas generates the integer indices automatically.
future_array1 = list(range(1, 7))  # 1-dimensional data: 1 through 6
array1 = np.asarray(future_array1)
s = pd.Series(data=array1)
s

In [None]:
"""

B) DataFrame

"""

In [14]:
# DataFrame: a 2-dimensional object that can contain more than one Series
# (each column is a Series sharing the same index).
# Seed NumPy's global random generator so that re-running the notebook from a
# fresh kernel reproduces the same example values.
np.random.seed(0)
dates = pd.date_range("20160101", periods=6)  # six consecutive daily timestamps
data = np.random.random((6, 3))               # 6 rows x 3 columns of floats in [0, 1)
df = pd.DataFrame(data, index=dates, columns=['Column1', 'Column2', 'Column3'])
df

Unnamed: 0,Column1,Column2,Column3
2016-01-01,0.877179,0.138904,0.535077
2016-01-02,0.163296,0.67622,0.746911
2016-01-03,0.952653,0.986626,0.262589
2016-01-04,0.677718,0.308529,0.014202
2016-01-05,0.624073,0.500785,0.72094
2016-01-06,0.642954,0.85439,0.124858


In [20]:
# Unlike a NumPy array, a DataFrame can hold a different data type in each
# column: here a boolean column is added alongside the float columns.
flags = [True, False, False, False, True, True]
df['Column4'] = pd.Series(flags, index=dates)  # aligned to the same date index
df

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-01,0.877179,0.138904,0.535077,True
2016-01-02,0.163296,0.67622,0.746911,False
2016-01-03,0.952653,0.986626,0.262589,False
2016-01-04,0.677718,0.308529,0.014202,False
2016-01-05,0.624073,0.500785,0.72094,True
2016-01-06,0.642954,0.85439,0.124858,True


In [None]:
"""

C) Upload Data

"""

In [None]:
# There are many ways to load data, but let's focus on CSV files for now -- it's easy!
# "filename.csv" is a placeholder path; index_col=0 uses the first column of the
# file as the row index.
uploaded_data = pd.read_csv("filename.csv", index_col=0)

In [None]:
"""

D) View Data

"""

In [22]:
# View the first rows of the data set
df.head(3) # <- the argument is optional; head() defaults to 5 rows

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-01,0.877179,0.138904,0.535077,True
2016-01-02,0.163296,0.67622,0.746911,False
2016-01-03,0.952653,0.986626,0.262589,False


In [23]:
# View the last rows of the data set
df.tail(3) # <- the argument is optional; tail() defaults to 5 rows

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-04,0.677718,0.308529,0.014202,False
2016-01-05,0.624073,0.500785,0.72094,True
2016-01-06,0.642954,0.85439,0.124858,True


In [25]:
# View each piece of the DataFrame: row labels, column labels, and raw values.
# print() is called as a function with a single argument so the cell runs
# unchanged on both Python 2 and Python 3.
print(df.index)
print("")
print(df.columns)
print("")
print(df.values)

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

Index([u'Column1', u'Column2', u'Column3', u'Column4'], dtype='object')

[[0.8771792804941592 0.13890420834456496 0.5350768696858543 True]
 [0.16329580261900212 0.6762197129011865 0.7469106669341132 False]
 [0.9526533619038046 0.9866263135399093 0.26258877341837383 False]
 [0.677717788786839 0.308528960913394 0.014202026384788291 False]
 [0.6240733160160035 0.5007847856873302 0.7209397721805229 True]
 [0.6429538580198717 0.854390470136722 0.12485815605528039 True]]


In [28]:
# View descriptive statistics (count, mean, std, min, quartiles, max).
# Only the numeric columns are summarised; the boolean Column4 is left out.
# print() is called as a function so the cell runs on Python 2 and Python 3.
print(df.describe())

        Column1   Column2   Column3
count  6.000000  6.000000  6.000000
mean   0.656312  0.577576  0.400763
std    0.276167  0.323850  0.311454
min    0.163296  0.138904  0.014202
25%    0.628793  0.356593  0.159291
50%    0.660336  0.588502  0.398833
75%    0.827314  0.809848  0.674474
max    0.952653  0.986626  0.746911


In [None]:
"""

E) Indexing

"""

In [34]:
# Select a single column by name (the result is a Series)
df['Column2']

2016-01-01    0.138904
2016-01-02    0.676220
2016-01-03    0.986626
2016-01-04    0.308529
2016-01-05    0.500785
2016-01-06    0.854390
Freq: D, Name: Column2, dtype: float64

In [38]:
# Row indices: slicing inside [] selects rows (here the first two)
df[0:2] # or, by label: df['20160101':'20160102']

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-01,0.877179,0.138904,0.535077,True
2016-01-02,0.163296,0.67622,0.746911,False


In [46]:
# Multi-axis selection by label: rows by a date-label slice (both endpoints
# are included), columns by a list of names
df.loc['20160101':'20160102',['Column1','Column3']]

Unnamed: 0,Column1,Column3
2016-01-01,0.877179,0.535077
2016-01-02,0.163296,0.746911


In [48]:
# Multi-axis selection by integer position: rows 3-4 and columns 0-1
# (the end position of each slice is excluded)
df.iloc[3:5, 0:2]

Unnamed: 0,Column1,Column2
2016-01-04,0.677718,0.308529
2016-01-05,0.624073,0.500785


In [49]:
# Boolean indexing: keep only the rows where Column1 is greater than 0.5
df[df.Column1 > .5]

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-01,0.877179,0.138904,0.535077,True
2016-01-03,0.952653,0.986626,0.262589,False
2016-01-04,0.677718,0.308529,0.014202,False
2016-01-05,0.624073,0.500785,0.72094,True
2016-01-06,0.642954,0.85439,0.124858,True


In [None]:
"""

F) Iterating

"""

In [52]:
# iterrows() yields one (index label, row) pair per row; each row is a Series.
for index, row in df.iterrows():
    # A single pre-formatted string prints the same way on Python 2 and 3.
    # .iloc[0] makes the positional lookup of the first column explicit instead
    # of relying on [] falling back to positions on a label-indexed Series.
    print("{} {}".format(index, row.iloc[0]))

2016-01-01 00:00:00 0.877179280494
2016-01-02 00:00:00 0.163295802619
2016-01-03 00:00:00 0.952653361904
2016-01-04 00:00:00 0.677717788787
2016-01-05 00:00:00 0.624073316016
2016-01-06 00:00:00 0.64295385802


In [None]:
"""

G) Reshaping & Sorting

"""

In [30]:
# Transpose: rows become columns and columns become rows
df.T

Unnamed: 0,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00
Column1,0.877179,0.163296,0.952653,0.677718,0.624073,0.642954
Column2,0.138904,0.67622,0.986626,0.308529,0.500785,0.85439
Column3,0.535077,0.746911,0.262589,0.014202,0.72094,0.124858
Column4,True,False,False,False,True,True


In [32]:
# Sort by index: axis=0 sorts on the row labels; ascending=False puts the
# latest date first
df.sort_index(axis=0, ascending=False)

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-06,0.642954,0.85439,0.124858,True
2016-01-05,0.624073,0.500785,0.72094,True
2016-01-04,0.677718,0.308529,0.014202,False
2016-01-03,0.952653,0.986626,0.262589,False
2016-01-02,0.163296,0.67622,0.746911,False
2016-01-01,0.877179,0.138904,0.535077,True


In [33]:
# Sort rows by the values in Column2 (smallest first)
df.sort_values(by='Column2')

Unnamed: 0,Column1,Column2,Column3,Column4
2016-01-01,0.877179,0.138904,0.535077,True
2016-01-04,0.677718,0.308529,0.014202,False
2016-01-05,0.624073,0.500785,0.72094,True
2016-01-02,0.163296,0.67622,0.746911,False
2016-01-06,0.642954,0.85439,0.124858,True
2016-01-03,0.952653,0.986626,0.262589,False


In [None]:
"""

H) Join & Group

"""

In [63]:
# Join: build two frames that share the same date index, then combine them.
data1 = np.random.random(size=(6, 2))
data2 = np.random.random(size=(6, 2))
df1 = pd.DataFrame(data1, columns=['ColumnA', 'ColumnB'], index=dates)
df2 = pd.DataFrame(data2, columns=['ColumnC', 'ColumnD'], index=dates)
# DataFrame.join matches rows on the index. To combine on another column
# instead, use: pd.merge(df1, df2, on="column_name", how='left')
df1.join(df2)

Unnamed: 0,ColumnA,ColumnB,ColumnC,ColumnD
2016-01-01,0.355089,0.420808,0.256223,0.499482
2016-01-02,0.095692,0.058396,0.7788,0.736791
2016-01-03,0.315886,0.616283,0.936973,0.501672
2016-01-04,0.2852,0.720212,0.947503,0.725805
2016-01-05,0.043504,0.713114,0.912118,0.461885
2016-01-06,0.930846,0.268288,0.100112,0.855001


In [69]:
# Group rows by the boolean Column4, then compute the correlation matrix of
# the remaining columns within each group
df.groupby('Column4').corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,Column1,Column2,Column3
Column4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,Column1,1.0,0.296913,-0.771258
False,Column2,0.296913,1.0,0.378822
False,Column3,-0.771258,0.378822,1.0
True,Column1,1.0,-0.834305,0.146446
True,Column2,-0.834305,1.0,-0.66754
True,Column3,0.146446,-0.66754,1.0


In [None]:
"""

More Resources

http://pandas.pydata.org/pandas-docs/stable/10min.html#min

http://pandas.pydata.org/pandas-docs/stable/tutorials.html

"""