# Pandas (panel/table data)
- Much of it comes from the "R" statistical language
- based on numpy
- Can connect directly to databases
- Can read/write in many file formats
- very large package
- pandas developers claim:
    - more functionality than R
    - faster algorithms than R
- the two primary classes are Series and DataFrame
- [doc](http://pandas.pydata.org)
- [cheat sheet](https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf)

In [1]:
# standard abbreviations

import numpy as np
import pandas as pd
import datetime
pd.__version__

'0.20.3'

# a pandas series has an index and one column of data

In [2]:
ser = pd.Series(range(10,15))

ser

0    10
1    11
2    12
3    13
4    14
dtype: int32

In [3]:
# can index 

ser[3]

13

In [4]:
# and slice

ser[:3]

0    10
1    11
2    12
dtype: int32

In [5]:
# index does not need to be numeric

# the labels take the place of the default 0..4 integer index
ser = pd.Series(
    range(10, 15),
    index=['butler', 'math', 'science', 'avery', 'business'],
)
ser

butler      10
math        11
science     12
avery       13
business    14
dtype: int32

In [6]:
# slice
# note - inclusive, unlike list slice

ser['butler':'science']

butler     10
math       11
science    12
dtype: int32

In [7]:
ser

butler      10
math        11
science     12
avery       13
business    14
dtype: int32

In [8]:
# index

ser['science']

12

In [9]:
ser

butler      10
math        11
science     12
avery       13
business    14
dtype: int32

In [10]:
# second Series carrying the same five labels, values 20..24
ser2 = pd.Series(
    range(20, 25),
    index=['butler', 'math', 'science', 'avery', 'business'],
)
ser2

butler      20
math        21
science     22
avery       23
business    24
dtype: int32

In [11]:
# can add series

ser + ser2

butler      30
math        32
science     34
avery       36
business    38
dtype: int32

In [12]:
# does the right thing with scalars

2 * ser + 3 * ser2 + 5

butler       85
math         90
science      95
avery       100
business    105
dtype: int32

In [13]:
ser

butler      10
math        11
science     12
avery       13
business    14
dtype: int32

In [14]:
# this has some different fields 

# only four labels here, and 'law' appears in no other Series
ser3 = pd.Series(
    range(10, 14),
    index=['math', 'science', 'avery', 'law'],
)
ser3

math       10
science    11
avery      12
law        13
dtype: int32

In [15]:
# business, butler, law indexes are only 
# defined in one of the summands, 
# so can't compute their sums

# hey, where did the floating point come from??

ssum=ser+ser3
ssum

avery       25.0
business     NaN
butler       NaN
law          NaN
math        21.0
science     23.0
dtype: float64

# addition of Series
- same index values are added together, even though indexes are in different order
- the sum index is the union of the indexes in both Series. 
- if an index label is missing from either Series, the result for that label is the special IEEE floating point value NaN (Not a Number), which normally represents the result of an invalid floating point operation
- NaNs let pandas represent missing values efficiently
- note that in order to use NaNs, the original integer values were converted to floats!

# real world data almost always has missing values - need to deal with it


In [16]:
# functions like mean are smart about NaN's
# they just skip NaN's, instead of raising errors

[ssum.mean(), (25+21+23)/3.]

[23.0, 23.0]

In [17]:
# call sin on each element
# don't raise an error on the NaN's
# sin(NaN) = NaN

np.sin(ssum)

avery      -0.132352
business         NaN
butler           NaN
law              NaN
math        0.836656
science    -0.846220
dtype: float64

In [18]:
# drop any row with a NaN

ssum.dropna()

avery      25.0
math       21.0
science    23.0
dtype: float64

In [19]:
ssum

avery       25.0
business     NaN
butler       NaN
law          NaN
math        21.0
science     23.0
dtype: float64

In [20]:
# can fill in missing vals

ssum.fillna(0)

avery       25.0
business     0.0
butler       0.0
law          0.0
math        21.0
science     23.0
dtype: float64

In [21]:
# can be nicer to interpolate missing values

ssum.interpolate()

avery       25.0
business    24.0
butler      23.0
law         22.0
math        21.0
science     23.0
dtype: float64

# Example - find prime numbers
- define findPrimes
- return a list of primes up to (but not including) a given limit
- use [sieve of eratosthenes](https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes#example) algorithm
- use a Series with boolean data
- slices with increments can do most of the work




In [22]:
# find primes upto 20 
# if a bool is True, that number is prime

import pandas as pd
import numpy as np

# use the builtin `bool` as the dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, while plain `bool` works on every version
ser = pd.Series(np.ones(20, dtype=bool))
ser

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
dtype: bool

In [23]:
# 0, 1 are not prime

# assign a real boolean rather than the int 0: writing an int into a bool
# Series can make newer pandas upcast the dtype, which would break the
# ser[ser] boolean mask used later in the notebook
ser[:2] = False
ser

0     False
1     False
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
dtype: bool

In [26]:
# sieve: for every j clear its multiples 2j, 3j, ... (j itself stays marked)
# - write False, not 0, so the Series keeps its bool dtype
# - size the loop from the Series instead of repeating the literal 20
for j in range(2, len(ser)):
    ser[2*j::j] = False
ser

0     False
1     False
2      True
3      True
4     False
5      True
6     False
7      True
8     False
9     False
10    False
11     True
12    False
13     True
14    False
15    False
16    False
17     True
18    False
19     True
dtype: bool

In [27]:
# but how do we get a list of the primes?



















# index myself!!

ser[ser]

2     True
3     True
5     True
7     True
11    True
13    True
17    True
19    True
dtype: bool

In [28]:
ser[ser].index

Int64Index([2, 3, 5, 7, 11, 13, 17, 19], dtype='int64')

In [29]:
def findPrimes(upto):
    """Return the primes strictly below ``upto`` as an ascending list.

    Straightforward Sieve of Eratosthenes on a boolean Series: position k
    stays True for as long as k is still considered prime.

    Parameters
    ----------
    upto : int
        Exclusive upper bound; the sieve covers the integers 0 .. upto-1.

    Returns
    -------
    list
        Every prime p with 2 <= p < upto; empty when upto <= 2.
    """
    # there is no prime below 2; returning early also keeps np.ones() from
    # raising ValueError on a negative size
    if upto <= 2:
        return []
    # 1 = True
    ser = pd.Series(np.ones(upto, dtype=bool))
    # 0,1 not prime
    ser.iloc[:2] = False
    for j in range(2, upto):
        # clear 2j, 3j, 4j, ... while leaving j itself alone;
        # .iloc spells out that the slice is positional
        ser.iloc[2*j::j] = False
    # boolean-mask the Series with itself; the surviving index labels
    # are the primes
    return list(ser[ser].index)

In [30]:
findPrimes(100)

[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]

In [31]:
# faster 

import math

def findPrimes2(upto):
    """Return the primes strictly below ``upto`` as an ascending list.

    Faster Sieve of Eratosthenes: even numbers are removed in one step and
    only odd candidates up to about sqrt(upto) are used for crossing out.

    Parameters
    ----------
    upto : int
        Exclusive upper bound; the sieve covers the integers 0 .. upto-1.

    Returns
    -------
    list
        Every prime p with 2 <= p < upto; empty when upto <= 2.
    """
    # there is no prime below 2; returning early also keeps np.ones() from
    # raising ValueError on a negative size
    if upto <= 2:
        return []
    # 1 = True
    ser = pd.Series(np.ones(upto, dtype=bool))
    # kill 0,1
    ser.iloc[:2] = False
    # kill all the evens
    ser.iloc[4::2] = False
    # now only need to check odd multiples
    # not already touched,
    # up to square root of upto
    for j in range(3, math.ceil(math.sqrt(upto)) + 1, 2):
        if ser.iloc[j]:
            # multiples of j below j*j have a smaller prime factor and are
            # already cleared; even multiples are gone too, so step by 2*j
            ser.iloc[j*j::2*j] = False
    return list(ser[ser].index)
                    

In [32]:
findPrimes2(100)

[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]

# DataFrame
- like a spreadsheet
    - labeled columns
    - rows of data
    - can also be thought of as a collection of Series with the same index

In [33]:
# make a DataFrame from a CSV file
# by default frame displays first and last 30 lines
# NOTE: 'WHO_first9cols.csv' is a relative path, so the file has to sit in
# the notebook's working directory; the recorded run raised
# FileNotFoundError because it was not there

df = pd.read_csv('WHO_first9cols.csv')
df

FileNotFoundError: File b'WHO_first9cols.csv' does not exist

In [None]:
type(df)

In [None]:
# quick summary of the data

df.describe()

In [None]:
# len is number of rows

[df.shape, len(df), df.columns]

In [None]:
# pull a column out, you get a Series

cs= df['Country']
pop='Population (in thousands) total'
gross = 'Gross national income per capita (PPP international $)'
print(type(cs))
cs

In [None]:
# can slice a data frame

df[-2:]

In [None]:
# pull out a row by name
# df['Country'] == 'Bermuda' is a boolean series

df[df['Country'] == 'Bermuda']

In [None]:
# like linux 'head'

df.head(4)

In [None]:
# like linux 'tail' command

df.tail(4)

In [None]:
df.sort_values(by=pop, ascending=False)

In [None]:
# cumulative sum, ignoring NaN's

df[pop].cumsum()

In [None]:
# can add cumsum as a column

df['cumsum pop'] = df[pop].cumsum()
df.head()

In [None]:
# column name

lit = 'Adult literacy rate (%)'

# pull out info about the column

[df[lit].min(), df[lit].max(), \
 df[lit].mean(), df[lit].std()]

In [None]:
# make a boolean array - true if literacy>mean

print(df[lit].mean())
ba = df[lit] > df[lit].mean()
ba[:9]

In [None]:
# select countries with literacy rate greater than the mean
# one liner

df[ df[lit] > df[lit].mean()]

In [None]:
# make a new data frame with the selected rows

df2 = df[ df[lit] > df[lit].mean()]
df2

In [None]:
# make new frame with lit > mean, and gross > 11000

# might try this, but doesn't work: Python's 'and' needs one True/False
# from each side, and the truth value of a whole Series is ambiguous, so
# pandas raises ValueError. Catch it and show the message so the rest of
# the notebook still runs.
try:
    df[df[lit] > df[lit].mean() and df[gross] > 11000]
except ValueError as err:
    print('ValueError:', err)

In [None]:
# ...oddly, pandas expressions use '&', '|',
# instead of 'and' and 'or'
# the parentheses are required: '&' binds tighter than '>'
# keep rows with literacy above the mean AND gross income above 11000

df[ (df[lit] > df[lit].mean()) & (df[gross] > 11000)]

# date and time functionality
- pandas has quite a bit of it
- very popular in finance
- [doc](http://pandas.pydata.org/pandas-docs/stable/timeseries.html)
- [date_range frequencies](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries-offset-aliases)

In [34]:
# Series and DataFrame can have datetime indexes

# freq="B" keeps business days only, so weekends are skipped
dr = pd.date_range(start='2017-10-02', end='2017-10-31', freq="B")
dr

DatetimeIndex(['2017-10-02', '2017-10-03', '2017-10-04', '2017-10-05',
               '2017-10-06', '2017-10-09', '2017-10-10', '2017-10-11',
               '2017-10-12', '2017-10-13', '2017-10-16', '2017-10-17',
               '2017-10-18', '2017-10-19', '2017-10-20', '2017-10-23',
               '2017-10-24', '2017-10-25', '2017-10-26', '2017-10-27',
               '2017-10-30', '2017-10-31'],
              dtype='datetime64[ns]', freq='B')

In [35]:

pd.Series(range(len(dr)), index=dr)

2017-10-02     0
2017-10-03     1
2017-10-04     2
2017-10-05     3
2017-10-06     4
2017-10-09     5
2017-10-10     6
2017-10-11     7
2017-10-12     8
2017-10-13     9
2017-10-16    10
2017-10-17    11
2017-10-18    12
2017-10-19    13
2017-10-20    14
2017-10-23    15
2017-10-24    16
2017-10-25    17
2017-10-26    18
2017-10-27    19
2017-10-30    20
2017-10-31    21
Freq: B, dtype: int32

# to access/modify individual dataframe elements, use 'indexers'
- 'iloc' - address using integers
- 'loc' - address using names
- 'ix' - deprecated, do not use

In [36]:
# weekly (Sunday-anchored) index across October 2017
# `columns` pins the order to bar, foo: older pandas sorted dict keys
# alphabetically while newer releases keep insertion order, and the
# positional .iloc examples in this notebook assume bar comes first
df = pd.DataFrame(
    {'foo': [0, 1, 2, 3], 'bar': [30, 31, 32, 33]},
    index=pd.date_range('2017-10-02', '2017-10-31', freq="W"),
    columns=['bar', 'foo'],
)
df

Unnamed: 0,bar,foo
2017-10-08,30,0
2017-10-15,31,1
2017-10-22,32,2
2017-10-29,33,3


In [37]:
# gets the row as a Series
# a bit inefficient...
# the key is a pd.Timestamp because newer pandas releases do not accept a
# plain datetime.date as a DatetimeIndex label

df.loc[pd.Timestamp('2017-10-15')]

bar    31
foo     1
Name: 2017-10-15 00:00:00, dtype: int64

In [38]:
# pd.Timestamp key: a plain datetime.date is not accepted as a
# DatetimeIndex label by newer pandas releases
df.loc[pd.Timestamp('2017-10-15')].bar

31

In [39]:
# ...get the specific element by itself
# specify [row, col] by names
# (row label given as a pd.Timestamp, which matches the DatetimeIndex on
# old and new pandas alike)

df.loc[pd.Timestamp('2017-10-15'), 'bar']

31

In [40]:
# modify it
# use a pd.Timestamp row label so the assignment hits the existing row;
# NOTE(review): with a datetime.date key newer pandas may not match the
# DatetimeIndex entry and could add a new row instead - confirm on your version

df.loc[pd.Timestamp('2017-10-15'), 'bar'] = 1234
df

Unnamed: 0,bar,foo
2017-10-08,30,0
2017-10-15,1234,1
2017-10-22,32,2
2017-10-29,33,3


In [41]:
# row as Series

df.iloc[2]

bar    32
foo     2
Name: 2017-10-22 00:00:00, dtype: int64

In [42]:
# specify element at [row,col] by row and col integers

df.iloc[2,1]

2

In [43]:
# modify it

df.iloc[2,1] = 32043
df

Unnamed: 0,bar,foo
2017-10-08,30,0
2017-10-15,1234,1
2017-10-22,32,32043
2017-10-29,33,3


# two sample datasets below to play with

In [44]:
import seaborn as sns 

planets = sns.load_dataset('planets')
planets


Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.100,77.40,2006
1,Radial Velocity,1,874.774000,2.210,56.95,2008
2,Radial Velocity,1,763.000000,2.600,19.84,2011
3,Radial Velocity,1,326.030000,19.400,110.62,2007
4,Radial Velocity,1,516.220000,10.500,119.47,2009
5,Radial Velocity,1,185.840000,4.800,76.39,2008
6,Radial Velocity,1,1773.400000,4.640,18.15,2002
7,Radial Velocity,1,798.500000,,21.41,1996
8,Radial Velocity,1,993.300000,10.300,73.10,2008
9,Radial Velocity,2,452.800000,1.990,74.79,2010


In [45]:
titanic = sns.load_dataset('titanic')
titanic


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
