## Shortcuts for importing modules

In [5]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

# Arrays using numpy

## Make an array (numpy) and perform simple math operations

In [16]:
# Build two length-10 arrays of standard-normal draws; ab is scaled by 2.
# No seed is set, so the values change on every run.
aa = np.random.randn(10)
ab = np.random.randn(10) * 2

In [18]:
# Element-wise sum of the two arrays (same result as aa + ab)
np.add(aa,ab)

array([ 0.43779447,  1.64137475,  2.04881804, -2.06079191, -2.86787099,
       -1.89982058,  4.80373475, -4.07155319,  1.14069656, -1.59235052])

In [19]:
# Replace every negative value of ab with -10 and keep the other values as they are;
# np.where builds a new array, ab itself is not modified
np.where(ab < 0, -10, ab)

array([-10.        ,   1.27986027,   2.38580663, -10.        ,
       -10.        , -10.        ,   5.03213798, -10.        ,
       -10.        , -10.        ])

# Series using Pandas

In [24]:
# Build two Series of consecutive integers labelled with letters:
# ser1 holds 0-4 on labels A-E, ser2 holds 5-10 on labels A-F
ser1 = Series(np.arange(5), index=list('ABCDE'))
ser2 = Series(np.arange(5, 11), index=list('ABCDEF'))


In [25]:
# Boolean-mask selection: keep only the entries of ser1 whose value is greater than 2
ser1[ser1>2]

D    3
E    4
dtype: int64

In [28]:
# Determine if a label is in a Series: `in` tests the index labels, not the values.
# print() is called as a function so the cell also runs on Python 3; with a
# single argument it prints the same thing on Python 2.
print('F' in ser1)
print('F' in ser2)

False
True


In [31]:
# Add two Series aligned on their index labels; a label that exists in only one
# of them ('F' here) produces NaN, so the result is stored as float64
ser3 = ser1+ser2
ser3

A     5
B     7
C     9
D    11
E    13
F   NaN
dtype: float64

In [35]:
# Flag missing values: gives a boolean Series that is True where ser3 is NaN
pd.isnull(ser3)

A    False
B    False
C    False
D    False
E    False
F     True
dtype: bool

In [71]:
# Drop an entry from a Series by its index label.
# drop() gives back a new Series without 'C'; ser3 itself keeps the entry.
# print() is called as a function so the cell also runs on Python 3.
print(ser3)
ser3.drop('C')

A     5
B     7
C     9
D    11
E    13
F   NaN
dtype: float64


A     5
B     7
D    11
E    13
F   NaN
dtype: float64

In [81]:
# Three ways to select entries from a Series.
# NOTE: a notebook cell only displays the value of its last expression, so
# only the boolean-mask result at the bottom appears in the output.
# By index label
ser1['B']

# By position (slice covering the first two entries)
ser2[0:2]

# By boolean condition
ser1[ser1>2]

D    3
E    4
dtype: int64

In [94]:
# Sort by value, largest first; the NaN entry is placed at the end
ser3.sort_values(ascending=False)

E    13
D    11
C     9
B     7
A     5
F   NaN
dtype: float64

In [96]:
# Sort by index label in descending (reverse alphabetical) order
ser3.sort_index(ascending=False)

F   NaN
E    13
D    11
C     9
B     7
A     5
dtype: float64

In [97]:
# Rank the values in ascending order (1 = smallest); the NaN entry stays NaN
ser3.rank()

A     1
B     2
C     3
D     4
E     5
F   NaN
dtype: float64

In [114]:
# List the distinct values as a NumPy array; NaN is reported as a value too
ser3.unique()

array([  5.,   7.,   9.,  11.,  13.,  nan])

In [115]:
# Count how often each distinct value occurs; NaN is left out of the counts
ser3.value_counts()

7     1
13    1
5     1
11    1
9     1
dtype: int64

In [118]:
# Give back the Series without its NaN entries
ser3.dropna()

A     5
B     7
C     9
D    11
E    13
dtype: float64

# DataFrames

## How to make a DataFrame

In [6]:
# Build a DataFrame from a dictionary: each key becomes a column name, each
# list supplies that column's values, and `index` provides the row labels
df = DataFrame(data={'col1': [1, 2, 3, 4], 'col2': [10, 20, 30, 40]},
               index=['SF', 'LA', 'DEN', 'BUF'])
df

Unnamed: 0,col1,col2
SF,1,10
LA,2,20
DEN,3,30
BUF,4,40


In [9]:
# Build a 3x3 DataFrame from the integers 0-8 (filled row by row),
# with explicit row and column labels
df2 = DataFrame(data=np.arange(9).reshape((3, 3)),
                index=['i1', 'i2', 'i3'],
                columns=['c1', 'c2', 'c3'])
df2

Unnamed: 0,c1,c2,c3
i1,0,1,2
i2,3,4,5
i3,6,7,8


## Show head and tail of dataframe

In [45]:
# Show the first and the last rows of the dataframe (up to 5 rows each by default).
# df has only 4 rows, so both calls print the whole frame.
# print() is called as a function so the cell also runs on Python 3.
print(df.head())
print(df.tail())

     col1  col2
SF      1    10
LA      2    20
DEN     3    30
BUF     4    40
     col1  col2
SF      1    10
LA      2    20
DEN     3    30
BUF     4    40


## Transpose dataframes

In [11]:
# Transpose with the .T attribute: row labels become columns and column names become rows
df.T

Unnamed: 0,SF,LA,DEN,BUF
col1,1,2,3,4
col2,10,20,30,40


## Check / modify column and indexes

In [19]:
# Show the column labels of the dataframe
df.columns

Index([u'col1', u'col2'], dtype='object')

In [17]:
# Show the row labels (the index) of the dataframe
df.index

Index([u'SF', u'LA', u'DEN', u'BUF'], dtype='object')

## Select a specific column

In [38]:
# Attribute access: df.<name> returns the column as a Series; this only works
# when the column name is a valid Python identifier that does not clash with
# an existing DataFrame attribute or method
df.col1

SF     1
LA     2
DEN    3
BUF    4
Name: col1, dtype: int64

In [39]:
# Bracket access by column name: returns the same Series as df.col1
df['col1']

SF     1
LA     2
DEN    3
BUF    4
Name: col1, dtype: int64

In [41]:
# Build a new DataFrame from df that keeps only the listed columns
DataFrame(df, columns=['col1'])

Unnamed: 0,col1
SF,1
LA,2
DEN,3
BUF,4


## Select rows based on index

In [52]:
# Show the current row labels of the dataframe
df.index

Index([u'SF', u'LA', u'DEN', u'BUF'], dtype='object')

In [54]:
# Replace the row labels by assigning a new list with one label per row.
# This changes df in place: the last row is labelled 'PITT' instead of 'BUF'
# in every cell that runs after this one.
df.index = ['SF', 'LA', 'DEN', 'PITT']
df.index

Index([u'SF', u'LA', u'DEN', u'PITT'], dtype='object')

In [61]:
# Another way to reindex: reindex() lays the data out on the requested row and
# column labels and gives back a new frame. Labels that do not exist in df
# ('BUF', 'DC', 'key1', 'key2') are added and filled with NaN.
df.reindex(index=['SF','LA','DEN','PITT','BUF', 'DC'],
           columns=['col1','col2','key1','key2'])

Unnamed: 0,col1,col2,key1,key2
SF,1.0,10.0,,
LA,2.0,20.0,,
DEN,3.0,30.0,,
PITT,4.0,40.0,,
BUF,,,,
DC,,,,


In [66]:
# Select existing rows/columns and add new ones in a single call.
# The .ix indexer was deprecated in pandas 0.20 and removed in pandas 1.0, and
# .loc is not a drop-in replacement because it raises KeyError for labels that
# are missing. reindex() gives the same result: requested labels that do not
# exist in df ('A', 'B', 'n1', 'n2') come back filled with NaN.
df.reindex(index=['SF', 'LA', 'PITT', 'A', 'B'],
           columns=['col1', 'col2', 'n1', 'n2'])

Unnamed: 0,col1,col2,n1,n2
SF,1.0,10.0,,
LA,2.0,20.0,,
PITT,4.0,40.0,,
A,,,,
B,,,,


## Drop a row or a column by label

In [72]:
# Display df as it currently stands, before dropping anything
df

Unnamed: 0,col1,col2
SF,1,10
LA,2,20
DEN,3,30
PITT,4,40


In [73]:
# Drop by label along an axis (0 = rows, 1 = columns); here the row labelled 'SF' is removed
df.drop('SF', axis=0)

Unnamed: 0,col1,col2
LA,2,20
DEN,3,30
PITT,4,40


## Select entries in a dataframe

In [83]:
# Select several columns at once by passing a list of column names
df[['col1','col2']]

Unnamed: 0,col1,col2
SF,1,10
LA,2,20
DEN,3,30
PITT,4,40


In [86]:
# Boolean-mask selection: keep the rows where col1 is greater than 2
df[df['col1']>2]

Unnamed: 0,col1,col2
DEN,3,30
PITT,4,40


## Calculate summary statistics on a dataframe

In [105]:
# axis=1 adds across the columns, giving one total per row
df.sum(axis=1)

SF      11
LA      22
DEN     33
PITT    44
dtype: int64

In [104]:
# axis=0 adds down the rows, giving one total per column
df.sum(axis=0)

col1     10
col2    100
dtype: int64

In [108]:
# Find the index label of the lowest value in each column (use idxmax for the highest)
df.idxmin()

col1    SF
col2    SF
dtype: object

In [111]:
# Cumulative sum: axis=0 accumulates down each column, axis=1 would accumulate across each row
df.cumsum(axis=0)

Unnamed: 0,col1,col2
SF,1,10
LA,3,30
DEN,6,60
PITT,10,100


In [112]:
# Summary statistics for each column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,col1,col2
count,4.0,4.0
mean,2.5,25.0
std,1.290994,12.909944
min,1.0,10.0
25%,1.75,17.5
50%,2.5,25.0
75%,3.25,32.5
max,4.0,40.0


## Remove rows without complete cases

In [140]:
# Build a 5x5 frame holding the integers 0-24
df_empty = DataFrame(np.arange(25).reshape(5,5))

# Replace the cells equal to 5 or 18 with NaN (one cell in row 1, one in row 3).
# mask() builds a new frame with NaN wherever the condition is True, which
# avoids writing NaN into integer columns in place.
df_empty = df_empty.mask(df_empty.isin([5,18]))

# Drop the rows without complete data - removes 2 of the 5 rows.
# dropna() gives back a new frame; df_empty keeps its NaN values.
# print() is called as a function so the cell also runs on Python 3.
print(df_empty.dropna())

# Show a copy in which every NaN is replaced by 100
print(df_empty.fillna(100))

    0   1   2   3   4
0   0   1   2   3   4
2  10  11  12  13  14
4  20  21  22  23  24
     0   1   2    3   4
0    0   1   2    3   4
1  100   6   7    8   9
2   10  11  12   13  14
3   15  16  17  100  19
4   20  21  22   23  24


## Handling multiindexes with dataframes

In [141]:
# Build a 4x4 frame whose columns form a two-level MultiIndex: the first
# level holds 'BUF'/'PITT' and the second level holds 'hockey'/'football'
df_mi = DataFrame(data=np.arange(16).reshape((4, 4)),
                  index=list('ABCD'),
                  columns=[['BUF', 'BUF', 'PITT', 'PITT'],
                           ['hockey', 'football', 'hockey', 'football']])
df_mi

Unnamed: 0_level_0,BUF,BUF,PITT,PITT
Unnamed: 0_level_1,hockey,football,hockey,football
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [142]:
# Name the two levels of the column MultiIndex so that a level can be referred
# to by name (e.g. level='city'); this modifies df_mi in place
df_mi.columns.names = ['city', 'sport']
df_mi

city,BUF,BUF,PITT,PITT
sport,hockey,football,hockey,football
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [146]:
# Aggregate over one level of the column MultiIndex: add up the sport columns
# within each city, giving one column per city.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the level is the supported equivalent. The frame is transposed
# so the grouping runs over rows, because groupby(axis=1) is itself deprecated
# in recent pandas, and the result is transposed back afterwards.
df_mi.T.groupby(level='city').sum().T

city,BUF,PITT
A,1,5
B,9,13
C,17,21
D,25,29


In [147]:
#START LECTURE 25