In [145]:
import pandas as pd

#pd.Series?

In [146]:
animals = ['Tiger', 'Bear', 'Moose']

In [147]:
# pandas builds a Series directly from a list; with no index given, the
# entries are labelled 0, 1, 2, ... automatically.
pd.Series(animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [148]:
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [149]:
# A missing entry in a list of strings is kept as None; the dtype is object.
animals = ['Tiger', 'Bear', None]
pd.Series(animals)

0    Tiger
1     Bear
2     None
dtype: object

In [150]:
pd.Series([1,2,None])
# NaN = Not a Number.
# Note that in a numeric Series the missing entry is shown as NaN rather than
# None, and the integers are stored as floats (dtype float64).

0    1.0
1    2.0
2    NaN
dtype: float64

In [151]:
import numpy as np
# NaN and None are not interchangeable: this comparison evaluates to False.
np.nan == None

False

In [152]:
# NaN does not compare equal even to itself, so == cannot be used to detect it.
np.nan == np.nan

False

In [153]:
# Map each sport to a country, then turn the mapping into a Series:
# the dictionary keys become the index labels and the values become the data.
sports = {
    'Archery': 'Bhutan',
    'Golf': 'Scotland',
    'Sumo': 'Japan',
    'Taekwondo': 'South Korea',
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [154]:
s.index

Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [155]:
# Supply the labels explicitly instead of relying on the default 0..n-1 index.
s = pd.Series(
    data=['Tiger', 'Bear', 'Moose'],
    index=['India', 'America', 'Canada'],
)
s

India      Tiger
America     Bear
Canada     Moose
dtype: object

In [156]:
# No index supplied, so the three entries are labelled 0, 1, 2.
s = pd.Series(data=['Tiger', 'Bear', 'Moose'])
s

0    Tiger
1     Bear
2    Moose
dtype: object

# Querying a Series

In [157]:
# Rebuild the sport -> country Series; `s` was reassigned by the cells above.
sports = {
    'Archery': 'Bhutan', 'Golf': 'Scotland',
    'Sumo': 'Japan', 'Taekwondo': 'South Korea',
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [158]:
# .iloc selects by integer position, counting from 0 (position 3 is the fourth entry).
s.iloc[3]

'South Korea'

In [159]:
# .loc selects by index label.
s.loc['Golf']

'Scotland'

In [160]:
# Plain [] with an integer on a label-indexed Series used to fall back to a
# positional lookup (like .iloc). That fallback is deprecated since
# pandas 2.1, so request position 3 explicitly.
s.iloc[3]

'South Korea'

In [161]:
# With a label, plain [] behaves like .loc.
s['Archery']

'Bhutan'

In [162]:
# This time the dictionary keys, and therefore the index labels, are integers.
sports = {
    99: 'Bhutan',
    100: 'Scotland',
}
s = pd.Series(data=sports)
s

99       Bhutan
100    Scotland
dtype: object

In [163]:
s.iloc[0]
# s[0] would raise an error here: the index consists of the integer keys
# 99 and 100, so [] looks the value up by label, and there is no label 0.

'Bhutan'

In [164]:
import numpy as np

# Seven random integers drawn from the half-open range [0, 100).
s = pd.Series(np.random.randint(low=0, high=100, size=7))
s

0    96
1    40
2    74
3    33
4    43
5    93
6    86
dtype: int32

In [165]:
# Let numpy add up the whole Series in a single vectorised call.
total = np.sum(s)
print(total)

465


In [166]:
# 10,000 random integers in [0, 1000): the input for the timing cells below.
s = pd.Series(np.random.randint(low=0, high=1000, size=10000))
s.head()

0    937
1     66
2    849
3    966
4    339
dtype: int32

In [167]:
%%timeit -n 500

summary = 0
for item in s:
    summary += item

500 loops, best of 3: 896 µs per loop


In [168]:
%%timeit -n 500
sumary = np.sum(s)

500 loops, best of 3: 99.1 µs per loop


In [169]:
# Add 2 to every element in one operation; no explicit loop is needed.
s += 2
s.head()

0    939
1     68
2    851
3    968
4    341
dtype: int32

In [170]:
# Start from three integers with the default 0, 1, 2 index ...
s = pd.Series(data=[1, 2, 3])
# ... then assign to a label that does not exist yet: .loc adds the entry.
# Labels and values now mix types, so the Series dtype becomes object.
s.loc['Car'] = 'Aston Martin'
s

0                 1
1                 2
2                 3
Car    Aston Martin
dtype: object

# The DataFrame Data Structure

The DataFrame is the heart of the pandas library.

In [171]:
# One Series per purchase; the dictionary keys will become the column names.
p1 = pd.Series({'Name': 'Carson', 'Items Purchased': 'Rockets', 'Cost': 55.99})
p2 = pd.Series({'Name': 'Kelvin', 'Items Purchased': 'MacBook Pro', 'Cost': 249.99})
p3 = pd.Series({'Name': 'Deacon', 'Items Purchased': 'Cat Food', 'Cost': 34.99})

# Stack the three Series as rows. The row labels are given explicitly and
# 'Store 1' is used twice, which the lookups further down rely on.
df = pd.DataFrame(
    [p1, p2, p3],
    index=['Store 1', 'Store 1', 'Store 3'],
)
df.head()

Unnamed: 0,Cost,Items Purchased,Name
Store 1,55.99,Rockets,Carson
Store 1,249.99,MacBook Pro,Kelvin
Store 3,34.99,Cat Food,Deacon


In [172]:
df.loc['Store 3']

Cost                  34.99
Items Purchased    Cat Food
Name                 Deacon
Name: Store 3, dtype: object

In [173]:
type(df.loc['Store 3'])

pandas.core.series.Series

In [174]:
# The label 'Store 1' occurs twice in the index, so both matching rows come
# back, as a DataFrame rather than a single Series.
df.loc['Store 1']

Unnamed: 0,Cost,Items Purchased,Name
Store 1,55.99,Rockets,Carson
Store 1,249.99,MacBook Pro,Kelvin


In [175]:
type(df.loc['Store 1'])

pandas.core.frame.DataFrame

In [176]:
df['Cost']

Store 1     55.99
Store 1    249.99
Store 3     34.99
Name: Cost, dtype: float64

In [177]:
# .loc[row_label, column_label]: restrict to the 'Store 1' rows and the
# 'Cost' column in a single call.
df.loc['Store 1','Cost']

Store 1     55.99
Store 1    249.99
Name: Cost, dtype: float64

In [178]:
df.T

Unnamed: 0,Store 1,Store 1.1,Store 3
Cost,55.99,249.99,34.99
Items Purchased,Rockets,MacBook Pro,Cat Food
Name,Carson,Kelvin,Deacon


In [179]:
df['Items Purchased']

Store 1        Rockets
Store 1    MacBook Pro
Store 3       Cat Food
Name: Items Purchased, dtype: object

In [180]:
# Chained indexing: select the 'Store 1' rows first, then take 'Cost' from
# that intermediate result. It yields the same values as
# df.loc['Store 1', 'Cost']; prefer the single .loc call, especially when
# assigning.
df.loc['Store 1']['Cost']

Store 1     55.99
Store 1    249.99
Name: Cost, dtype: float64

In [181]:
# ':' keeps every row; the list picks the columns to return, in the order given.
df.loc[:,['Name','Cost']]

Unnamed: 0,Name,Cost
Store 1,Carson,55.99
Store 1,Kelvin,249.99
Store 3,Deacon,34.99


In [182]:
# drop() returns a new DataFrame without the 'Store 1' rows; df itself is
# left unchanged, as the next cell shows.
df.drop('Store 1')

Unnamed: 0,Cost,Items Purchased,Name
Store 3,34.99,Cat Food,Deacon


In [183]:
df

Unnamed: 0,Cost,Items Purchased,Name
Store 1,55.99,Rockets,Carson
Store 1,249.99,MacBook Pro,Kelvin
Store 3,34.99,Cat Food,Deacon


In [184]:
# drop() returns a new DataFrame unless inplace=True is passed, in which case
# the frame it is called on is modified directly. Working on a copy keeps the
# 'Name' column in df.
# axis=1 says 'Name' is a column label; axis=0 (the default) targets row labels.
cdf = df.copy().drop('Name', axis=1)
cdf

Unnamed: 0,Cost,Items Purchased
Store 1,55.99,Rockets
Store 1,249.99,MacBook Pro
Store 3,34.99,Cat Food


In [185]:
# NOTE: unlike drop(), del removes the column from cdf itself and takes
# effect immediately.
del cdf['Cost']
cdf

Unnamed: 0,Items Purchased
Store 1,Rockets
Store 1,MacBook Pro
Store 3,Cat Food


In [186]:
# Assigning a scalar to a new column name creates the column and fills every
# row with that value (here: None).
df['Location'] = None
df

Unnamed: 0,Cost,Items Purchased,Name,Location
Store 1,55.99,Rockets,Carson,
Store 1,249.99,MacBook Pro,Kelvin,
Store 3,34.99,Cat Food,Deacon,


In [187]:
df

Unnamed: 0,Cost,Items Purchased,Name,Location
Store 1,55.99,Rockets,Carson,
Store 1,249.99,MacBook Pro,Kelvin,
Store 3,34.99,Cat Food,Deacon,


In [188]:
# Scale every value in the 'Cost' column to 80% of its current value.
# NOTE: not idempotent -- each re-run of this cell shrinks the costs again.
df['Cost'] *= 0.8
df

Unnamed: 0,Cost,Items Purchased,Name,Location
Store 1,44.792,Rockets,Carson,
Store 1,199.992,MacBook Pro,Kelvin,
Store 3,27.992,Cat Food,Deacon,


# DataFrame Indexing and Loading

In [191]:
costs = df['Cost']
costs

Store 1     44.792
Store 1    199.992
Store 3     27.992
Name: Cost, dtype: float64

In [192]:
# In-place addition on the Series that was pulled out of df. As the next cell
# shows, df's 'Cost' column changes too: `costs` is a view here, not a copy.
# NOTE(review): with pandas Copy-on-Write enabled this is not expected to
# propagate to df -- confirm against the pandas version in use.
costs += 2
costs

Store 1     46.792
Store 1    201.992
Store 3     29.992
Name: Cost, dtype: float64

In [193]:
df

Unnamed: 0,Cost,Items Purchased,Name,Location
Store 1,46.792,Rockets,Carson,
Store 1,201.992,MacBook Pro,Kelvin,
Store 3,29.992,Cat Food,Deacon,


# Example DataFrame

In [196]:
# Three purchase records, one Series each; the keys become the columns.
purchase_1 = pd.Series({'Name': 'Chris', 'Item Purchased': 'Dog Food', 'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod', 'Item Purchased': 'Bird Seed', 'Cost': 5.00})

# Each Series becomes a row, labelled by the store where the purchase was made.
df = pd.DataFrame(
    [purchase_1, purchase_2, purchase_3],
    index=['Store 1', 'Store 1', 'Store 2'],
)

df

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [197]:
# Copy the index labels into a regular column so they can be used as one
# level of the new index built in the next cell.
df['Location'] = df.index
df

Unnamed: 0,Cost,Item Purchased,Name,Location
Store 1,22.5,Dog Food,Chris,Store 1
Store 1,2.5,Kitty Litter,Kevyn,Store 1
Store 2,5.0,Bird Seed,Vinod,Store 2


In [198]:
# Promote 'Location' and 'Name' from columns to a two-level index.
# NOTE: df is rebound to the result, so re-running this cell on that result
# would no longer find those two columns.
df = df.set_index(['Location','Name'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Cost,Item Purchased
Location,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
Store 1,Chris,22.5,Dog Food
Store 1,Kevyn,2.5,Kitty Litter
Store 2,Vinod,5.0,Bird Seed


# Adding New Entries into a DataFrame
Construct the new entry as a Series whose labels match the DataFrame's columns, and use the Series' `name` to carry the row's index label.

In [199]:
# The new row as a Series: its labels match the DataFrame's columns, and its
# name is the (Location, Name) tuple that will identify the row.
nf = pd.Series(
    {'Cost': 3.00, 'Item Purchased': 'Kitty Food'},
    name=('Store 2', 'Kevyn'),
)
nf

Cost                       3
Item Purchased    Kitty Food
Name: (Store 2, Kevyn), dtype: object

In [200]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so the
# Series is wrapped in a one-row DataFrame and concatenated instead.
# The Series name, a (Location, Name) tuple, becomes the row's index entry;
# the level names are copied from df so the combined index keeps them.
df = pd.concat([
    df,
    pd.DataFrame(
        [nf],
        index=pd.MultiIndex.from_tuples([nf.name], names=df.index.names),
    ),
])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Cost,Item Purchased
Location,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
Store 1,Chris,22.5,Dog Food
Store 1,Kevyn,2.5,Kitty Litter
Store 2,Vinod,5.0,Bird Seed
Store 2,Kevyn,3.0,Kitty Food


# Sketch Pad

In [10]:
import pandas as pd
import numpy as np

# 1000 draws from a standard normal distribution.
series = pd.Series(np.random.randn(1000))
# NOTE: `srs` is a second name for the same object, not a copy, so the
# assignment below is visible through `series` as well.
srs = series

# Blank out every other entry (positions 0, 2, 4, ...).
srs.iloc[::2] = np.nan

# describe() skips NaN, so only the 500 remaining values are counted.
print(srs.describe())

count    500.000000
mean       0.091008
std        1.017230
min       -3.051167
25%       -0.634418
50%        0.077845
75%        0.822614
max        3.000042
dtype: float64


In [11]:
# 1000 x 5 frame of standard-normal draws, columns 'a' to 'e'.
frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
# Blank out every other row. The .ix indexer was removed in pandas 1.0;
# .iloc performs the same positional slice.
frame.iloc[::2] = np.nan

# describe() skips NaN, so each column reports the 500 remaining values.
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.003659,0.01399,0.033146,-0.049145,0.083163
std,0.973855,1.048377,1.012125,0.936336,1.009576
min,-2.69512,-2.885596,-3.00502,-2.604747,-2.921265
25%,-0.638134,-0.660497,-0.667084,-0.697719,-0.582626
50%,-0.037214,0.050819,0.062438,-0.048597,0.123977
75%,0.617234,0.718401,0.760607,0.591313,0.698098
max,2.972838,2.773919,3.344598,3.545762,3.696525
