In [1]:
import pandas as pd
import numpy as np

# 5.1 Introduction to pandas Data Structures

To get started with pandas, you will need to get comfortable with its two workhorse data structures: Series and DataFrame.

## Series

A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index.


In [2]:
obj = pd.Series([4, 5, 6, 7, 8])

In [3]:
obj

0    4
1    5
2    6
3    7
4    8
dtype: int64

In [4]:
obj.values

array([4, 5, 6, 7, 8], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Series with explicit string labels in place of the default integer index.
obj2 = pd.Series(
    [4, 5, 6, 7, 8],
    index=list('abcde'),
)

In [7]:
obj2

a    4
b    5
c    6
d    7
e    8
dtype: int64

In [8]:
obj2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [9]:
obj2['b']

5

In [10]:
obj2['c'] = 98

In [11]:
obj2

a     4
b     5
c    98
d     7
e     8
dtype: int64

In [12]:
obj2[['a', 'b', 'c']]

a     4
b     5
c    98
dtype: int64

In [13]:
obj2

a     4
b     5
c    98
d     7
e     8
dtype: int64

In [14]:
obj2 > 5

a    False
b    False
c     True
d     True
e     True
dtype: bool

In [15]:
obj2[obj2 > 5]

c    98
d     7
e     8
dtype: int64

In [16]:
obj2 * 2

a      8
b     10
c    196
d     14
e     16
dtype: int64

In [17]:
obj2 ** 2

a      16
b      25
c    9604
d      49
e      64
dtype: int64

In [18]:
np.exp(obj2)

a    5.459815e+01
b    1.484132e+02
c    3.637971e+42
d    1.096633e+03
e    2.980958e+03
dtype: float64

Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values. It can be used in many contexts where you might use a dict.

In [19]:
obj2

a     4
b     5
c    98
d     7
e     8
dtype: int64

In [20]:
'd' in obj2

True

In [21]:
'g' not in obj2

True

In [23]:
# State name -> value mapping; insertion order is kept and becomes the
# Series index order when this dict is passed to pd.Series.
dict_data = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

In [24]:
dict_data

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [30]:
obj3 = pd.Series(data = dict_data)

In [31]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [34]:
states = ['California', 'Oregon', 'Ohio', 'Utah']

In [35]:
states

['California', 'Oregon', 'Ohio', 'Utah']

In [36]:
obj4 = pd.Series(data = dict_data, index = states)

In [37]:
obj4

California        NaN
Oregon        16000.0
Ohio          35000.0
Utah           5000.0
dtype: float64

In [39]:
pd.isnull(obj4)

California     True
Oregon        False
Ohio          False
Utah          False
dtype: bool

In [40]:
pd.notnull(obj4)

California    False
Oregon         True
Ohio           True
Utah           True
dtype: bool

In [41]:
pd.notna(obj4)

California    False
Oregon         True
Ohio           True
Utah           True
dtype: bool

In [42]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [43]:
obj4

California        NaN
Oregon        16000.0
Ohio          35000.0
Utah           5000.0
dtype: float64

In [44]:
obj3 + obj4

California        NaN
Ohio          70000.0
Oregon        32000.0
Texas             NaN
Utah          10000.0
dtype: float64

A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations

In [45]:
obj4

California        NaN
Oregon        16000.0
Ohio          35000.0
Utah           5000.0
dtype: float64

In [46]:
obj4.name = 'population'

In [47]:
obj4

California        NaN
Oregon        16000.0
Ohio          35000.0
Utah           5000.0
Name: population, dtype: float64

In [48]:
obj4.index.name = 'states'

In [50]:
obj4

states
California        NaN
Oregon        16000.0
Ohio          35000.0
Utah           5000.0
Name: population, dtype: float64

In [51]:
obj

0    4
1    5
2    6
3    7
4    8
dtype: int64

In [52]:
obj.index = ['bob', 'smith', 'ali', 'dan', 'hud']

In [53]:
obj

bob      4
smith    5
ali      6
dan      7
hud      8
dtype: int64

A Series’s index can be altered in place by assignment.

## DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).

The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index.

In [59]:
# Column-oriented input: each key is a column name and each list holds one
# value per row (six rows in total).
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

In [60]:
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [61]:
df = pd.DataFrame(data)

In [62]:
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [64]:
df.head()  # head() returns only the first five rows by default; useful for previewing larger DataFrames

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [71]:
pd.DataFrame(data=data, columns=['pop', 'state', 'year'])

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


If you pass a column that isn’t contained in the dict, it will appear with missing values in the result

In [72]:
# 'debt' is not a key of `data`, so that column is created with missing values.
df2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['pop', 'state', 'year', 'debt'],
)

In [73]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,
two,1.7,Ohio,2001,
three,3.6,Ohio,2002,
four,2.4,Nevada,2001,
five,2.9,Nevada,2002,
six,3.2,Nevada,2003,


In [74]:
df2.columns

Index(['pop', 'state', 'year', 'debt'], dtype='object')

In [75]:
df2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute

In [76]:
df2['year']  # dict-like (bracket) notation: the more conventional way to select a column

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [79]:
df2.year  # attribute access; unreliable when a column name matches a DataFrame method or attribute name, so prefer df2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [80]:
df2.loc['three']  # select a row by its index label with .loc; .iloc selects by integer position instead

pop       3.6
state    Ohio
year     2002
debt      NaN
Name: three, dtype: object

In [81]:
df2.iloc[2]

pop       3.6
state    Ohio
year     2002
debt      NaN
Name: three, dtype: object

In [82]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,
two,1.7,Ohio,2001,
three,3.6,Ohio,2002,
four,2.4,Nevada,2001,
five,2.9,Nevada,2002,
six,3.2,Nevada,2003,


In [83]:
# Broadcast a single scalar (the mean of df's 'pop' column) to every row.
df2['debt'] = df['pop'].mean()

In [84]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,2.55
two,1.7,Ohio,2001,2.55
three,3.6,Ohio,2002,2.55
four,2.4,Nevada,2001,2.55
five,2.9,Nevada,2002,2.55
six,3.2,Nevada,2003,2.55


In [85]:
df2['debt'] = np.arange(6.0)

In [86]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,0.0
two,1.7,Ohio,2001,1.0
three,3.6,Ohio,2002,2.0
four,2.4,Nevada,2001,3.0
five,2.9,Nevada,2002,4.0
six,3.2,Nevada,2003,5.0


When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes

In [89]:
# Partial column: only three of df2's six row labels are present, so the
# other rows become NaN when this Series is assigned to a column.
col = pd.Series({'two': -1.5, 'three': 4.56, 'six': -8.5})

In [90]:
col

two     -1.50
three    4.56
six     -8.50
dtype: float64

In [92]:
df2['debt'] = col

In [93]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,
two,1.7,Ohio,2001,-1.5
three,3.6,Ohio,2002,4.56
four,2.4,Nevada,2001,
five,2.9,Nevada,2002,
six,3.2,Nevada,2003,-8.5


In [94]:
df2.notna()

Unnamed: 0,pop,state,year,debt
one,True,True,True,False
two,True,True,True,True
three,True,True,True,True
four,True,True,True,False
five,True,True,True,False
six,True,True,True,True


In [95]:
df2['eastern'] = df2['state'] == 'Ohio'

In [96]:
df2

Unnamed: 0,pop,state,year,debt,eastern
one,1.5,Ohio,2000,,True
two,1.7,Ohio,2001,-1.5,True
three,3.6,Ohio,2002,4.56,True
four,2.4,Nevada,2001,,False
five,2.9,Nevada,2002,,False
six,3.2,Nevada,2003,-8.5,False


New columns cannot be created with attribute syntax such as `df2.eastern = ...`; use the dict-like form `df2['eastern'] = ...` instead.

In [97]:
df2

Unnamed: 0,pop,state,year,debt,eastern
one,1.5,Ohio,2000,,True
two,1.7,Ohio,2001,-1.5,True
three,3.6,Ohio,2002,4.56,True
four,2.4,Nevada,2001,,False
five,2.9,Nevada,2002,,False
six,3.2,Nevada,2003,-8.5,False


In [99]:
del df2['eastern']  # removes the column from df2 in place

In [100]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,
two,1.7,Ohio,2001,-1.5
three,3.6,Ohio,2002,4.56
four,2.4,Nevada,2001,
five,2.9,Nevada,2002,
six,3.2,Nevada,2003,-8.5


Another common form of data is a nested dict of dicts

In [101]:
# Nested dict: outer keys are state names, inner keys are years.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [104]:
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

If the nested dict is passed to the DataFrame, pandas will interpret the outer dict keys as the columns and the inner keys as the row indices

In [106]:
df4 = pd.DataFrame(pop, index=[2000,2001,2002])

In [109]:
df4

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


You can transpose the DataFrame (swap rows and columns) with similar syntax to a NumPy array

In [111]:
df4.transpose()

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [112]:
df4.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [114]:
# Dict of Series: drop the last row of each column by position. df4 is
# indexed by integer years, so .iloc makes the positional intent explicit
# instead of relying on how a bare integer slice is interpreted.
dict_df = {
    'Ohio': df4['Ohio'].iloc[:-1],
    'Nevada': df4['Nevada'].iloc[:-1],
}

In [119]:
pd.DataFrame(dict_df, columns=['Nevada', 'Ohio'])

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [120]:
df4

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [123]:
df4.index.name = 'Year'
df4.columns.name = "States"

In [124]:
df4

States,Nevada,Ohio
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [129]:
df4.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [130]:
df2

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,
two,1.7,Ohio,2001,-1.5
three,3.6,Ohio,2002,4.56
four,2.4,Nevada,2001,
five,2.9,Nevada,2002,
six,3.2,Nevada,2003,-8.5


In [131]:
df2.values

array([[1.5, 'Ohio', 2000, nan],
       [1.7, 'Ohio', 2001, -1.5],
       [3.6, 'Ohio', 2002, 4.56],
       [2.4, 'Nevada', 2001, nan],
       [2.9, 'Nevada', 2002, nan],
       [3.2, 'Nevada', 2003, -8.5]], dtype=object)

Table 5-1. Possible data inputs to DataFrame constructor

## Index Objects

In [132]:
# Small labelled Series used to inspect its Index object below.
obj = pd.Series(range(3), index=list('abc'))

In [133]:
obj

a    0
b    1
c    2
dtype: int64

In [135]:
index = obj.index

In [136]:
index

Index(['a', 'b', 'c'], dtype='object')

In [138]:
index[:-1]

Index(['a', 'b'], dtype='object')

Index objects are immutable and thus can’t be modified by the user.

In [139]:
index[2]

'c'

In [144]:
"index[2] = 'z'" #Type Error

"index[2] = 'z'"

Immutability makes it safer to share Index objects among data structures

In [157]:
labels = pd.Index(np.arange(3))

In [158]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [159]:
obj2 = pd.Series([1.5, -6.5, +7.4], index=labels)

In [160]:
obj2

0    1.5
1   -6.5
2    7.4
dtype: float64

In [162]:
obj2.index is labels

True

In [164]:
df4

States,Nevada,Ohio
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [165]:
df4.columns

Index(['Nevada', 'Ohio'], dtype='object', name='States')

In [167]:
'Nevada' in df4.columns

True

In [168]:
2003 in df4.columns

False

Unlike Python sets, a pandas Index can contain duplicate labels

In [169]:
# Index holding a repeated label ('a' appears twice).
dup_index = pd.Index(list('aabc'))

In [172]:
ser1 = pd.Series(data=[1, 2, 2, 5], index=dup_index)
ser1

a    1
a    2
b    2
c    5
dtype: int64

Selections with duplicate labels will select all occurrences of that label

In [176]:
# ser1['a'] returns every row labelled 'a'; take the first one by position.
# .iloc is used because a bare integer key on a string-labelled Series is
# deprecated positional fallback.
ser1['a'].iloc[0]

1

In [177]:
# ser1['a'] returns every row labelled 'a'; take the second one by position.
# .iloc is used because a bare integer key on a string-labelled Series is
# deprecated positional fallback.
ser1['a'].iloc[1]

2

See Table 5-2 for some Index methods and properties

# 5.2 Essential Functionality