In [2]:
import numpy as np
import pandas as pd

In [3]:
# series as a specialized dictionary
population_dict = {
    "California": 39538223,
    "Texas": 29145505,
    "Florida": 21538187,
    "New York": 20201249,
    "Pennsylvania": 13002700,
}
population = pd.Series(population_dict)
population

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [4]:
# dictionary actions can be performed on series
population["California"]

39538223

In [5]:
# can slice series
population["California":"Florida"]

California    39538223
Texas         29145505
Florida       21538187
dtype: int64

In [6]:
# dataframe is a 2D array with a sequence of aligned 1D columns(series)
area_dict = {
    "California": 423967,
    "Texas": 695662,
    "Florida": 170312,
    "New York": 141297,
    "Pennsylvania": 119280,
}
area = pd.Series(area_dict)
area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64

In [7]:
# now that we have this along with population, we can use a dictionary to construct a single 2D object
states = pd.DataFrame({"population": population, "area": area})
states

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [8]:
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [10]:
states.columns

Index(['population', 'area'], dtype='object')

In [11]:
# can create a dataframe from a Numpy structured array
A = np.zeros(3, dtype=[("A", "i8"), ("B", "f8")])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [12]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [13]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [15]:
# index is an immutable array
ind[1]

3

In [16]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [17]:
# index objects also have many of the attributes familiar from NumPy arrays
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [19]:
# an Index cannot be modified via the normal means
# ind[1] = 0  # this would raise a TypeError because Index objects are immutable

In [20]:
# index object follows many of the conventions used by python's built-in set data structure
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [21]:
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [22]:
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

In [23]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])

data["b"]

0.5

In [24]:
"a" in data

True

In [25]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [26]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [28]:
# can modify series in place or extend it
data["e"] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [29]:
# series as one-dimensional array
# slicing by explicit index
data["a":"c"]

a    0.25
b    0.50
c    0.75
dtype: float64

In [30]:
# slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [31]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [33]:
# fancy indexing
data[["a", "e"]]
# of these, slicing may be the source of the most confusion: when slicing with an explicit index (e.g. data["a":"c"]), the final index is included in the slice, while when slicing with an implicit index (e.g. data[0:2]), the final index is excluded from the slice

a    0.25
e    1.25
dtype: float64

In [34]:
# indexers: loc and iloc
# if your series has an explicit integer index, an indexing operation such as data[1] will use the explicit indices, while a slicing operation like data[1:3] will use the implicit python-style index
data = pd.Series(["a", "b", "c"], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [35]:
# explicit index when indexing
data[1]

'a'

In [37]:
# implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

In [38]:
# the loc attribute allows indexing and slicing that always references the explicit index
data.loc[1]

'a'

In [39]:
data.loc[1:3]

1    a
3    b
dtype: object

In [40]:
# the iloc attribute allows indexing and slicing that always references the implicit python-style index
data.iloc[1]

'b'

In [41]:
data.iloc[1:3]
# in python, explicit is better than implicit: loc always references the explicit index labels, while iloc always references the implicit python-style positional index

3    b
5    c
dtype: object

In [43]:
# a dataframe acts in many ways like a two-dimensional or structured array, and in other ways like a dictionary of series structures sharing the same index; this is helpful to keep in mind as we explore data manipulation
area = pd.Series(
    {
        "California": 423967,
        "Texas": 695662,
        "Florida": 170312,
        "New York": 141297,
        "Pennsylvania": 119280,
    }
)
population = pd.Series(
    {
        "California": 39538223,
        "Texas": 29145505,
        "Florida": 21538187,
        "New York": 20201249,
        "Pennsylvania": 13002700,
    }
)
data = pd.DataFrame({"area": area, "population": population})
data

Unnamed: 0,area,population
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187
New York,141297,20201249
Pennsylvania,119280,13002700


In [45]:
data["area"]

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [46]:
# we can use attribute-style access with column names that are strings
data.area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [47]:
# this does not work in all cases: it fails if the column names are not strings or if they conflict with methods of the dataframe. for example, if we had named the population column pop, then data.pop would not refer to that column but rather to the dataframe's pop method
# you should avoid the temptation to try column assignment via attribute (i.e. use data['pop'] = z rather than data.pop = z)
# like with the series objects, this dictionary-style syntax can also be used to modify the object in place
data["density"] = data["population"] / data["area"]
data

Unnamed: 0,area,population,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [48]:
data.values

array([[4.23967000e+05, 3.95382230e+07, 9.32577842e+01],
       [6.95662000e+05, 2.91455050e+07, 4.18960717e+01],
       [1.70312000e+05, 2.15381870e+07, 1.26463121e+02],
       [1.41297000e+05, 2.02012490e+07, 1.42970120e+02],
       [1.19280000e+05, 1.30027000e+07, 1.09009893e+02]])

In [49]:
data.T

Unnamed: 0,California,Texas,Florida,New York,Pennsylvania
area,423967.0,695662.0,170312.0,141297.0,119280.0
population,39538220.0,29145500.0,21538190.0,20201250.0,13002700.0
density,93.25778,41.89607,126.4631,142.9701,109.0099


In [50]:
# when it comes to indexing of a dataframe passing a single index to an array accesses a row
data.values[0]

array([4.23967000e+05, 3.95382230e+07, 9.32577842e+01])

In [51]:
# and passing a single index to a dataframe accesses a column
data["area"]

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [52]:
data.iloc[:3, :2]

Unnamed: 0,area,population
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [53]:
# we can use loc indexer with the names of the index and columns
data.loc[:"Florida", :"population"]

Unnamed: 0,area,population
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [54]:
# any of the familiar numpy style data access patterns can be used within these indexers
# we can combine masking and fancy indexing as in the following
data.loc[data.density > 120, ["population", "density"]]

Unnamed: 0,population,density
Florida,21538187,126.463121
New York,20201249,142.97012


In [55]:
# any of these indexing conventions may also be used to set or modify values
data.iloc[0, 2] = 90
data

Unnamed: 0,area,population,density
California,423967,39538223,90.0
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [56]:
# additional indexing conventions
# while indexing refers to columns, slicing refers to rows
data["Florida":"New York"]

Unnamed: 0,area,population,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [57]:
# slices can also refer to rows by number rather than by index
data[1:3]

Unnamed: 0,area,population,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121


In [58]:
# direct masking operations are interpreted row-wise rather than column-wise
data[data.density > 120]

Unnamed: 0,area,population,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


Operations in pandas

In [60]:
# one of the strengths of numpy is that it allows us to perform quick operations on entire arrays of data. pandas inherits much of this functionality from numpy and this is one of the reasons why it is well suited for data science
# pandas includes a couple of useful methods for this purpose: idxmin() and idxmax() which return the index value where the minimum or maximum values are attained respectively
# INDEX Preservation
rng = np.random.default_rng(42)
ser = pd.Series(rng.integers(0, 10, 4))
ser

0    0
1    7
2    6
3    4
dtype: int64

In [61]:
df = pd.DataFrame(rng.integers(0, 10, (3, 4)), columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
0,4,8,0,6
1,2,0,5,9
2,7,7,7,7


In [62]:
# if we apply a Numpy ufunc on either of these objects, the result will be another pandas object with the indices preserved
np.exp(ser)

0       1.000000
1    1096.633158
2     403.428793
3      54.598150
dtype: float64

In [63]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,1.224647e-16,-2.449294e-16,0.0,-1.0
1,1.0,0.0,-0.707107,0.707107
2,-0.7071068,-0.7071068,-0.707107,-0.707107


Ufuncs: Index Alignment

In [66]:
# for binary operations on two series or dataframe objects, pandas will align indices in the process of performing the operation
area = pd.Series(
    {"Alaska": 1723337, "Texas": 695662, "California": 423967}, name="area"
)
population = pd.Series(
    {"California": 39538223, "Texas": 29145505, "Florida": 21538187}, name="population"
)
population / area
# the resulting array contains the union of indices of the two input arrays

Alaska              NaN
California    93.257784
Florida             NaN
Texas         41.896072
dtype: float64

In [67]:
area.index.union(population.index)

Index(['Alaska', 'California', 'Florida', 'Texas'], dtype='object')

In [68]:
# any item for which one or the other does not have an entry is marked with NaN, or "Not a Number", which is how Pandas marks missing data
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [69]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [74]:
# index alignment in dataframes
A = pd.DataFrame(rng.integers(0, 20, (2, 2)), columns=["a", "b"])
A

Unnamed: 0,a,b
0,1,11
1,17,1


In [75]:
B = pd.DataFrame(rng.integers(0, 10, (3, 3)), columns=["b", "a", "c"])
B

Unnamed: 0,b,a,c
0,8,8,2
1,6,1,7
2,7,3,0


In [76]:
A + B

Unnamed: 0,a,b,c
0,9.0,19.0,
1,18.0,7.0,
2,,,


In [77]:
# indices are aligned correctly irrespectively of their order in the two objects, and indices in the result are sorted
A.add(B, fill_value=A.values.mean())

Unnamed: 0,a,b,c
0,9.0,19.0,9.5
1,18.0,7.0,14.5
2,10.5,14.5,7.5


In [78]:
# operations between dataframes and series
A = rng.integers(10, size=(3, 4))
A

array([[9, 4, 8, 6],
       [7, 7, 1, 3],
       [4, 4, 0, 5]])

In [79]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-2,  3, -7, -3],
       [-5,  0, -8, -1]])

In [80]:
df = pd.DataFrame(A, columns=list("QRST"))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-2,3,-7,-3
2,-5,0,-8,-1


In [81]:
df.subtract(df["R"], axis=0)

Unnamed: 0,Q,R,S,T
0,5,0,4,2
1,0,0,-6,-4
2,0,0,-4,1


In [82]:
halfrow = df.iloc[0, ::2]
halfrow

Q    9
S    8
Name: 0, dtype: int64

In [83]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-2.0,,-7.0,
2,-5.0,,-8.0,


Missing Data in pandas

In [85]:
# there are a couple of ways to deal with missing data: the masking approach and the sentinel approach.
# for some data types pandas uses None as a sentinel value
# if we pass None to NumPy arrays
vals1 = np.array([1, None, 2, 3])
vals1

array([1, None, 2, 3], dtype=object)

In [86]:
# this dtype=object means that the best common type representation numpy could infer for the contents of the array is that they are python objects. the downside is that operations on the data will be done at the python level, with more overhead
%timeit np.arange(1E6, dtype='int').sum()

1.76 ms ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [87]:
%timeit np.arange(1E6, dtype='object').sum()

65.2 ms ± 245 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [89]:
# because python does not support arithmetic operations on None, aggregations like sum or min will generate an error
# vals1.sum() # this will generate an error

In [90]:
# NaN
# the other missing data sentinel, NaN, is different: it is a special float value recognized by all systems that use the standard IEEE floating-point representation
vals2 = np.array([1, np.nan, 3, 4])
vals2

array([ 1., nan,  3.,  4.])

In [91]:
# NaN is a bit like a data virus it infects any other object it touches
print(1 + np.nan)
print(0 * np.nan)

nan
nan


In [92]:
# this means that aggregates over the values are well defined but not always useful
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [93]:
# numpy has NaN aware versions of aggregations that will ignore these missing np.NaN values
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)
# there is no equivalent NaN for integer, strings, or other types

(8.0, 1.0, 4.0)

In [94]:
# NaN and None in Pandas
# Pandas handles both NaN and None interchangeably converting between them as appropriate
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [95]:
# for types that don't have an available sentinel value for missing data, Pandas automatically type-casts when NA values are present. if we set a value in an integer array to np.nan, it will automatically be upcast to a floating point type to accommodate the NA
x = pd.Series(range(2), dtype=int)
x

0    0
1    1
dtype: int64

In [96]:
x[0] = None
x
# in addition to upcasting the integer array to floating point, Pandas automatically converts None to a NaN value

0    NaN
1    1.0
dtype: float64

In [97]:
# pandas nullable Dtypes
pd.Series([1, np.nan, 2, None, pd.NA], dtype="Int32")

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [98]:
# detecting null values
# pandas has isnull() and notnull() methods; either one will return a boolean mask over the data
data = pd.Series([1, np.nan, "hello", None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [99]:
# Boolean masks can be used directly as a series or dataframe index
data[data.notnull()]

0        1
2    hello
dtype: object

In [100]:
# dropping null values: there are the dropna() and fillna() methods
data.dropna()

0        1
2    hello
dtype: object

In [101]:
# for a dataframe there are more options
df = pd.DataFrame([[1, np.nan, 2], [2, 3, 5], [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [102]:
# we cannot drop single values from a dataframe we can only drop full rows or full columns
# by default dropna() will drop all rows in which any null value is present
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [103]:
# you can drop NA values along a different axis: using axis=1 or axis='columns' drops all columns containing a null value
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [104]:
# this drops some good data as well. the how or thresh parameters can be used to control the number of nulls to allow through
# the default is how='any', which drops any row or column containing a null value; how='all' drops only rows or columns that contain all null values
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [105]:
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [106]:
df.dropna(axis="rows", thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [108]:
# filling null values: the fillna() method returns a copy of the array with the null values replaced
# pass inplace=True to fillna() to modify the object in place instead
data = pd.Series([1, np.nan, 2, None, 3], index=list("abcde"), dtype="Int32")
data

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [109]:
data.fillna(0)

a    1
b    0
c    2
d    0
e    3
dtype: Int32

In [110]:
# we can specify a forward-fill to propagate the previous value forward
# ffill() is used instead of fillna(method="ffill") because the `method` keyword of fillna is deprecated in pandas 2.1+
data.ffill()

a    1
b    1
c    2
d    2
e    3
dtype: Int32

In [111]:
# we can also specify a back-fill to propagate the next values backward
# bfill() is used instead of fillna(method="bfill") because the `method` keyword of fillna is deprecated in pandas 2.1+
data.bfill()

a    1
b    2
c    2
d    3
e    3
dtype: Int32

In [112]:
# we can also specify an axis along which the fills should take place
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [114]:
# forward-fill along each row (axis=1), i.e. propagate values from left to right across the columns
# ffill() is used instead of fillna(method="ffill") because the `method` keyword of fillna is deprecated in pandas 2.1+
df.ffill(axis=1)
# if no previous value is available during a forward fill, the NA value remains

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0
