# SERIES

A Series is a one-dimensional array-like object containing a sequence of values and its corresponding  index. 

In [3]:
import pandas as pd
obj = pd.Series([4, 7, -5, 3])  
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.array              #shows the array representation of the Series

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [5]:
obj.index              #shows the index object of the Series

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj2 = pd.Series([4,7,-5,3], index=['a', 'b', 'c', 'd'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [7]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
obj2['a']    

4

prints out the value at index 'a'

In [9]:
obj2['c'] = -10
obj2[["a", "c", "d"]]

a     4
c   -10
d     3
dtype: int64

In [10]:
obj2[obj2 > 0]

a    4
b    7
d    3
dtype: int64

In [11]:
obj2 * 2

a     8
b    14
c   -20
d     6
dtype: int64

In [12]:
import numpy as np
np.exp(obj2)

a      54.598150
b    1096.633158
c       0.000045
d      20.085537
dtype: float64

In [27]:
"b" in obj2

True

In [28]:
"e" in obj2

False

# Changing a Dictionary to a series

In [29]:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [30]:
obj3.to_dict()     # changing a Series into a Dictionary

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [31]:
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

# isna & notna

In [32]:
pd.isna(obj4)    #The isna and notna functions in pandas should be used to detect missing data:

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

same as: 

In [33]:
pd.notna(obj4) 

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

NOTE: Series also has these as instance methods:

In [34]:
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [35]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [36]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [37]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [38]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

# NOTE: 

A Series’s index can be altered in place by assignment:

In [39]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [40]:
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj             

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# DataFrame

In [41]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame                              #The columns are placed according to the order of the keys in data

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [42]:
frame.head()    # the head method selects only the first five rows

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [43]:
frame.tail()     # the tail method selects only the first five rows

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:


In [44]:
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


If you pass a column that isn’t contained in the dictionary, it will appear with missing values in the result: 'Nan'

In [45]:
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [58]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

A column in a DataFrame can be retrieved as a Series either by dictionary-like notation or by using the dot attribute notation:

In [59]:
frame2['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [60]:
frame2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [61]:
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt        1
Name: 1, dtype: object

In [62]:
frame2.iloc[2]

year     2002
state    Ohio
pop       3.6
debt        2
Name: 2, dtype: object

NOTE: Columns can be modified by assignment. For example, the empty debt column could be assigned a scalar value or an array of values:



In [63]:
frame2['debt'] = 26
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,26
1,2001,Ohio,1.7,26
2,2002,Ohio,3.6,26
3,2001,Nevada,2.4,26
4,2002,Nevada,2.9,26
5,2003,Nevada,3.2,26


In [64]:
frame2['debt'] = np.arange(6)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0
1,2001,Ohio,1.7,1
2,2002,Ohio,3.6,2
3,2001,Nevada,2.4,3
4,2002,Nevada,2.9,4
5,2003,Nevada,3.2,5


val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
frame2["debt"] = val
frame2

In [65]:
frame2["eastern"] = frame2["state"] == "Ohio"
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,0,True
1,2001,Ohio,1.7,1,True
2,2002,Ohio,3.6,2,True
3,2001,Nevada,2.4,3,False
4,2002,Nevada,2.9,4,False
5,2003,Nevada,3.2,5,False


# Del Method

In [66]:
del frame2["eastern"]
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

A nexted Dictionary of Dictonaries

In [67]:
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6}, "Nevada": {2001: 2.4, 2002: 2.9}}

If the nested dictionary is passed to the DataFrame, pandas will interpret 
the outer dictionary keys as the columns, and the inner keys as the row indices:

In [69]:
frame3 = pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [70]:
pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [71]:
pdata = {"Ohio": frame3["Ohio"][:-1], "Nevada": frame3["Nevada"][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [73]:
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


NOTE:  DataFrame's to_numpy method returns the data contained in the DataFrame as a two-dimensional ndarray:

In [74]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0
1,2001,Ohio,1.7,1
2,2002,Ohio,3.6,2
3,2001,Nevada,2.4,3
4,2002,Nevada,2.9,4
5,2003,Nevada,3.2,5


In [75]:
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, 0],
       [2001, 'Ohio', 1.7, 1],
       [2002, 'Ohio', 3.6, 2],
       [2001, 'Nevada', 2.4, 3],
       [2002, 'Nevada', 2.9, 4],
       [2003, 'Nevada', 3.2, 5]], dtype=object)

# Index Objects


In [86]:
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [87]:
index[1:]

Index(['b', 'c'], dtype='object')

In [92]:
obj2 = pd.Series([1.5, -2.5, 0], index=np.arange(3))
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [93]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [94]:
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [95]:
"Ohio" in frame3.columns

True

In [96]:
'Miami' in frame3.columns

False

# 5.2 Essential Functionality

# Reindexing


In [4]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

Calling reindex on this Series rearranges the data according to the new index, 
introducing missing values if any index values were not already present:

In [6]:
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [7]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [11]:
obj3.reindex(np.arange(6), method="ffill")   
#The method option allows us to do this, using a method such as ffill, which forward-fills the values:

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

NOTE: With DataFrame, reindex can alter the (row) index, columns, or both. When passed only a sequence, it reindexes the rows in the result:

In [13]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "c", "d"], columns=["Ohio", "Texas", "California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [18]:
frame.reindex(index=["a", "b", "c", "d"])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [21]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


Note: you can also reindex by using the loc operator
This works only if all of the new index labels already exist in the DataFrame (whereas reindex will insert missing data for new labels):

In [29]:
frame.loc[["a", "d", "c"], ["California", "Texas"]]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


# Dropping Entries from an Axis


In [63]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [64]:
new_obj = obj.drop('c')   #the .drop() works the same as .reindex(). They both can authomatically take index arguments, but if you want to 
                         # use it on the column, you'd have to assign columns = [] in it. 
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [66]:
new_obj = obj.drop('e')   #the .drop() works the same as .reindex(). They both can authomatically take index arguments. 
                         # You can also drop values from the columns by passing x.drop(columns= ['y'])
new_obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [34]:
obj.drop(['c', 'd'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [170]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), columns = ["one", "two", "three", "four"],
                    index = ["Ohio", "Colorado", "Utah", "New York"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [172]:
data.drop(index=["Colorado", "Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
data.drop(columns=['two'])   # if you want to drop a column

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


You can also drop values from the columns by passing axis=1 (which is like NumPy) or axis="columns":

In [44]:
data.drop('one', axis=1)

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [45]:
data.drop(columns = ['two','four'])

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


OR:

In [46]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


# Indexing, Selection, and Filtering

In [47]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [50]:
obj["b"]

1.0

In [67]:
obj[1]

  obj[1]


1.0

In [52]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [56]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [59]:
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [60]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

NB: While you can select data by label this way, the preferred way to select index values is with the special loc operator:

In [69]:
obj.loc[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [74]:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj1

2    1
0    2
1    3
dtype: int64

In [75]:
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])
obj2

a    1
b    2
c    3
dtype: int64

In [76]:
obj1[[0,1,2]]

0    2
1    3
2    1
dtype: int64

NOTE: When using loc, the expression obj.loc[[0, 1, 2]] will fail when the index does not contain integers:

In [84]:
obj1.iloc[[0, 1, 2]]

2    1
0    2
1    3
dtype: int64

In [86]:
obj2.iloc[[0, 1, 2]]

a    1
b    2
c    3
dtype: int64

In [88]:
obj2.loc['b':'c'] = 5
obj2

a    1
b    5
c    5
dtype: int64

In [147]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=["Ohio", "Colorado", "Utah", "New York"], 
                                                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [148]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [149]:
data[["three", "one"]]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


Indexing like this has a few special cases. The first is slicing or selecting data with a Boolean array:

In [150]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [151]:
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [152]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [153]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection on DataFrame with loc and iloc

In [154]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [155]:
data.loc["Colorado"]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [156]:
data.loc[["Colorado", "New York"]]   #To select multiple roles, pass a sequence of labels:

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


You can combine both row and column selection in loc by separating the selections with a comma:



In [157]:
data.loc["Colorado", ["two", "three"]]

two      5
three    6
Name: Colorado, dtype: int64

In [158]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [159]:
data.iloc[[2, 1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [160]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [161]:
ser = pd.Series(np.arange(3.), index=["a", "b", "c"])
ser

a    0.0
b    1.0
c    2.0
dtype: float64

 Note : The loc (for labels) or iloc (for integers)

In [162]:
ser.iloc[-1]   # prints the item at the 2nd index

2.0

In [163]:
ser[:2]

a    0.0
b    1.0
dtype: float64

In [164]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [165]:
data.loc[:, "one"] = 1
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [166]:
data.iloc[2] = 5
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,5,5,5,5
New York,1,13,14,15


In [167]:
data.loc[data["four"] > 5] = 3
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [168]:
data.loc[data.three == 5, 'three'] = 6
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


# Arithmetic and Data Alignment


In [173]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [174]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [175]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [176]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [177]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [178]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


If you add DataFrame objects with no column or row labels in common, the result will contain all nulls:

In [179]:
df1 = pd.DataFrame({"A": [1, 2]})
df1

Unnamed: 0,A
0,1
1,2


In [180]:
df2 = pd.DataFrame({"B": [3, 4]})
df2

Unnamed: 0,B
0,3
1,4


In [181]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


# Arithmetic methods with fill values


In [182]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [183]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [186]:
df2.loc[1, "b"] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [187]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [188]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [189]:
df1.rdiv(1)     # has arguments reversed and divided by 1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [191]:
df1.reindex(columns=df2.columns, fill_value=0)


Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


# Operations between DataFrame and Series


In [194]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [195]:
arr[0]

array([0., 1., 2., 3.])

In [196]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [13]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [14]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [15]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [16]:
series2 = pd.Series(np.arange(3), index=["b", "e", "f"])
series2

b    0
e    1
f    2
dtype: int64

In [17]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [18]:
series3 = frame["d"]
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [19]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [23]:
frame.sub(series3, axis="index")

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


# Function Application and Mapping


In [24]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)),columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0.363156,-0.626219,0.212324
Ohio,0.019639,0.233911,-0.28414
Texas,-0.354842,-0.578264,0.813487
Oregon,-0.038499,-0.216107,1.042834


In [25]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.363156,0.626219,0.212324
Ohio,0.019639,0.233911,0.28414
Texas,0.354842,0.578264,0.813487
Oregon,0.038499,0.216107,1.042834


NOTE: Another frequent operation is applying a function on one-dimensional arrays to each column or row. 
DataFrame’s apply method does exactly this:

In [26]:
def f1(x):
    return x.max() - x.min()

In [27]:
frame.apply(f1)

b    0.717998
d    0.860130
e    1.326974
dtype: float64

In [28]:
frame.apply(f1, axis="columns")   # If you pass axis="columns" to apply, the function will be invoked once per row instead

Utah      0.989374
Ohio      0.518051
Texas     1.391751
Oregon    1.258941
dtype: float64

In [29]:
def f2(x):
    return pd.Series([x.min(), x.max()], index=["min", "max"])

In [30]:
frame.apply(f2)

Unnamed: 0,b,d,e
min,-0.354842,-0.626219,-0.28414
max,0.363156,0.233911,1.042834


In [221]:
def my_format(x):
    return f"{x:.2f}"

In [227]:
frame.applymap(my_format)

  frame.applymap(my_format)


Unnamed: 0,b,d,e
Utah,-0.07,-0.6,0.17
Ohio,0.03,-0.43,0.68
Texas,-0.8,0.28,-1.2
Oregon,-0.31,-0.2,0.02


The reason for the name applymap is that Series has a map method for applying an element-wise function:

In [223]:
frame["e"].map(my_format)

Utah       0.17
Ohio       0.68
Texas     -1.20
Oregon     0.02
Name: e, dtype: object

# Sorting and Ranking


In [31]:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [36]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

With a DataFrame, you can sort by index on either axis:

In [37]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                   columns=["d", "a", "b", "c"])
frame 

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [38]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [42]:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


The data is sorted in ascending order by default but can be sorted in descending order, too:

In [43]:
frame.sort_index(axis="columns", ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


To sort a Series by its values, use its sort_values method:



In [46]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [9]:
import numpy as np
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [11]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [12]:
frame.sort_values('b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [13]:
frame.sort_values(["a", "b"])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [14]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [15]:
obj.rank()  #assigns ranks to values in the Series, where the lowest value gets rank 1

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [16]:
obj.rank(method="first")   # Ranks can also be assigned according to the order in which they’re observed in the data

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [29]:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [18]:
frame.rank(axis="columns")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


# Axis Indexes with Duplicate Labels


In [19]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [22]:
obj.index.is_unique

False

In [23]:
obj['a']

a    0
a    1
dtype: int64

In [24]:
df = pd.DataFrame(np.random.standard_normal((5, 3)), index=["a", "a", "b", "b", "c"])
df

Unnamed: 0,0,1,2
a,-0.347813,1.940197,-1.526067
a,0.566162,0.13142,-0.090132
b,0.723241,-1.263464,-0.960964
b,-0.362218,-0.266276,0.719489
c,-0.193768,-0.78713,0.376003


# 5.3 Summarizing and Computing Descriptive Statistics


In [48]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"], columns=["one", "two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [55]:
df.sum()

one    9.25
two   -5.80
dtype: float64

Passing axis="columns" or axis=1 sums across the columns instead:



In [27]:
df.sum(axis="columns")   # calculates the sum of each row (across columns)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [30]:
df.mean(axis="columns")    # same as below

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [31]:
df.mean(axis=1)  

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [32]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [33]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


# Unique Values, Value Counts, and Membership



In [56]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [60]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [53]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [43]:
mask = obj.isin(["b", "c"])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [44]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [54]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4], "Qu2": [2, 3, 1, 2, 3], "Qu3": [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [59]:
data["Qu1"].value_counts().sort_index()    # computing the value counts for a single column

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [61]:
data = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [0, 0, 1, 0, 0]})
data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [62]:
data.value_counts()

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64

# Mini Project

In [53]:
import pandas as pd
import numpy as np

# Create a small DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 28],
    'Score': [85.0, 92.5, 88.0, 76.0, 90.0]
}

df = pd.DataFrame(data, index=['a','b','c','d','e'])
print(df)


      Name  Age  Score
a    Alice   25   85.0
b      Bob   30   92.5
c  Charlie   35   88.0
d    David   40   76.0
e      Eva   28   90.0


# Basic Numpy Statistics

In [54]:
df['Name'] = np.arange(5)
df

Unnamed: 0,Name,Age,Score
a,0,25,85.0
b,1,30,92.5
c,2,35,88.0
d,3,40,76.0
e,4,28,90.0


In [55]:
np.mean(df['Score'])    # mean score

86.3

In [56]:
np.sum(df['Score'])     # sum of score

431.5

In [57]:
np.std(df['Score'])    # standard deviation of score

5.706137047074843

# Filter, Sort, Summarize

In [58]:
df[df['Age'] < 30]

Unnamed: 0,Name,Age,Score
a,0,25,85.0
e,4,28,90.0


In [59]:
df.sort_values(['Score'], ascending = False)

Unnamed: 0,Name,Age,Score
b,1,30,92.5
e,4,28,90.0
c,2,35,88.0
a,0,25,85.0
d,3,40,76.0
