# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

* 1) Creating DataFrames
* 2) Selecting, Creating and Dropping Columns
* 3) Selecting, Creating and Dropping Rows 
* 4) Selecting subset of rows and columns
* 5) Selecting by conditions
* 6) Selecting by data type
* 7) Set New Index
* 8) Reset Index

In [109]:
import pandas as pd
import numpy as np

np.random.seed(101)

## 1) Creating DataFrames

### Using Numpy Array

In [110]:
# Preview only: this call draws from the seeded global generator, so the
# DataFrame built in the next cell is filled with the *following* draws and
# therefore shows different numbers than the array displayed here.
np.random.randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [111]:
# Build a 5x4 frame of standard-normal draws, labelling rows A-E and columns W-Z.
row_labels = list('ABCDE')
col_labels = list('WXYZ')
df = pd.DataFrame(np.random.randn(5, 4), index=row_labels, columns=col_labels)

In [112]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


### Using Dictionary

In [113]:
# Each dictionary key becomes a column; the explicit index labels the three rows.
df2_columns = {
    'A': [1, 2, 3],
    'B': [5, 6, 7],
    'C': [1, 2, 3],
    'D': [10, 20, 30],
}
df2 = pd.DataFrame(df2_columns, index=list('XYZ'))

In [114]:
df2

Unnamed: 0,A,B,C,D
X,1,5,1,10
Y,2,6,2,20
Z,3,7,3,30


## 2) Selecting, Creating and Dropping Columns 

### Selecting Columns

In [115]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [116]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [117]:
df['X']

A    1.693723
B    0.390528
C    0.072960
D   -0.754070
E    1.901755
Name: X, dtype: float64

In [118]:
# Pass a list of column names
df[['Z','W']]

Unnamed: 0,Z,W
A,-1.159119,0.302665
B,0.184502,-0.134841
C,0.329646,0.807706
D,0.484752,-0.497104
E,1.996652,-0.116773


DataFrame columns are just Series:

In [119]:
type(df['W'])

pandas.core.series.Series

### Creating a New Column

In [120]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [121]:
df['my_list'] = [20, 50, 80, 90, 70]

In [122]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
B,-0.134841,0.390528,0.166905,0.184502,50
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


In [123]:
df['my_np_array'] = np.array([200, 500, 800, 900, 700])

In [124]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array
A,0.302665,1.693723,-1.706086,-1.159119,20,200
B,-0.134841,0.390528,0.166905,0.184502,50,500
C,0.807706,0.07296,0.638787,0.329646,80,800
D,-0.497104,-0.75407,-0.943406,0.484752,90,900
E,-0.116773,1.901755,0.238127,1.996652,70,700


In [125]:
# Assigning a Series aligns on index labels, not on position:
# row 'A' receives 20 and row 'B' receives 10, regardless of the order given here.
labelled_values = pd.Series([10, 20, 30, 40, 50], index=['B', 'A', 'E', 'D', 'C'])
df['my_ser'] = labelled_values

In [126]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20
B,-0.134841,0.390528,0.166905,0.184502,50,500,10
C,0.807706,0.07296,0.638787,0.329646,80,800,50
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40
E,-0.116773,1.901755,0.238127,1.996652,70,700,30


In [127]:
df['new'] = df['W'] + df['Y']

In [128]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser,new
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,500,10,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,800,50,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,700,30,0.121354


In [129]:
df['plus_5'] = df['my_ser'] + 5
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser,new,plus_5
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20,-1.40342,25
B,-0.134841,0.390528,0.166905,0.184502,50,500,10,0.032064,15
C,0.807706,0.07296,0.638787,0.329646,80,800,50,1.446493,55
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40,-1.44051,45
E,-0.116773,1.901755,0.238127,1.996652,70,700,30,0.121354,35


In [130]:
# Boolean column: True where 'my_np_array' exceeds 600.
# NOTE(review): the column is called 'gt_85' although the threshold is 600;
# the name is left as-is because a later cell drops the column by this name.
df['gt_85'] = df['my_np_array'] > 600
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,30,0.121354,35,True


### Dropping Columns

In [131]:
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,10,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,50,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,30,35,True


In [132]:
# Not inplace unless specified!
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,30,0.121354,35,True


In [133]:
df.drop('new', axis=1, inplace=True)

In [134]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,my_ser,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,20,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,10,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,50,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,40,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,30,35,True


In [135]:
# Drop several columns at once. `columns=` already identifies the axis, so the
# redundant `axis=1` (ignored by pandas when `columns=` is given) is omitted.
df.drop(columns=['my_ser', 'my_np_array', 'gt_85', 'plus_5'], inplace=True)

In [136]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
B,-0.134841,0.390528,0.166905,0.184502,50
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


## 3) Selecting, Creating and Dropping Rows 

### Selecting Rows

In [137]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
B,-0.134841,0.390528,0.166905,0.184502,50
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


In [138]:
df.loc['A']

W           0.302665
X           1.693723
Y          -1.706086
Z          -1.159119
my_list    20.000000
Name: A, dtype: float64

In [139]:
df.loc['D']

W          -0.497104
X          -0.754070
Y          -0.943406
Z           0.484752
my_list    90.000000
Name: D, dtype: float64

Or select based on position instead of label:

In [140]:
df.iloc[3]

W          -0.497104
X          -0.754070
Y          -0.943406
Z           0.484752
my_list    90.000000
Name: D, dtype: float64

In [141]:
df.loc[['A', 'C']]

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
C,0.807706,0.07296,0.638787,0.329646,80


In [142]:
df.iloc[[3, 4]]

Unnamed: 0,W,X,Y,Z,my_list
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


In [143]:
df.iloc[2:5]

Unnamed: 0,W,X,Y,Z,my_list
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


In [144]:
df.loc['A']

W           0.302665
X           1.693723
Y          -1.706086
Z          -1.159119
my_list    20.000000
Name: A, dtype: float64

In [145]:
df.loc['A']['W']

0.3026654485851825

In [146]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [147]:
df['W'].loc['A']

0.3026654485851825

In [148]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
B,-0.134841,0.390528,0.166905,0.184502,50
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


### Creating a New Row

In [149]:
df.loc['V'] = [10, 20, 50, 80, 90]

In [150]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
B,-0.134841,0.390528,0.166905,0.184502,50
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70
V,10.0,20.0,50.0,80.0,90


In [151]:
# The sum of two rows is a float Series; assigning it as the new row 'H'
# upcasts the integer 'my_list' column to float.
row_sum = df.loc['A'] + df.loc['B']
df.loc['H'] = row_sum

In [152]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0
H,0.167825,2.084251,-1.539181,-0.974618,70.0


### Dropping Rows

In [153]:
df.drop('H', axis=0)

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


In [154]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0
H,0.167825,2.084251,-1.539181,-0.974618,70.0


In [155]:
df.drop('H', axis=0, inplace=True)

In [156]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


## 4) Selecting subset of rows and columns

In [157]:
df.loc['B']['Y']

0.16690463609281317

In [158]:
df.loc['B', 'Y']

0.16690463609281317

In [159]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [160]:
df.loc[['A','B']][['W', 'Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [161]:
df[['W', 'X']].loc[['A','B']]

Unnamed: 0,W,X
A,0.302665,1.693723
B,-0.134841,0.390528


## 5) Selecting by conditions

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [162]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


In [163]:
df > 0

Unnamed: 0,W,X,Y,Z,my_list
A,True,True,False,False,True
B,False,True,True,True,True
C,True,True,True,True,True
D,False,False,False,True,True
E,False,True,True,True,True
V,True,True,True,True,True


In [164]:
df[df > 0]

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,,,20.0
B,,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,,,,0.484752,90.0
E,,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


In [165]:
df['W'] > 0

A     True
B    False
C     True
D    False
E    False
V     True
Name: W, dtype: bool

In [166]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
C,0.807706,0.07296,0.638787,0.329646,80.0
V,10.0,20.0,50.0,80.0,90.0


In [167]:
df[df['W'] > 0]['Y']

A    -1.706086
C     0.638787
V    50.000000
Name: Y, dtype: float64

In [168]:
df[df['W'] > 0][['Y','X']]

Unnamed: 0,Y,X
A,-1.706086,1.693723
C,0.638787,0.07296
V,50.0,20.0


In [169]:
# Same selection as one long chain, split into named steps:
# 1) keep rows with positive W, 2) keep columns Y and X,
# 3) keep rows A and V, 4) take column X.
positive_w_rows = df[df['W'] > 0]
yx_columns = positive_w_rows[['Y', 'X']]
yx_columns.loc[['A', 'V']]['X']

A     1.693723
V    20.000000
Name: X, dtype: float64

In [170]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


**For two conditions you can use | and & with parentheses:**

In [171]:
(df['W'] > 0) & (df['Y'] < 1)

A     True
B    False
C     True
D    False
E    False
V    False
dtype: bool

In [172]:
(df['W'] > 0) | (df['Y'] < 1)

A    True
B    True
C    True
D    True
E    True
V    True
dtype: bool

In [173]:
# `~` binds tighter than `&`, so only the first condition is negated:
# rows where W is NOT positive AND Y is below 1.
(~(df['W'] > 0)) & (df['Y'] < 1)

A    False
B     True
C    False
D     True
E     True
V    False
dtype: bool

In [174]:
df[(df['W'] > 0) & (df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
C,0.807706,0.07296,0.638787,0.329646,80.0


In [175]:
df[(df['W'] < 0) | ~(df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z,my_list
B,-0.134841,0.390528,0.166905,0.184502,50.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


**isin**

In [176]:
df['my_list'].isin([20, 70])

A     True
B    False
C    False
D    False
E     True
V    False
Name: my_list, dtype: bool

In [177]:
df[df['my_list'].isin([20, 70])]

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
E,-0.116773,1.901755,0.238127,1.996652,70.0


## 6) Selecting by data type

**To convert columns to the best possible dtypes, use `convert_dtypes()`; to cast a column to a specific dtype, use `astype()`.**

In [178]:
# One column per dtype family, deliberately stored with generic dtypes
# (object / float64); NaN marks the missing entries.
df3_data = {
    "integers": pd.Series([1, 2, 3], dtype="int32"),
    "strings": pd.Series(["x", "y", "z"], dtype=object),
    "booleans": pd.Series([True, False, np.nan], dtype=object),
    "strings_2": pd.Series(["h", "i", np.nan], dtype=object),
    "integers_2": pd.Series([10, np.nan, 20], dtype="float64"),
    "floats": pd.Series([np.nan, 100.5, 200], dtype="float64"),
    "dates": pd.Series(['30/7/1993 12:26:13'] * 3, dtype=object),
}
df3 = pd.DataFrame(df3_data)

In [179]:
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10.0,,30/7/1993 12:26:13
1,2,y,False,i,,100.5,30/7/1993 12:26:13
2,3,z,,,20.0,200.0,30/7/1993 12:26:13


In [180]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      int32  
 1   strings     3 non-null      object 
 2   booleans    2 non-null      object 
 3   strings_2   2 non-null      object 
 4   integers_2  2 non-null      float64
 5   floats      2 non-null      float64
 6   dates       3 non-null      object 
dtypes: float64(2), int32(1), object(4)
memory usage: 288.0+ bytes


In [181]:
df3 = df3.convert_dtypes()
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10.0,,30/7/1993 12:26:13
1,2,y,False,i,,100.5,30/7/1993 12:26:13
2,3,z,,,20.0,200.0,30/7/1993 12:26:13


In [182]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      Int32  
 1   strings     3 non-null      string 
 2   booleans    2 non-null      boolean
 3   strings_2   2 non-null      string 
 4   integers_2  2 non-null      Int64  
 5   floats      2 non-null      Float64
 6   dates       3 non-null      string 
dtypes: Float64(1), Int32(1), Int64(1), boolean(1), string(3)
memory usage: 279.0 bytes


In [183]:
df3['integers'] = df3['integers'].astype('float32')

In [184]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      float32
 1   strings     3 non-null      string 
 2   booleans    2 non-null      boolean
 3   strings_2   2 non-null      string 
 4   integers_2  2 non-null      Int64  
 5   floats      2 non-null      Float64
 6   dates       3 non-null      string 
dtypes: Float64(1), Int64(1), boolean(1), float32(1), string(3)
memory usage: 276.0 bytes


In [185]:
df3['integers'] = df3['integers'].astype('int32')

In [186]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      int32  
 1   strings     3 non-null      string 
 2   booleans    2 non-null      boolean
 3   strings_2   2 non-null      string 
 4   integers_2  2 non-null      Int64  
 5   floats      2 non-null      Float64
 6   dates       3 non-null      string 
dtypes: Float64(1), Int64(1), boolean(1), int32(1), string(3)
memory usage: 276.0 bytes


In [187]:
# The strings are day-first (30/7/1993), so the layout is spelled out explicitly
# rather than left to pandas' format inference.
day_first_format = "%d/%m/%Y %H:%M:%S"
df3['dates'] = pd.to_datetime(df3['dates'], format=day_first_format)
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10.0,,1993-07-30 12:26:13
1,2,y,False,i,,100.5,1993-07-30 12:26:13
2,3,z,,,20.0,200.0,1993-07-30 12:26:13


In [188]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   integers    3 non-null      int32         
 1   strings     3 non-null      string        
 2   booleans    2 non-null      boolean       
 3   strings_2   2 non-null      string        
 4   integers_2  2 non-null      Int64         
 5   floats      2 non-null      Float64       
 6   dates       3 non-null      datetime64[ns]
dtypes: Float64(1), Int64(1), boolean(1), datetime64[ns](1), int32(1), string(2)
memory usage: 276.0 bytes


**Select with data type**

In [189]:
df3.select_dtypes(include=['number'])

Unnamed: 0,integers,integers_2,floats
0,1,10.0,
1,2,,100.5
2,3,20.0,200.0


In [190]:
df3.select_dtypes(include=['int32'])

Unnamed: 0,integers
0,1
1,2
2,3


In [191]:
df3.select_dtypes(include=['int64'])

Unnamed: 0,integers_2
0,10.0
1,
2,20.0


In [192]:
df3.select_dtypes(include=['float32'])

0
1
2


In [193]:
df3.select_dtypes(include=['float64'])

Unnamed: 0,floats
0,
1,100.5
2,200.0


In [194]:
df3.select_dtypes(include=['bool'])

Unnamed: 0,booleans
0,True
1,False
2,


In [195]:
df3.select_dtypes(include=['string'])

Unnamed: 0,strings,strings_2
0,x,h
1,y,i
2,z,


In [196]:
df3.select_dtypes(include=['datetime'])

Unnamed: 0,dates
0,1993-07-30 12:26:13
1,1993-07-30 12:26:13
2,1993-07-30 12:26:13


In [197]:
df3.select_dtypes(exclude=['number'])

Unnamed: 0,strings,booleans,strings_2,dates
0,x,True,h,1993-07-30 12:26:13
1,y,False,i,1993-07-30 12:26:13
2,z,,,1993-07-30 12:26:13


In [198]:
df3.select_dtypes(exclude=['datetime'])

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


## 7) Set New Index

Let's discuss some more features of indexing, including resetting the index or setting it to something else.

In [199]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20.0
B,-0.134841,0.390528,0.166905,0.184502,50.0
C,0.807706,0.07296,0.638787,0.329646,80.0
D,-0.497104,-0.75407,-0.943406,0.484752,90.0
E,-0.116773,1.901755,0.238127,1.996652,70.0
V,10.0,20.0,50.0,80.0,90.0


In [200]:
newind = ['CA', 'NY', 'WY', 'OR', 'CO', 'HO']

In [201]:
df['States'] = newind

In [202]:
df

Unnamed: 0,W,X,Y,Z,my_list,States
A,0.302665,1.693723,-1.706086,-1.159119,20.0,CA
B,-0.134841,0.390528,0.166905,0.184502,50.0,NY
C,0.807706,0.07296,0.638787,0.329646,80.0,WY
D,-0.497104,-0.75407,-0.943406,0.484752,90.0,OR
E,-0.116773,1.901755,0.238127,1.996652,70.0,CO
V,10.0,20.0,50.0,80.0,90.0,HO


In [203]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z,my_list
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20.0
NY,-0.134841,0.390528,0.166905,0.184502,50.0
WY,0.807706,0.07296,0.638787,0.329646,80.0
OR,-0.497104,-0.75407,-0.943406,0.484752,90.0
CO,-0.116773,1.901755,0.238127,1.996652,70.0
HO,10.0,20.0,50.0,80.0,90.0


In [204]:
df

Unnamed: 0,W,X,Y,Z,my_list,States
A,0.302665,1.693723,-1.706086,-1.159119,20.0,CA
B,-0.134841,0.390528,0.166905,0.184502,50.0,NY
C,0.807706,0.07296,0.638787,0.329646,80.0,WY
D,-0.497104,-0.75407,-0.943406,0.484752,90.0,OR
E,-0.116773,1.901755,0.238127,1.996652,70.0,CO
V,10.0,20.0,50.0,80.0,90.0,HO


In [205]:
df.set_index('States', inplace=True, drop=False)

In [206]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,States
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20.0,CA
NY,-0.134841,0.390528,0.166905,0.184502,50.0,NY
WY,0.807706,0.07296,0.638787,0.329646,80.0,WY
OR,-0.497104,-0.75407,-0.943406,0.484752,90.0,OR
CO,-0.116773,1.901755,0.238127,1.996652,70.0,CO
HO,10.0,20.0,50.0,80.0,90.0,HO


In [207]:
df.set_index('Y', inplace=True, drop=False)

In [208]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,States
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-1.706086,0.302665,1.693723,-1.706086,-1.159119,20.0,CA
0.166905,-0.134841,0.390528,0.166905,0.184502,50.0,NY
0.638787,0.807706,0.07296,0.638787,0.329646,80.0,WY
-0.943406,-0.497104,-0.75407,-0.943406,0.484752,90.0,OR
0.238127,-0.116773,1.901755,0.238127,1.996652,70.0,CO
50.0,10.0,20.0,50.0,80.0,90.0,HO


In [209]:
df.set_index('W', inplace=True)

In [210]:
df

Unnamed: 0_level_0,X,Y,Z,my_list,States
W,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.302665,1.693723,-1.706086,-1.159119,20.0,CA
-0.134841,0.390528,0.166905,0.184502,50.0,NY
0.807706,0.07296,0.638787,0.329646,80.0,WY
-0.497104,-0.75407,-0.943406,0.484752,90.0,OR
-0.116773,1.901755,0.238127,1.996652,70.0,CO
10.0,20.0,50.0,80.0,90.0,HO


In [211]:
# Replaces the current 'W' index with 'States'. 'W' was moved into the index
# with the default drop=True, so it is no longer a column and is discarded here.
df.set_index(keys='States', inplace=True)

In [212]:
df

Unnamed: 0_level_0,X,Y,Z,my_list
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1.693723,-1.706086,-1.159119,20.0
NY,0.390528,0.166905,0.184502,50.0
WY,0.07296,0.638787,0.329646,80.0
OR,-0.75407,-0.943406,0.484752,90.0
CO,1.901755,0.238127,1.996652,70.0
HO,20.0,50.0,80.0,90.0


## 8) Reset Index

Resetting the index back to the default integer index (the old index becomes a regular column):

In [213]:
df.reset_index()

Unnamed: 0,States,X,Y,Z,my_list
0,CA,1.693723,-1.706086,-1.159119,20.0
1,NY,0.390528,0.166905,0.184502,50.0
2,WY,0.07296,0.638787,0.329646,80.0
3,OR,-0.75407,-0.943406,0.484752,90.0
4,CO,1.901755,0.238127,1.996652,70.0
5,HO,20.0,50.0,80.0,90.0


In [214]:
df

Unnamed: 0_level_0,X,Y,Z,my_list
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1.693723,-1.706086,-1.159119,20.0
NY,0.390528,0.166905,0.184502,50.0
WY,0.07296,0.638787,0.329646,80.0
OR,-0.75407,-0.943406,0.484752,90.0
CO,1.901755,0.238127,1.996652,70.0
HO,20.0,50.0,80.0,90.0


In [215]:
df.reset_index(inplace=True)

In [216]:
df

Unnamed: 0,States,X,Y,Z,my_list
0,CA,1.693723,-1.706086,-1.159119,20.0
1,NY,0.390528,0.166905,0.184502,50.0
2,WY,0.07296,0.638787,0.329646,80.0
3,OR,-0.75407,-0.943406,0.484752,90.0
4,CO,1.901755,0.238127,1.996652,70.0
5,HO,20.0,50.0,80.0,90.0


# Great Job!