# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [2]:
import pandas as pd
import numpy as np
from collections import namedtuple
import os

In [2]:
df_lis = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], index=[4, 5, 6], columns=['A', 'B', 'C'])
df_lis

Unnamed: 0,A,B,C
4,0,2,3
5,0,4,1
6,10,20,30


In [3]:
# iat[] works like at[], but it takes integer positions (row, column) instead of labels
df_lis.iat[1,2]

1

In [4]:
from numpy.random import randn
np.random.seed(101)

In [5]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"])
}
df_1 = pd.DataFrame(d)
df_1

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [6]:
pd.DataFrame(d, ['b', 'c'])

Unnamed: 0,one,two
b,2.0,2.0
c,3.0,3.0


In [7]:
d_2 = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
df2 = pd.DataFrame(d_2)
df2

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [8]:
def min_max(row):
    """Return the smallest and largest of a row's 'one' and 'two' values.

    Meant to be used with ``DataFrame.apply(..., axis='columns')``: ``row`` is
    a single row of the frame as a Series, and the result is a two-entry
    Series labelled 'min' and 'max'.
    """
    pair = row[['one', 'two']]
    lowest, highest = np.min(pair), np.max(pair)
    return pd.Series({'min': lowest, 'max': highest})

df2.apply(min_max, axis='columns')

Unnamed: 0,min,max
0,1.0,4.0
1,2.0,3.0
2,2.0,3.0
3,1.0,4.0


In [9]:
def min_max2(row):
    """Return a copy of ``row`` extended with 'max' and 'min' entries.

    Meant to be used with ``DataFrame.apply(..., axis='columns')``. The
    returned Series keeps every original entry and appends the largest and
    smallest of the 'one' and 'two' values, in that order.

    The incoming ``row`` is left untouched.
    """
    data = row[['one', 'two']]
    # Work on a copy: mutating the object that apply() passes in is not
    # supported by pandas and would add the new labels to the caller's Series.
    result = row.copy()
    result['max'] = np.max(data)
    result['min'] = np.min(data)
    return result

df2.apply(min_max2, axis='columns')

Unnamed: 0,one,two,max,min
0,1.0,4.0,4.0,1.0
1,2.0,3.0,3.0,2.0
2,3.0,2.0,3.0,2.0
3,4.0,1.0,4.0,1.0


In [10]:
rows = ['one', 'two']
df2.apply(lambda x: np.max(x[rows]), axis=1)

0    4.0
1    3.0
2    3.0
3    4.0
dtype: float64

In [11]:
# DataFrame.from_dict takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.
# If you pass orient='index', the dict keys become the row labels and `columns` names the columns.
# Note: 'B' appears twice in `columns`, so the resulting frame has a duplicated column label.
pd.DataFrame.from_dict(d_2, orient='index' ,columns=['B','E','B','O'])

Unnamed: 0,B,E,B.1,O
one,1.0,2.0,3.0,4.0
two,4.0,3.0,2.0,1.0


In [12]:
 pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [13]:
# A namedtuple is a tuple subclass whose fields can be read by name as well as by position.
# When a DataFrame is built from a list of namedtuples, the field names of the first one
# ('x' and 'y' here) become the column labels; the plain tuple (3, 2) is accepted alongside them.

Point = namedtuple('Point', 'x y')
pd.DataFrame([Point(0,0), Point(0,3), (3,2)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,3,2


In [14]:
pd.DataFrame(d_2, index = ['a','b','c','d'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [15]:
# Structured array with two records and three fields: a 32-bit int, a 32-bit float
# and a fixed-width 10-byte string. 'S10' replaces the legacy 'a10' spelling, an alias
# that NumPy 2.0 deprecates; the resulting dtype is identical.
data_1 = np.zeros((2,), dtype = [('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
data_1

array([(0, 0., b''), (0, 0., b'')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [16]:
data_1[:] = [(1,2.0,'hello'), (2,3.0,'world')]
pd.DataFrame(data_1)

Unnamed: 0,A,B,C
0,1,2.0,b'hello'
1,2,3.0,b'world'


In [17]:
#From a list of dicts
data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [18]:
#From a dict of tuples
pd.DataFrame(
    {
     ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
     ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
     ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
     ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
     ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)


Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [19]:
df = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())

In [20]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [21]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [22]:
# Pass a list of column names
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [23]:
# Attribute (dot) access (NOT RECOMMENDED!): it only works when the column name is a valid
# Python identifier and does not clash with an existing DataFrame attribute or method
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

DataFrame Columns are just Series

In [24]:
type(df['W'])

pandas.core.series.Series

In [25]:
import re
string = 'bat, lat, mat, bet, let, met, bit, lit, mit, bot, lot, mot'
result = re.findall('b[ao]t', string)
print(result)

['bat', 'bot']


**Creating a new column:**

In [26]:
df['new'] = df['W'] + df['Y']

In [27]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


**Removing Columns**

In [28]:
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [29]:
# Not inplace unless specified!
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [30]:
# Columns can be deleted with del or popped, just like the keys of a dict;
# pop() also returns the removed column
del df['X']
pop_df = df.pop('Y')
df

Unnamed: 0,W,Z,new
A,2.70685,0.503826,3.614819
B,0.651118,0.605965,-0.196959
C,-2.018168,-0.589001,-1.489355
D,0.188695,0.955057,-0.744542
E,0.190794,0.683509,2.796762


In [31]:
df.drop('new',axis=1,inplace=True)

In [32]:
df

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


Can also drop rows this way:

In [33]:
df.drop('E',axis=0)

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057


**Selecting Rows**

In [34]:
df.loc['A']

W    2.706850
Z    0.503826
Name: A, dtype: float64

Or select based off of position instead of label 

In [35]:
df.iloc[2]

W   -2.018168
Z   -0.589001
Name: C, dtype: float64

**Selecting a subset of rows and columns**

In [36]:
df.loc['B','W']

0.6511179479432686

In [37]:
df

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [38]:
df.loc[['A','B'],['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965


In [39]:
#insert 
df.insert(1, 'X', randn(5))

In [40]:
df

Unnamed: 0,W,X,Z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
C,-2.018168,-1.706086,-0.589001
D,0.188695,-1.159119,0.955057
E,0.190794,-0.134841,0.683509


In [41]:
#assign() allows you to easily create new columns that are potentially derived from existing columns.
df_2 = pd.DataFrame({'temp_c': [17.0, 25.0]},
                  index=['Portland', 'Berkeley'])
df_2

Unnamed: 0,temp_c
Portland,17.0
Berkeley,25.0


In [42]:
df_2.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)

Unnamed: 0,temp_c,temp_f
Portland,17.0,62.6
Berkeley,25.0,77.0


In [43]:
df.assign(sumemos=lambda x: x.W + x.X)
#assign always returns a copy of the data, leaving the original DataFrame untouched.


Unnamed: 0,W,X,Z,sumemos
A,2.70685,0.302665,0.503826,3.009515
B,0.651118,1.693723,0.605965,2.344841
C,-2.018168,-1.706086,-0.589001,-3.724254
D,0.188695,-1.159119,0.955057,-0.970424
E,0.190794,-0.134841,0.683509,0.055954


In [44]:
df

Unnamed: 0,W,X,Z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
C,-2.018168,-1.706086,-0.589001
D,0.188695,-1.159119,0.955057
E,0.190794,-0.134841,0.683509


In [45]:
df_2

Unnamed: 0,temp_c
Portland,17.0
Berkeley,25.0


In [46]:
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
# Consider a path relative to the notebook instead.
df_Excel = pd.read_excel('/Users/omar/Documents/Refactored_Py_DS_ML_Bootcamp-master/03-Python-for-Data-Analysis-Pandas/Excel_Sample.xlsx',  engine='openpyxl')
# engine='openpyxl' is passed explicitly because xlrd, in its latest release, removed support for anything other than .xls files
# pop() removes the 'Unnamed: 0' column from the frame and returns it
df_Excel.pop('Unnamed: 0')

0    0
1    1
2    2
3    3
Name: Unnamed: 0, dtype: int64

In [47]:
df_Excel

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [48]:
df_Excel.assign(dividamos_a_b = lambda x: x['a'] / x['b'])

Unnamed: 0,a,b,c,d,dividamos_a_b
0,0,1,2,3,0.0
1,4,5,6,7,0.8
2,8,9,10,11,0.888889
3,12,13,14,15,0.923077


In [49]:
df_Excel['a'].gt(8) #Equivalent to series > other

0    False
1    False
2    False
3     True
Name: a, dtype: bool

In [50]:
df_Excel['a'].lt(8) #Equivalent to series < other

0     True
1     True
2    False
3    False
Name: a, dtype: bool

In [51]:
df_Excel[df_Excel['a'].lt(8)]

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7


In [52]:
#Boolean operators work as well:
df_boolean = pd.DataFrame({'a': [1,0,1], 'b': [0,1,1]}, dtype=bool)
df_boolean2 = pd.DataFrame({'a': [0,1,1], 'b': [1,1,0]}, dtype=bool)

df_boolean & df_boolean2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [53]:
df_boolean | df_boolean2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [54]:
df_boolean ^ df_boolean2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [55]:
# Element-wise logical NOT of the boolean frame
~df_boolean

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [56]:
arr_bool = np.asarray(df_boolean)
arr_bool

array([[ True, False],
       [False,  True],
       [ True,  True]])

In [57]:
df_Excel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       4 non-null      int64
 1   b       4 non-null      int64
 2   c       4 non-null      int64
 3   d       4 non-null      int64
dtypes: int64(4)
memory usage: 256.0 bytes


In [58]:
# Use head() to see the first five rows and tail() to see the last five

### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [59]:
df

Unnamed: 0,W,X,Z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
C,-2.018168,-1.706086,-0.589001
D,0.188695,-1.159119,0.955057
E,0.190794,-0.134841,0.683509


In [60]:
# Replace every column label with its lower-case form
df.columns = list(map(str.lower, df.columns))
df

Unnamed: 0,w,x,z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
C,-2.018168,-1.706086,-0.589001
D,0.188695,-1.159119,0.955057
E,0.190794,-0.134841,0.683509


In [61]:
df>0

Unnamed: 0,w,x,z
A,True,True,True
B,True,True,True
C,False,False,False
D,True,False,True
E,True,False,True


In [62]:
df[df>0]

Unnamed: 0,w,x,z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
C,,,
D,0.188695,,0.955057
E,0.190794,,0.683509


In [63]:
df[df['w']>0]

Unnamed: 0,w,x,z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
D,0.188695,-1.159119,0.955057
E,0.190794,-0.134841,0.683509


In [64]:
# Rows where w is positive, column x only, selected in a single .loc call
df.loc[df['w'] > 0, 'x']

A    0.302665
B    1.693723
D   -1.159119
E   -0.134841
Name: x, dtype: float64

In [65]:
# Rows where w is positive, columns x and z, selected in a single .loc call
df.loc[df['w'] > 0, ['x', 'z']]

Unnamed: 0,x,z
A,0.302665,0.503826
B,1.693723,0.605965
D,-1.159119,0.955057
E,-0.134841,0.683509


For two conditions you can use | and & with parentheses around each condition (Python's `and` / `or` do not work element-wise on a Series):

In [66]:
df[(df['w']>0) & (df['x'] > 1)]

Unnamed: 0,w,x,z
B,0.651118,1.693723,0.605965


## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it to something else. We'll also talk about index hierarchy!

In [67]:
df

Unnamed: 0,w,x,z
A,2.70685,0.302665,0.503826
B,0.651118,1.693723,0.605965
C,-2.018168,-1.706086,-0.589001
D,0.188695,-1.159119,0.955057
E,0.190794,-0.134841,0.683509


In [68]:
# Reset to default 0,1...n index
df.reset_index()

Unnamed: 0,index,w,x,z
0,A,2.70685,0.302665,0.503826
1,B,0.651118,1.693723,0.605965
2,C,-2.018168,-1.706086,-0.589001
3,D,0.188695,-1.159119,0.955057
4,E,0.190794,-0.134841,0.683509


In [69]:
newind = 'CA NY WY OR CO'.split()

In [70]:
df['States'] = newind

In [71]:
df

Unnamed: 0,w,x,z,States
A,2.70685,0.302665,0.503826,CA
B,0.651118,1.693723,0.605965,NY
C,-2.018168,-1.706086,-0.589001,WY
D,0.188695,-1.159119,0.955057,OR
E,0.190794,-0.134841,0.683509,CO


In [72]:
df.set_index('States')

Unnamed: 0_level_0,w,x,z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CA,2.70685,0.302665,0.503826
NY,0.651118,1.693723,0.605965
WY,-2.018168,-1.706086,-0.589001
OR,0.188695,-1.159119,0.955057
CO,0.190794,-0.134841,0.683509


In [73]:
df

Unnamed: 0,w,x,z,States
A,2.70685,0.302665,0.503826,CA
B,0.651118,1.693723,0.605965,NY
C,-2.018168,-1.706086,-0.589001,WY
D,0.188695,-1.159119,0.955057,OR
E,0.190794,-0.134841,0.683509,CO


In [74]:
df.set_index('States',inplace=True)

In [75]:
df

Unnamed: 0_level_0,w,x,z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CA,2.70685,0.302665,0.503826
NY,0.651118,1.693723,0.605965
WY,-2.018168,-1.706086,-0.589001
OR,0.188695,-1.159119,0.955057
CO,0.190794,-0.134841,0.683509


In [76]:
df['w'][:2]

States
CA    2.706850
NY    0.651118
Name: w, dtype: float64

In [77]:
# DataFrame.insert(loc, column, value) adds a new column at position loc, modifying the frame in place.
# Here the scalar string is repeated on every row of the new column.
df.insert(3,"From insert", "Rows from insert")
df

Unnamed: 0_level_0,w,x,z,From insert
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.302665,0.503826,Rows from insert
NY,0.651118,1.693723,0.605965,Rows from insert
WY,-2.018168,-1.706086,-0.589001,Rows from insert
OR,0.188695,-1.159119,0.955057,Rows from insert
CO,0.190794,-0.134841,0.683509,Rows from insert


In [78]:
df_condition = df['w']>0
df_condition

States
CA     True
NY     True
WY    False
OR     True
CO     True
Name: w, dtype: bool

In [79]:
# You can lay the boolean mask on top of the data to "hide" the data you don't want, which is
# marked by the False values in the mask. We do this with the .where() method on the original DataFrame.

df.where(df_condition)

Unnamed: 0_level_0,w,x,z,From insert
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.302665,0.503826,Rows from insert
NY,0.651118,1.693723,0.605965,Rows from insert
WY,,,,
OR,0.188695,-1.159119,0.955057,Rows from insert
CO,0.190794,-0.134841,0.683509,Rows from insert


In [80]:
df.where(df_condition).dropna()

Unnamed: 0_level_0,w,x,z,From insert
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.302665,0.503826,Rows from insert
NY,0.651118,1.693723,0.605965,Rows from insert
OR,0.188695,-1.159119,0.955057,Rows from insert
CO,0.190794,-0.134841,0.683509,Rows from insert


In [81]:
df[df['w']>0]

Unnamed: 0_level_0,w,x,z,From insert
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.302665,0.503826,Rows from insert
NY,0.651118,1.693723,0.605965,Rows from insert
OR,0.188695,-1.159119,0.955057,Rows from insert
CO,0.190794,-0.134841,0.683509,Rows from insert


## Merge function

In [82]:
staff_df = pd.DataFrame([{'Name': 'Bebo', 'Role': 'Student'}, {'Name': 'Beba', 'Role': 'Student'}, {'Name': 'Bo', 'Role': 'Buelín'}])
staff_df = staff_df.set_index('Name')

student_df = pd.DataFrame([{'Name': 'Bebo', 'University': 'UNAM'}, {'Name': 'Cipi', 'University': 'UNAM'}, {'Name': 'Miau', 'University': 'Michi'}])
student_df = student_df.set_index('Name')

print(staff_df)
print(student_df)

         Role
Name         
Bebo  Student
Beba  Student
Bo     Buelín
     University
Name           
Bebo       UNAM
Cipi       UNAM
Miau      Michi


In [83]:
# Importantly, both DataFrames are indexed by the value we want to merge them on, which is called Name
pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,University
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Beba,Student,
Bebo,Student,UNAM
Bo,Buelín,
Cipi,,UNAM
Miau,,Michi


In [84]:
pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True) #default how='inner' 

Unnamed: 0_level_0,Role,University
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bebo,Student,UNAM


In [85]:
# You can probably guess what comes next: we want a list of all of the students, with their roles
# if they are also staff. To do this we use a right join.
pd.merge(staff_df, student_df, how='right', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,University
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bebo,Student,UNAM
Cipi,,UNAM
Miau,,Michi


In [86]:
pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,University
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bebo,Student,UNAM
Beba,Student,
Bo,Buelín,


In [87]:
# Create ../data if needed, then write a tiny CSV to it
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')
csv_rows = [
    'NumRooms,Alley,Price\n',  # Column names
    'NA,Pave,127500\n',  # Each row represents a data example
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.writelines(csv_rows)

In [88]:
data = pd.read_csv(data_file)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [89]:
# Split the frame by position: the first two columns are the inputs, the third is the output
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Fill missing numeric values with the column mean. numeric_only=True leaves the
# text column 'Alley' out of the mean; without it pandas 2.x raises TypeError
# when it tries to average strings.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [90]:
# Since the “Alley” column only takes two types of categorical values, “Pave” and “NaN”,
# pandas can automatically convert this column to two columns, “Alley_Pave” and “Alley_nan”
# (dummy_na=True is what adds the indicator column for the missing values).
inputs = pd.get_dummies(inputs, dummy_na=True)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


## Multi-Index and Index Hierarchy

Let us go over how to work with Multi-Index, first we'll create a quick example of what a Multi-Indexed DataFrame would look like:

In [91]:
# Index Levels: pair each outer group label with its inner number, position by position
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [92]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [93]:
df_multi = pd.DataFrame(np.random.randn(6,2),index=hier_index,columns=['A','B'])
df_multi

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.390528,0.166905
G1,2,0.184502,0.807706
G1,3,0.07296,0.638787
G2,1,0.329646,-0.497104
G2,2,-0.75407,-0.943406
G2,3,0.484752,-0.116773


Now let's show how to index this! For index hierarchy we use df.loc[], if this was on the columns axis, you would just use normal bracket notation df[]. Calling one level of the index returns the sub-dataframe:

In [94]:
df_multi.loc['G1']

Unnamed: 0,A,B
1,0.390528,0.166905
2,0.184502,0.807706
3,0.07296,0.638787


In [95]:
df_multi.loc['G1'].loc[1]

A    0.390528
B    0.166905
Name: 1, dtype: float64

In [96]:
df_multi.index.names

FrozenList([None, None])

In [97]:
df_multi.index.names = ['Group','Num']

In [98]:
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.390528,0.166905
G1,2,0.184502,0.807706
G1,3,0.07296,0.638787
G2,1,0.329646,-0.497104
G2,2,-0.75407,-0.943406
G2,3,0.484752,-0.116773


In [99]:
df_multi.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.390528,0.166905
2,0.184502,0.807706
3,0.07296,0.638787


In [100]:
df_2 = df_multi.xs('G1')
np.exp(df_2)

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.477761,1.181642
2,1.202619,2.242757
3,1.075687,1.894182


In [101]:
np.asarray(df_2)

array([[0.39052784, 0.16690464],
       [0.18450186, 0.80770591],
       [0.07295968, 0.63878701]])

In [102]:
# Cross-section for Group 'G1', Num 1. The key must be a tuple: list keys for xs()
# were deprecated and are rejected by pandas 2.x.
df_multi.xs(('G1',1))

A    0.390528
B    0.166905
Name: (G1, 1), dtype: float64

In [103]:
df_multi.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.390528,0.166905
G2,0.329646,-0.497104


In [104]:
df_reptile = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
                            index=['cobra', 'viper', 'sidewinder'],
                            columns=['max_speed', 'shield'])
df_reptile

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [105]:
df_reptile.loc[['viper', 'sidewinder']]

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [106]:
df_reptile.loc['cobra', 'shield']

2

In [107]:
df_reptile.loc[df_reptile['shield'] > 6]

Unnamed: 0,max_speed,shield
sidewinder,7,8


In [108]:
df_reptile.loc['cobra'] = 10
df_reptile

Unnamed: 0,max_speed,shield
cobra,10,10
viper,4,5
sidewinder,7,8


In [120]:
df_reptile.loc[['viper', 'sidewinder'], ['shield']] = 20
df_reptile

Unnamed: 0,max_speed,shield
cobra,10,10
viper,4,20
sidewinder,7,20


In [109]:
tuples = [
    ('cobra', 'mark i'), ('cobra', 'mark ii'),
    ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),
    ('viper', 'mark ii'), ('viper', 'mark iii')
]
index = pd.MultiIndex.from_tuples(tuples)

values_1 = [[12, 2], [0, 4], [10, 20],
            [1, 4], [7, 1], [16, 36]]

df_multi = pd.DataFrame(values_1, columns=['max_speed', 'shield'], index=index)
df_multi

Unnamed: 0,Unnamed: 1,max_speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1
viper,mark iii,16,36


In [110]:
df_multi.idxmin(axis=0)

max_speed    (cobra, mark ii)
shield       (viper, mark ii)
dtype: object

In [111]:
df_multi.loc['cobra']

Unnamed: 0,max_speed,shield
mark i,12,2
mark ii,0,4


In [112]:
df_multi.loc[('cobra', 'mark i')]

max_speed    12
shield        2
Name: (cobra, mark i), dtype: int64

In [113]:
df_reptile.align(df_reptile)

(            max_speed  shield
 cobra              10      10
 viper               4       5
 sidewinder          7       8,
             max_speed  shield
 cobra              10      10
 viper               4       5
 sidewinder          7       8)

In [114]:
#.convert_dtypes(), Convert columns to best possible dtypes using dtypes supporting pd.NA.
df_convert = pd.DataFrame(
    {
    "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
    "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
    "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
    "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
    "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
    "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
    } 
)
df_convert

Unnamed: 0,a,b,c,d,e,f
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


In [115]:
df_convert.dtypes

a      int32
b     object
c     object
d     object
e    float64
f    float64
dtype: object

In [116]:
dfn = df_convert['b'].convert_dtypes()
dfn

0    x
1    y
2    z
Name: b, dtype: string

In [3]:
pd.__version__

'1.2.4'

In [118]:
#Value counts (histogramming) / mode
data_count = np.random.randint(0, 7, size=50)
data_count

array([3, 6, 6, 5, 6, 1, 5, 5, 2, 2, 2, 4, 3, 3, 2, 1, 6, 3, 4, 3, 3, 4,
       6, 2, 5, 1, 4, 5, 0, 3, 6, 0, 3, 5, 6, 5, 4, 4, 3, 2, 3, 2, 1, 5,
       6, 3, 6, 4, 5, 6])

In [119]:
s_count = pd.Series(data_count)
s_count.value_counts()

3    11
6    10
5     9
2     7
4     7
1     4
0     2
dtype: int64

### idxmin and idxmax are the pandas counterparts of NumPy's argmin and argmax (they return index labels rather than integer positions)

# Great Job!