In [1]:
import pandas as pd
import numpy as np

# Align, Reindexing and Renaming Labels

In [3]:
# Five consecutive integers (0..4) labelled 'a' through 'e'.
series_1 = pd.Series(data=np.arange(5), index=list('abcde'))
series_1

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [4]:
# First four elements, selected by position.
s1_1 = series_1.iloc[:4]
s1_1

a    0
b    1
c    2
d    3
dtype: int32

In [5]:
# Everything except the first element, selected by position.
s1_2 = series_1.iloc[1:]
s1_2

b    1
c    2
d    3
e    4
dtype: int32

In [6]:
# Obtain the last value by position. The index holds string labels, so
# `s1_2[-1]` only works through a deprecated positional fallback; `.iloc`
# states the positional intent explicitly.
s1_2.iloc[-1]

4

pandas `align` takes two DataFrames or Series and returns both of them reindexed so that they share the same row and/or column configuration.
It is used to synchronize one DataFrame with another: after alignment the two objects have the same row and/or column labels, and positions that had no data in the original object are filled with NaN.

In [7]:
# can align both series by the union of their indexes
s1_1.align(s1_2)

(a    0.0
 b    1.0
 c    2.0
 d    3.0
 e    NaN
 dtype: float64, a    NaN
 b    1.0
 c    2.0
 d    3.0
 e    4.0
 dtype: float64)

In [8]:
# left join: keep the index of s1_1; s1_2 is reindexed to it (its values are kept only where the labels match, NaN elsewhere)
s1_1.align(s1_2, join='left')

(a    0
 b    1
 c    2
 d    3
 dtype: int32, a    NaN
 b    1.0
 c    2.0
 d    3.0
 dtype: float64)

In [9]:
s1_1.align(s1_2, join='right')

(b    1.0
 c    2.0
 d    3.0
 e    NaN
 dtype: float64, b    1
 c    2
 d    3
 e    4
 dtype: int32)

In [10]:
# can see the intersection
s1_1.align(s1_2, join='inner')

(b    1
 c    2
 d    3
 dtype: int32, b    1
 c    2
 d    3
 dtype: int32)

In [22]:
# Two 2x3 frames of random integers drawn from [10, 50). They share the
# columns C, D and E but overlap on a single row label ('B').
arr_1 = np.random.randint(low=10, high=50, size=(2, 3))
df_1 = pd.DataFrame(data=arr_1, index=['A', 'B'], columns=['C', 'D', 'E'])

arr_2 = np.random.randint(low=10, high=50, size=(2, 3))
df_2 = pd.DataFrame(data=arr_2, index=['B', 'C'], columns=['C', 'D', 'E'])

In [23]:
df_1

Unnamed: 0,C,D,E
A,14,11,27
B,36,22,15


In [24]:
df_2

Unnamed: 0,C,D,E
B,39,33,36
C,22,32,44


In [28]:
# align() returns a tuple (aligned df_1, aligned df_2); 'outer' is the
# default join and uses the union of both frames' labels.
dff = df_1.align(df_2, join='outer')

In [29]:
dff[0] # the row C from df_2 was added to the df_1

Unnamed: 0,C,D,E
A,14.0,11.0,27.0
B,36.0,22.0,15.0
C,,,


In [30]:
dff[1] # the row A from df_1 was added to the df_2

Unnamed: 0,C,D,E
A,,,
B,39.0,33.0,36.0
C,22.0,32.0,44.0


In [33]:
dfff = df_1.align(df_2, join='inner') # only the match rows and cols is obtained
dfff[0]

Unnamed: 0,C,D,E
B,36,22,15


In [34]:
dfff[1]

Unnamed: 0,C,D,E
B,39,33,36


In [35]:
df_1.align(df_2, join='left') # keep df_1's row labels (A, B); df_2 is reindexed to them, NaN where it has no matching row

(    C   D   E
 A  14  11  27
 B  36  22  15,       C     D     E
 A   NaN   NaN   NaN
 B  39.0  33.0  36.0)

In [36]:
df_1.align(df_2, join='right') # keep df_2's row labels (B, C); df_1 is reindexed to them, NaN where it has no matching row

(      C     D     E
 B  36.0  22.0  15.0
 C   NaN   NaN   NaN,     C   D   E
 B  39  33  36
 C  22  32  44)

# Reindexing - align data by the index

Reindexing in pandas can be used to change the index of the rows and columns of a Series or DataFrame. The object is conformed to the new set of labels: existing labels can be reordered or left out, and labels that are not present in the original object are added with NaN values.

In [37]:
series_1

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [38]:
series_1.reindex(['c', 'b', 'a'])

c    2
b    1
a    0
dtype: int32

In [40]:
df_1.reindex(['B', 'A'])

Unnamed: 0,C,D,E
B,36,22,15
A,14,11,27


In [41]:
df_1.drop(index=['A'])  # same as drop(['A'], axis=0): removes the listed row labels

Unnamed: 0,C,D,E
B,36,22,15


In [42]:
df_1.drop(columns=['D'])  # same as drop(['D'], axis=1): removes the listed column labels

Unnamed: 0,C,E
A,14,27
B,36,15


# Rename labels

In [46]:
# `columns` receives a dictionary mapping each current column label to its new name.
# rename() returns a new frame, so the result is bound back to df_1 to keep it;
# this avoids inplace=True, which silently mutates a frame earlier cells displayed.
df_1 = df_1.rename(columns={'C': 'Men', 'D': 'Women', 'E': 'Pets'})

In [47]:
df_1

Unnamed: 0,Men,Women,Pets
A,14,11,27
B,36,22,15


# MultiIndexing

A MultiIndex lets you use more than one level of labels for the rows and/or columns of your index. It is a multi-level, or hierarchical, index object for pandas objects.

pandas.MultiIndex(levels=None, codes=None, sortorder=None, names=None, dtype=None, copy=False, name=None, verify_integrity=True)

- levels: It is a sequence of arrays which shows the unique labels for each level.
- codes: It is also a sequence of arrays where integers at each level helps us to designate the labels in that location.
- sortorder: optional int. Level of sortedness (the index must be lexicographically sorted by that level).
- dtype: data type (size of the data, which can be 32 bits or 64 bits).
- copy: It is a boolean type parameter with default value as False. It helps us to copy the metadata.
- verify_integrity: It is a boolean parameter with a default value of True. It checks the integrity of the levels and codes, i.e. whether they are valid.

The zip() function returns a zip object, which is an iterator of tuples where the first item in each passed iterator is paired together, and then the second item in each passed iterator are paired together etc.

In [48]:
# Three meals on each of two days; pairing the lists element by element
# gives one (day, meal) tuple per row of the future index.
days = ['Day 1'] * 3 + ['Day 2'] * 3
meals = [1, 2, 3] * 2

hier_index = [(day, meal) for day, meal in zip(days, meals)]
hier_index

[('Day 1', 1),
 ('Day 1', 2),
 ('Day 1', 3),
 ('Day 2', 1),
 ('Day 2', 2),
 ('Day 2', 3)]

In [49]:
# build a two-level MultiIndex from the list of (day, meal) tuples
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex(levels=[['Day 1', 'Day 2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [51]:
# Random integers in [500, 700): one row per (day, meal) pair, columns M and F.
arr = np.random.randint(low=500, high=700, size=(6, 2))
df_9 = pd.DataFrame(data=arr, index=hier_index, columns=['M', 'F'])

In [52]:
df_9

Unnamed: 0,Unnamed: 1,M,F
Day 1,1,536,571
Day 1,2,614,597
Day 1,3,622,630
Day 2,1,672,588
Day 2,2,551,503
Day 2,3,558,520


In [54]:
df_9.loc['Day 1']

Unnamed: 0,M,F
1,536,571
2,614,597
3,622,630


In [55]:
df_9.loc['Day 1'].loc[1]

M    536
F    571
Name: 1, dtype: int32

In [56]:
# Single .loc lookup: the row key is the (day, meal) tuple, the column key is 'F'.
# This returns the same scalar as df_9.loc['Day 2'].loc[2].loc['F'] without
# building two intermediate objects through chained indexing.
df_9.loc[('Day 2', 2), 'F']

503

In [58]:
df_9.index.names = ['Day', 'Meal']
df_9

Unnamed: 0_level_0,Unnamed: 1_level_0,M,F
Day,Meal,Unnamed: 2_level_1,Unnamed: 3_level_1
Day 1,1,536,571
Day 1,2,614,597
Day 1,3,622,630
Day 2,1,672,588
Day 2,2,551,503
Day 2,3,558,520


The xs() function is used to get cross-section from the Series/DataFrame. This method takes a key argument to select data at a particular level of a MultiIndex.

In [62]:
df_9.xs('Day 1')

Unnamed: 0_level_0,M,F
Meal,Unnamed: 1_level_1,Unnamed: 2_level_1
1,536,571
2,614,597
3,622,630


In [63]:
# getting the calories for the first meal, from the 2 days
# The xs() method returns a specified section of the DataFrame.
# dataframe.xs(key, axis, level, drop_level)
df_9.xs(1, level='Meal')

Unnamed: 0_level_0,M,F
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Day 1,536,571
Day 2,672,588


In [64]:
# Long-format sample data: A = day, B = meal number, C = group (M/F), D = value.
dic = {'A': ['Day 1', 'Day 1', 'Day 1', 'Day 2', 'Day 2', 'Day 2'],
       'B': [1, 2, 3, 1, 2, 3],
       'C': ['M', 'F', 'M', 'F', 'M', 'F'],
       'D': [1, 2, 3, 4, 5, 6]}
df = pd.DataFrame(dic)
# End the cell with the frame itself so the table is actually displayed.
df


Unnamed: 0,A,B,C,D
0,Day 1,1,M,1
1,Day 1,2,F,2
2,Day 1,3,M,3
3,Day 2,1,F,4
4,Day 2,2,M,5
5,Day 2,3,F,6


A pivot table is a similar operation that is commonly seen in spreadsheets and other programs that operate on tabular data. The pivot table takes simple column-wise data as input, and groups the entries into a two-dimensional table that provides a multidimensional summarization of the data.

LINK: https://jakevdp.github.io/PythonDataScienceHandbook/03.09-pivot-tables.html

In [65]:
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,F,M
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
Day 1,1,,1.0
Day 1,2,2.0,
Day 1,3,,3.0
Day 2,1,4.0,
Day 2,2,,5.0
Day 2,3,6.0,


# cleaning data

In [66]:
# Small frame with missing values for the cleaning examples below.
# The data is named `nan_data` rather than `dict` so that the built-in `dict`
# type is not shadowed for the rest of the notebook session.
nan_data = {'A': [1, 2, np.nan], 'B': [4, np.nan, np.nan], 'C': [7., 8., 9.]}
df = pd.DataFrame(nan_data)
print(df)

     A    B    C
0  1.0  4.0  7.0
1  2.0  NaN  8.0
2  NaN  NaN  9.0


In [67]:
# drop every row that contains at least one NaN
df.dropna()

Unnamed: 0,A,B,C
0,1.0,4.0,7.0


In [68]:
# drop columns with any missing data
df.dropna(axis=1)

Unnamed: 0,C
0,7.0
1,8.0
2,9.0


In [69]:
# keep only the rows that have at least two non-NaN values (thresh=2); rows with fewer are dropped
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0


In [71]:
# replace every NaN value with 0
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,0.0,8.0
2,0.0,0.0,9.0


In [73]:
# fill every NaN in the frame (all columns) with the mean of column A
df.fillna(value=df['A'].mean())

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,1.5,8.0
2,1.5,1.5,9.0


In [74]:
# Forward fill: replace each NaN with the last valid value above it in the
# same column. DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill()

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,4.0,8.0
2,2.0,4.0,9.0


In [75]:
# Backward fill: replace each NaN with the next valid value below it in the
# same column; NaNs with no later valid value stay NaN.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df.bfill()

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0
2,,,9.0
