### Prepared by Abhishek Kumar
### https://www.linkedin.com/in/abhishek-kumar-442337b2/


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Show the value of every bare expression in a cell (not only the last one),
# so that a single cell can display multiple outputs.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Setup: build the sample employee DataFrame.
# Each inner list is one row, in the order given by `columns_name`.
# Missing values are written as np.nan (the `np.NaN` alias was removed in
# NumPy 2.0, so `np.nan` is the spelling that works on every NumPy version).

salary = [['1', 'Abhishek Kumar', 'AIML', 'Machine Learning Engineer', 'M', 'Y', '04051990', 1121000],
          ['2', 'Arjun Kumar', 'DM', 'Tech Lead', 'M', 'Y', '09031992', 109000],
          ['3', 'Vivek Raj', 'DM', 'Devops Engineer', 'M', 'N', np.nan, 827000],
          ['4', 'Mika Singh', 'DM', 'Data Analyst', 'F', 'Y', '15101991', np.nan],
          ['5', 'Anusha Yenduri', 'AIML', 'Data Scientist', 'M', 'Y', '01011989', 921000],
          ['6', 'Ritesh Srivastava', 'AIML', 'Data Engineer', 'M', 'Y', np.nan, 785000]]

columns_name = ['Emp_Id', 'Emp_Name', 'Department', 'Role', 'Gender', 'WFH Status', 'DOB', 'Salary']

emp_df = pd.DataFrame(salary, columns=columns_name)
emp_df

Unnamed: 0,Emp_Id,Emp_Name,Department,Role,Gender,WFH Status,DOB,Salary
0,1,Abhishek Kumar,AIML,Machine Learning Engineer,M,Y,4051990.0,1121000.0
1,2,Arjun Kumar,DM,Tech Lead,M,Y,9031992.0,109000.0
2,3,Vivek Raj,DM,Devops Engineer,M,N,,827000.0
3,4,Mika Singh,DM,Data Analyst,F,Y,15101991.0,
4,5,Anusha Yenduri,AIML,Data Scientist,M,Y,1011989.0,921000.0
5,6,Ritesh Srivastava,AIML,Data Engineer,M,Y,,785000.0


# 1. Concatenating dataframes vertically / Appending dataframes

I'll take up two ways to do this.

    i. df.append()
    ii. pd.concat()

### i. df.append()

In [4]:
# Three small frames to stack vertically. df2 has an extra 'City' column and
# reuses Id 2, so the results show how differing columns and repeated index
# labels are handled.
detail_1 = {'Id'   : [1, 2, 3, 4],
            'Name' : ['A', 'B', 'C', 'D'],
            'Age'  : [21, 22, 20, 24]}
detail_2 = {'Id'   : [2, 8, 5],
            'Name' : ['Again', 'H', 'E'],
            'Age'  : [25, 18, 28],
            'City' : ['Pune', 'Panaji', 'Patna']}
detail_3 = {'Id'   : [7, 6],
            'Name' : ['G', 'F'],
            'Age'  : [34, 30]}
df1 = pd.DataFrame(detail_1)
df2 = pd.DataFrame(detail_2)
df3 = pd.DataFrame(detail_3)
df1
df2
df3

# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# `df1.append([df2, df3], ...)` is equivalent to `pd.concat([df1, df2, df3], ...)`,
# which is used here so the cell runs on current pandas versions.

# Multiple dataframe objects are passed as a list.
# sort=True sorts the column names in alphabetical order; the original row
# index of each frame is retained (0..3, 0..2, 0..1).
df_appended_1 = pd.concat([df1, df2, df3], sort=True)
df_appended_1
# sort=False keeps the columns in order of first appearance;
# ignore_index=True creates a new 0..n-1 index for the combined dataframe.
df_appended_2 = pd.concat([df1, df2, df3], sort=False, ignore_index=True)
df_appended_2

Unnamed: 0,Id,Name,Age
0,1,A,21
1,2,B,22
2,3,C,20
3,4,D,24


Unnamed: 0,Id,Name,Age,City
0,2,Again,25,Pune
1,8,H,18,Panaji
2,5,E,28,Patna


Unnamed: 0,Id,Name,Age
0,7,G,34
1,6,F,30


Unnamed: 0,Age,City,Id,Name
0,21,,1,A
1,22,,2,B
2,20,,3,C
3,24,,4,D
0,25,Pune,2,Again
1,18,Panaji,8,H
2,28,Patna,5,E
0,34,,7,G
1,30,,6,F


Unnamed: 0,Id,Name,Age,City
0,1,A,21,
1,2,B,22,
2,3,C,20,
3,4,D,24,
4,2,Again,25,Pune
5,8,H,18,Panaji
6,5,E,28,Patna
7,7,G,34,
8,6,F,30,


In [5]:
# Identity check: the appended result is a different object from df1
# (this evaluates to False), i.e. appending returns a new DataFrame.
df_appended_1 is df1

False

### Note : 
    1. The join= and keys= parameters are not available in df.append.
    2. So, by default, ALL the columns are selected, and the index can be either retained or newly created.
    3. It makes a full copy of the data, so calling it repeatedly (e.g. inside a loop) can cause a significant performance hit.
    4. df.append was deprecated in pandas 1.4 and removed in pandas 2.0; on newer versions use pd.concat() (next section) instead.

### ii. pd.concat(objs, axis=0, join='outer', sort=False, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

pd.concat() can concatenate dataframes in both directions: vertically (axis=0, stacking rows) as well as horizontally (axis=1, placing columns side by side).

In [6]:
# Using pd.concat() to get the same result as df.append().
# axis=0 (the default) stacks the frames vertically, i.e. appends rows.
# join='outer' keeps the union of all columns; values missing from a frame are
# filled with NaN. sort=False leaves the column order as first encountered.
# The original row index of each frame is retained (0..3, 0..2, 0..1).

df_appended_3 = pd.concat([df1,df2,df3], axis = 0, join = 'outer', sort= False)
df_appended_3

Unnamed: 0,Id,Name,Age,City
0,1,A,21,
1,2,B,22,
2,3,C,20,
3,4,D,24,
0,2,Again,25,Pune
1,8,H,18,Panaji
2,5,E,28,Patna
0,7,G,34,
1,6,F,30,


In [7]:
# Same vertical concatenation, now passing keys= together with ignore_index=True.
df_appended_4 = pd.concat([df1,df2,df3], axis = 0, join = 'outer', sort = False , keys = ['a','b','c'], ignore_index = True, copy = True)
df_appended_4

# Note : keys=['a','b','c'] is passed, but the keys do not appear in the result
# because ignore_index=True discards the index and replaces it with 0..n-1.
# To get the keys as an outer index level, the indexes must be retained
# (ignore_index=False).

Unnamed: 0,Id,Name,Age,City
0,1,A,21,
1,2,B,22,
2,3,C,20,
3,4,D,24,
4,2,Again,25,Pune
5,8,H,18,Panaji
6,5,E,28,Patna
7,7,G,34,
8,6,F,30,


In [8]:
# With keys= and ignore_index=False, each input frame is labelled with its key
# ('a', 'b', 'c') as an outer index level above the original row index.
# join='inner' keeps only the columns common to all dataframes (so 'City' is dropped).
# sort=True orders the column names alphabetically (Age, Id, Name).

df_appended_5 = pd.concat([df1,df2,df3], axis = 0, join = 'inner', sort = True , keys = ['a','b','c'], ignore_index = False, copy = True)
df_appended_5

Unnamed: 0,Unnamed: 1,Age,Id,Name
a,0,21,1,A
a,1,22,2,B
a,2,20,3,C
a,3,24,4,D
b,0,25,2,Again
b,1,18,8,H
b,2,28,5,E
c,0,34,7,G
c,1,30,6,F


In [9]:
# The keys created above can be used as filters: selecting the outer index
# label 'b' returns just the rows that came from the second frame (df2).

df_appended_5.loc['b']

Unnamed: 0,Age,Id,Name
0,25,2,Again
1,18,8,H
2,28,5,E


# 1.1 Appending Rows ( as Series )

In [10]:
# Attention : this approach does NOT work as intended.
# The Series below has the default positional index (0, 1, 2) instead of
# df1's column names (Id, Name, Age), so its values cannot be lined up with
# df1's columns when it is appended as a row.

s = pd.Series(['11', 'Eleven', 21])
s
df1
# The attempts below are left commented out on purpose:
# appended_row = df1.append(s)
# appended_row = df1.append(s, ignore_index = True) # ignore_index = True fixes the error, but the output Dataframe is NOT DESIRED.
# appended_row

0        11
1    Eleven
2        21
dtype: object

Unnamed: 0,Id,Name,Age
0,1,A,21
1,2,B,22
2,3,C,20
3,4,D,24


In [11]:
# Give the Series an index that matches df1's column names so that each value
# lands in the right column when the Series is added as a new row.
s = pd.Series(['11', 'Eleven', 21], index = ['Id','Name', 'Age'])
s
df1
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Equivalent with pd.concat: turn the Series into a one-row frame
# (to_frame().T), let pandas re-infer per-column dtypes (infer_objects, so
# 'Age' stays numeric), then stack it under df1 with a fresh 0..n-1 index.
new_row = s.to_frame().T.infer_objects()
appended_row = pd.concat([df1, new_row], ignore_index = True)
appended_row

Id          11
Name    Eleven
Age         21
dtype: object

Unnamed: 0,Id,Name,Age
0,1,A,21
1,2,B,22
2,3,C,20
3,4,D,24


Unnamed: 0,Id,Name,Age
0,1,A,21
1,2,B,22
2,3,C,20
3,4,D,24
4,11,Eleven,21


# 2. Concatenating Dataframes horizontally

Using pd.concat(axis=1)

### pd.concat(objs, axis=1, join='outer', sort=False, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

In [12]:
# Setup: two frames to place side by side.
# dfh_1 has 4 rows and dfh_2 has 3 rows, so their row indexes only partly
# overlap; dfh_2 also contains one missing 'City' value.
# np.nan is used for the missing value (the `np.NaN` alias was removed in
# NumPy 2.0).

detail_1 = {'Id'   : [1, 2, 3, 4],
            'Name' : ['A', 'B', 'C', 'D'],
            'Age'  : [21, 22, 20, 24]}
detail_2 = {'Id'   : [1, 2, 5],
            'Sal'  : [100, 200, 500],
            'City' : ['Pune', 'Panaji', np.nan]}

dfh_1 = pd.DataFrame(detail_1)
dfh_2 = pd.DataFrame(detail_2)
dfh_1
dfh_2

Unnamed: 0,Id,Name,Age
0,1,A,21
1,2,B,22
2,3,C,20
3,4,D,24


Unnamed: 0,Id,Sal,City
0,1,100,Pune
1,2,200,Panaji
2,5,500,


In [13]:
# axis=1 places the frames side by side, aligning rows on the index.
# join='outer' keeps every row label from either frame: row 3 exists only in
# dfh_1, so dfh_2's columns are NaN for that row.
# Both frames have an 'Id' column, so the result contains two 'Id' columns.
dfh_concat = pd.concat([dfh_1,dfh_2], axis=1, join = 'outer', ignore_index=False, sort = True )
dfh_concat

Unnamed: 0,Id,Name,Age,Id.1,Sal,City
0,1,A,21,1.0,100.0,Pune
1,2,B,22,2.0,200.0,Panaji
2,3,C,20,5.0,500.0,
3,4,D,24,,,


In [14]:
# With keys=['a','b'] and ignore_index=False, the columns of each input frame
# are grouped under a top-level column label ('a' for dfh_1, 'b' for dfh_2).
# The join= parameter sets how the other axis (here: the row index) is combined.
# With join='inner', only the row labels present in both frames are kept
# (rows 0-2; row 3 of dfh_1 is dropped).

dfh_concat = pd.concat([dfh_1,dfh_2], axis=1, join = 'inner', ignore_index=False ,keys = ['a','b'] , sort = False )
dfh_concat

Unnamed: 0_level_0,a,a,a,b,b,b
Unnamed: 0_level_1,Id,Name,Age,Id,Sal,City
0,1,A,21,1,100,Pune
1,2,B,22,2,200,Panaji
2,3,C,20,5,500,


In [15]:
# ignore_index=True: along axis=1 the original column labels are discarded and
# replaced with positional labels 0..n-1.
# sort=True requests sorting of the non-concatenation axis (the row index for
# axis=1); the row labels here are already in ascending order, so the result
# shows no visible reordering.

dfh_concat = pd.concat([dfh_1,dfh_2], axis=1, join = 'inner', ignore_index= True,sort = True )
dfh_concat

Unnamed: 0,0,1,2,3,4,5
0,1,A,21,1,100,Pune
1,2,B,22,2,200,Panaji
2,3,C,20,5,500,


# 2.1 Appending Columns ( as Series)

In [16]:
# A Series can be concatenated as an extra column with axis=1; rows are
# aligned on the index (0..3 here).
s2 = pd.Series(['_0', '_1', '_2', '_3'])

# s2 is unnamed, so each copy receives a positional column label (0, 1, 2)
# after the labelled columns of dfh_1 and dfh_2.
dfh_concat_s = pd.concat([dfh_1,dfh_2,s2,s2,s2], axis=1)
dfh_concat_s

Unnamed: 0,Id,Name,Age,Id.1,Sal,City,0,1,2
0,1,A,21,1.0,100.0,Pune,_0,_0,_0
1,2,B,22,2.0,200.0,Panaji,_1,_1,_1
2,3,C,20,5.0,500.0,,_2,_2,_2
3,4,D,24,,,,_3,_3,_3


In [17]:
# Concatenating Series together with DataFrames along axis=0 (rows).
# The result looks odd at first: as the outputs show, the Series values are
# NOT spread across the existing columns. Instead they land in a new column
# labelled 0, and the Series' own index labels become row labels.

s2 = pd.Series(['_0', '_1', '_2', '_3'])

dfh_concat_s = pd.concat([dfh_1,dfh_2,s2,s2,s2], axis=0)
dfh_concat_s

# Giving the Series an index of column names does not change this: the labels
# 'Id', 'Name', 'Age', ... show up as row labels, still under column 0.
# To add a Series as a row, convert it to a one-row frame first.
s3 = pd.Series(['_0', '_1', '_2', '_3'], index = ['Id','Name', 'Age','Sal'])

dfh_concat_s = pd.concat([dfh_1,dfh_2,s3,s3,s3], axis=0, ignore_index = False)
dfh_concat_s

Unnamed: 0,0,Age,City,Id,Name,Sal
0,,21.0,,1.0,A,
1,,22.0,,2.0,B,
2,,20.0,,3.0,C,
3,,24.0,,4.0,D,
0,,,Pune,1.0,,100.0
1,,,Panaji,2.0,,200.0
2,,,,5.0,,500.0
0,_0,,,,,
1,_1,,,,,
2,_2,,,,,


Unnamed: 0,0,Age,City,Id,Name,Sal
0,,21.0,,1.0,A,
1,,22.0,,2.0,B,
2,,20.0,,3.0,C,
3,,24.0,,4.0,D,
0,,,Pune,1.0,,100.0
1,,,Panaji,2.0,,200.0
2,,,,5.0,,500.0
Id,_0,,,,,
Name,_1,,,,,
Age,_2,,,,,
