In [4]:
import pandas as pd

# create the lists

In [2]:
# One record per animal: (name, species, age, weight, color).
_ANIMAL_RECORDS = [
    ('bob',    'dog',          1,  10,  'brown'),
    ('bella',  'cat',          6,  5,   'black'),
    ('blue',   'velociraptor', 70, 15,  'blue'),
    ('suzy',   'dog',          8,  7,   'black'),
    ('gunter', 'penguin',      3,  4,   'black'),
    ('joe',    'squid',        1,  1,   'gray'),
    ('holly',  'cat',          2,  3,   'white'),
    ('coco',   'cat',          13, 2,   'orange'),
    ('elliot', 'horse',        3,  380, 'white'),
]
# Transpose the records into five parallel lists of equal length.
names, species, age, weight, color = map(list, zip(*_ANIMAL_RECORDS))

# Create one DataFrame for each group of three consecutive elements of the lists

In [5]:
# Build three frames with identical columns, each covering three
# consecutive entries of the source lists (rows 0-2, 3-5 and 6-8).
df1, df2, df3 = (
    pd.DataFrame({'name': names[rows],
                  'species': species[rows],
                  'age': age[rows]})
    for rows in (slice(0, 3), slice(3, 6), slice(6, None))
)
# The extra '\n' argument leaves a blank line between the printed frames.
print(df1, '\n')
print(df2, '\n')
print(df3)


    name       species  age
0    bob           dog    1
1  bella           cat    6
2   blue  velociraptor   70 

     name  species  age
0    suzy      dog    8
1  gunter  penguin    3
2     joe    squid    1 

     name species  age
0   holly     cat    2
1    coco     cat   13
2  elliot   horse    3


# Now imagine these are three dataframes read from different source files. Merge them (union all) into df

In [6]:
# Stack the three frames on top of each other (axis=0 is the row axis).
# Each frame keeps its own 0..2 row labels in the result.
df_list = [df1, df2, df3]
df = pd.concat(df_list, axis=0)
df

Unnamed: 0,name,species,age
0,bob,dog,1
1,bella,cat,6
2,blue,velociraptor,70
0,suzy,dog,8
1,gunter,penguin,3
2,joe,squid,1
0,holly,cat,2
1,coco,cat,13
2,elliot,horse,3


# Now create the dataframes with a few differences in column names: specie instead of species in df1; in df3, age - years instead of age and no species column at all

In [7]:
# Rebuild the three frames with deliberately inconsistent schemas:
#   df1 -> column 'specie' instead of 'species'
#   df2 -> the reference schema (name, species, age)
#   df3 -> no species column, and 'age - years' instead of 'age'
df1 = pd.DataFrame(list(zip(names[:3], species[:3], age[:3])),
                   columns=['name', 'specie', 'age'])
df2 = pd.DataFrame(list(zip(names[3:6], species[3:6], age[3:6])),
                   columns=['name', 'species', 'age'])
df3 = pd.DataFrame(list(zip(names[6:], age[6:])),
                   columns=['name', 'age - years'])
# The extra '\n' argument leaves a blank line between the printed frames.
print(df1, '\n')
print(df2, '\n')
print(df3)


    name        specie  age
0    bob           dog    1
1  bella           cat    6
2   blue  velociraptor   70 

     name  species  age
0    suzy      dog    8
1  gunter  penguin    3
2     joe    squid    1 

     name  age - years
0   holly            2
1    coco           13
2  elliot            3


# Now concat again - what does it do?

In [8]:
# Row-wise concat of frames whose columns differ: the result carries the
# union of all column names, with NaN wherever a frame lacks a column.
df_list = [df1, df2, df3]
df = pd.concat(df_list, axis='index')
df

Unnamed: 0,name,specie,age,species,age - years
0,bob,dog,1.0,,
1,bella,cat,6.0,,
2,blue,velociraptor,70.0,,
0,suzy,,8.0,dog,
1,gunter,,3.0,penguin,
2,joe,,1.0,squid,
0,holly,,,,2.0
1,coco,,,,13.0
2,elliot,,,,3.0


# Note that the index column (the first, numeric value column) kept the same values as in the individual dfs. Change that to have a new unique index (using ignore_index) 

In [13]:
def _animal_frame(rows):
    """Return a name/species/age DataFrame for the given slice of the source lists."""
    return pd.DataFrame({'name': names[rows],
                         'species': species[rows],
                         'age': age[rows]})


# Restore the three frames with matching columns.
df1 = _animal_frame(slice(0, 3))
df2 = _animal_frame(slice(3, 6))
df3 = _animal_frame(slice(6, None))
# No index was given when the frames were built, so their row labels carry
# no meaning; ignore_index=True discards them and numbers the result 0..n-1.
df_list = [df1, df2, df3]
df = pd.concat(df_list, ignore_index=True)
df

Unnamed: 0,name,species,age
0,bob,dog,1
1,bella,cat,6
2,blue,velociraptor,70
3,suzy,dog,8
4,gunter,penguin,3
5,joe,squid,1
6,holly,cat,2
7,coco,cat,13
8,elliot,horse,3


# Now concatenate the following columns: weight and color
This could be done one column at a time (df['color'] = color); however, we want to do it by concatenating a whole dataframe (using axis=1 on concat)

In [11]:
# Frame holding the two extra columns, one row per animal in list order.
df4 = pd.DataFrame(dict(weight=weight, color=color))
df4

Unnamed: 0,weight,color
0,10,brown
1,5,black
2,15,blue
3,7,black
4,4,black
5,1,gray
6,3,white
7,2,orange
8,380,white


In [14]:
# Put df4's columns to the right of df (axis=1 is the column axis).
# The cell reassigns df from itself, so first drop any df4 columns left
# over from an earlier run; otherwise running the cell twice would add
# weight and color a second time. On a first run the drop is a no-op.
df = pd.concat([df.drop(columns=df4.columns, errors='ignore'), df4], axis=1)
df

Unnamed: 0,name,species,age,weight,color
0,bob,dog,1,10,brown
1,bella,cat,6,5,black
2,blue,velociraptor,70,15,blue
3,suzy,dog,8,7,black
4,gunter,penguin,3,4,black
5,joe,squid,1,1,gray
6,holly,cat,2,3,white
7,coco,cat,13,2,orange
8,elliot,horse,3,380,white


# Problem with this approach: we are not doing a join as in SQL, i.e. we are not looking up a key and matching rows on it. It is simply pairing rows up by their index (here, the row number)

# Create a dataset df5 with name and score (new column with a score for each row) for later joining

In [20]:
# One score per animal, with the name column promoted to the index so the
# frame can later be aligned with other frames by name.
df5 = (
    pd.DataFrame({'name': names,
                  'score': [9,10,10,8,6,9,3,4,10]})
    .set_index('name')
)

df5

Unnamed: 0_level_0,score
name,Unnamed: 1_level_1
bob,9
bella,10
blue,10
suzy,8
gunter,6
joe,9
holly,3
coco,4
elliot,10


# Now join with df using name as join column

In [22]:
# Index df by name so that concat can align its rows with df5 on that key.
# The guard keeps the cell re-runnable: once 'name' has become the index it
# is no longer a column, and calling set_index('name') again would raise
# KeyError.
if 'name' in df.columns:
    df = df.set_index('name')

# axis=1 places the frames side by side, aligned on the index;
# join='inner' keeps only the index labels present in both frames.
df_list = [df, df5]
df6 = pd.concat(df_list, axis=1, join='inner')
df6

Unnamed: 0_level_0,species,age,weight,color,score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bob,dog,1,10,brown,9
bella,cat,6,5,black,10
blue,velociraptor,70,15,blue,10
suzy,dog,8,7,black,8
gunter,penguin,3,4,black,6
joe,squid,1,1,gray,9
holly,cat,2,3,white,3
coco,cat,13,2,orange,4
elliot,horse,3,380,white,10


# Append the rows of df2 to df1

In [24]:
# Show the two frames that are about to be combined, one under the other.
print(df1, df2, sep='\n')

    name       species  age
0    bob           dog    1
1  bella           cat    6
2   blue  velociraptor   70
     name  species  age
0    suzy      dog    8
1  gunter  penguin    3
2     joe    squid    1


In [26]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to put the rows of df2 under those of df1.
# As with append, each frame keeps its own row labels (0..2 twice).
df7 = pd.concat([df1, df2])
print(df7)

     name       species  age
0     bob           dog    1
1   bella           cat    6
2    blue  velociraptor   70
0    suzy           dog    8
1  gunter       penguin    3
2     joe         squid    1


# Append a row

In [28]:
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in
# a one-row DataFrame and added with pd.concat instead.
# ignore_index=True renumbers the result 0..n-1.
df8 = pd.concat(
    [df7, pd.DataFrame([{'name': 'oliver', 'species': 'monkey', 'age': 13}])],
    ignore_index=True,
)
df8

Unnamed: 0,name,species,age
0,bob,dog,1
1,bella,cat,6
2,blue,velociraptor,70
3,suzy,dog,8
4,gunter,penguin,3
5,joe,squid,1
6,oliver,monkey,13


# Now use merge to join a new dataframe with score to the original df

In [29]:
# Scores for every animal except the first one in the names list, so the
# two frames do not cover exactly the same set of names.
df10 = pd.DataFrame(dict(name=names[1:],
                         score=[10,10,8,6,9,3,4,10]))
# Print the new frame followed by the frame it will be merged with.
print(df10, df, sep='\n')

     name  score
0   bella     10
1    blue     10
2    suzy      8
3  gunter      6
4     joe      9
5   holly      3
6    coco      4
7  elliot     10
             species  age  weight   color
name                                     
bob              dog    1      10   brown
bella            cat    6       5   black
blue    velociraptor   70      15    blue
suzy             dog    8       7   black
gunter       penguin    3       4   black
joe            squid    1       1    gray
holly            cat    2       3   white
coco             cat   13       2  orange
elliot         horse    3     380   white


# Use function merge to join the two into a new dataframe with df and score

In [31]:
# Key-based join on 'name' (the index level of df, a column of df10).
# how='inner' is merge's default: only names present in both frames remain.
merged_df = df.merge(df10, how='inner', on='name')
merged_df

Unnamed: 0,name,species,age,weight,color,score
0,bella,cat,6,5,black,10
1,blue,velociraptor,70,15,blue,10
2,suzy,dog,8,7,black,8
3,gunter,penguin,3,4,black,6
4,joe,squid,1,1,gray,9
5,holly,cat,2,3,white,3
6,coco,cat,13,2,orange,4
7,elliot,horse,3,380,white,10


# Now create a smaller dataframe and try the right and left joins

In [32]:
# Scores for the last six animals only (names[3:]).
df11 = pd.DataFrame(dict(name=names[3:],
                         score=[8,6,9,3,4,10]))
# Print the new frame followed by the frame it will be merged with.
print(df11, df, sep='\n')

     name  score
0    suzy      8
1  gunter      6
2     joe      9
3   holly      3
4    coco      4
5  elliot     10
             species  age  weight   color
name                                     
bob              dog    1      10   brown
bella            cat    6       5   black
blue    velociraptor   70      15    blue
suzy             dog    8       7   black
gunter       penguin    3       4   black
joe            squid    1       1    gray
holly            cat    2       3   white
coco             cat   13       2  orange
elliot         horse    3     380   white


In [33]:
# Right join: keep every row of the right-hand frame (df); the score is
# NaN for names that have no match in df11.
merged_df2 = df11.merge(df, how='right', on='name')
merged_df2

Unnamed: 0,name,score,species,age,weight,color
0,bob,,dog,1,10,brown
1,bella,,cat,6,5,black
2,blue,,velociraptor,70,15,blue
3,suzy,8.0,dog,8,7,black
4,gunter,6.0,penguin,3,4,black
5,joe,9.0,squid,1,1,gray
6,holly,3.0,cat,2,3,white
7,coco,4.0,cat,13,2,orange
8,elliot,10.0,horse,3,380,white


# You can also specify the index to be name, then use it in joins

In [38]:
# df is already indexed by name at this point, so only df11 needs its
# 'name' column promoted to the index. The guard keeps the cell
# re-runnable: once 'name' is the index it is no longer a column, and
# calling set_index('name') again would raise KeyError.
if 'name' in df11.columns:
    df11 = df11.set_index('name')
print(df)
print(df11)

             species  age  weight   color
name                                     
bob              dog    1      10   brown
bella            cat    6       5   black
blue    velociraptor   70      15    blue
suzy             dog    8       7   black
gunter       penguin    3       4   black
joe            squid    1       1    gray
holly            cat    2       3   white
coco             cat   13       2  orange
elliot         horse    3     380   white
        score
name         
suzy        8
gunter      6
joe         9
holly       3
coco        4
elliot     10


In [42]:
# 'name' is the index of both frames here; merge accepts an index level
# name in `on` as well as a column name.
print(df.merge(df11, on='name'))

        species  age  weight   color  score
name                                       
suzy        dog    8       7   black      8
gunter  penguin    3       4   black      6
joe       squid    1       1    gray      9
holly       cat    2       3   white      3
coco        cat   13       2  orange      4
elliot    horse    3     380   white     10


In [43]:
# Join on the index of both frames; inner keeps only the shared names.
df.merge(df11, how='inner', left_index=True, right_index=True)

Unnamed: 0_level_0,species,age,weight,color,score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
suzy,dog,8,7,black,8
gunter,penguin,3,4,black,6
joe,squid,1,1,gray,9
holly,cat,2,3,white,3
coco,cat,13,2,orange,4
elliot,horse,3,380,white,10


In [44]:
# Join on the index of both frames; outer keeps every name from either
# frame, leaving score as NaN where df11 has no entry.
df.merge(df11, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,species,age,weight,color,score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bella,cat,6,5,black,
blue,velociraptor,70,15,blue,
bob,dog,1,10,brown,
coco,cat,13,2,orange,4.0
elliot,horse,3,380,white,10.0
gunter,penguin,3,4,black,6.0
holly,cat,2,3,white,3.0
joe,squid,1,1,gray,9.0
suzy,dog,8,7,black,8.0
