# Pandas Merges
<mark>A little help with merging 2 pandas DataFrames (useful for Project 1)</mark><br>
See McKinney, section 8.2<br>

In [1]:
# Plotting and data-handling libraries used by the notebook
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'ggplot' style sheet to matplotlib figures
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
# (with 'all', every top-level expression in a cell is echoed, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used below to inspect the loaded extensions
from IPython import get_ipython
ipython = get_ipython()

# Draw figures inline in the notebook and use SVG as the figure format
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Expose the utils folder (where the utils package keeps shared code)
# by putting the parent of the current working directory on sys.path.
import os
import sys

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Append only when absent, so running this cell again adds nothing.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# autoreload extension
# Load it only when it is not already among the shell's loaded extensions
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload all modules before each cell is executed
%autoreload 2

In [2]:
#the following code's purpose is to display 2 dataframes side by side
from IPython.display import display_html
def display_side_by_side(*args):
    """Render any number of DataFrames next to each other in the cell output.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, left to right. Each one is converted with ``to_html()``.

    Returns
    -------
    None
        The combined HTML is handed to ``display_html``; nothing is returned.
    """
    # Four non-breaking spaces (properly terminated entities) precede each table.
    spacer = '&nbsp;&nbsp;&nbsp;&nbsp;'
    # Style only the opening <table ...> tag of each frame. Replacing the bare
    # word 'table' would also rewrite the closing </table> tag (invalid markup)
    # and any cell text that happens to contain the word "table".
    html_str = ''.join(
        spacer + df.to_html().replace('<table', '<table style="display:inline"', 1)
        for df in args
    )
    display_html(html_str, raw=True)



![](./media/joins.png)



## Create Dataframes


In [3]:
# Left-hand frame: key column with repeated 'a' and 'b' values plus a running counter.
x = pd.DataFrame({
    'xkey': list('aaabbc'),
    'xdata': range(6),
})

# Right-hand frame: one row per key; 'd' has no counterpart in x.
y = pd.DataFrame({
    'ykey': list('abd'),
    'ydata': range(10, 13),
})

In [4]:
# lets see what overlaps
xk = x.loc[:, 'xkey']
yk = y.loc[:, 'ykey']

# Stack the two key columns into a single Series with a fresh 0..n-1 index.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
xyk = pd.concat([xk, yk], ignore_index=True)

#how many unique values are there if they are combined
print(f' unique key values in combined x and y is {xyk.unique()}')

# keys that occur in both frames
print(f' unique key values present in both x and y is {set(yk).intersection(set(xk))}')
display_side_by_side(x, y)

 unique key values in combined x and y is ['a' 'b' 'c' 'd']
 unique key values present in both x and y is {'b', 'a'}


Unnamed: 0,xkey,xdata
0,a,0
1,a,1
2,a,2
3,b,3
4,b,4
5,c,5

Unnamed: 0,ykey,ydata
0,a,10
1,b,11
2,d,12


## An Inner Join
Returns the intersection, i.e. <u>all the rows in x and y that have matching keys.</u><br>

In [5]:
# Inner join of x and y on xkey == ykey, shown next to its two inputs.
ndf1 = x.merge(y, how='inner', left_on='xkey', right_on='ykey')
display_side_by_side(ndf1, x, y)

Unnamed: 0,xkey,xdata,ykey,ydata
0,a,0,a,10
1,a,1,a,10
2,a,2,a,10
3,b,3,b,11
4,b,4,b,11

Unnamed: 0,xkey,xdata
0,a,0
1,a,1
2,a,2
3,b,3
4,b,4
5,c,5

Unnamed: 0,ykey,ydata
0,a,10
1,b,11
2,d,12


## An Outer Join
Returns the union, i.e. all the rows in x and y. If a key is missing in either x or y, NaNs are inserted for the missing side.

In [148]:
# Outer join of x and y on xkey == ykey, shown next to its two inputs.
ndf2 = x.merge(y, how='outer', left_on='xkey', right_on='ykey')
display_side_by_side(ndf2, x, y)

Unnamed: 0,xkey,xdata,ykey,ydata
0,a,0.0,a,10.0
1,a,1.0,a,10.0
2,a,2.0,a,10.0
3,b,3.0,b,11.0
4,b,4.0,b,11.0
5,c,5.0,,
6,,,d,12.0

Unnamed: 0,xkey,xdata
0,a,0
1,a,1
2,a,2
3,b,3
4,b,4
5,c,5

Unnamed: 0,ykey,ydata
0,a,10
1,b,11
2,d,12


## A Left Join
Returns a DataFrame using all keys found in the left table.<br>
If a key in the left table has no corresponding match in the right table, the right table's columns are filled with NaN.<br>

In [151]:
# Left join of x and y on xkey == ykey, shown next to its two inputs.
ndf3 = x.merge(y, how='left', left_on='xkey', right_on='ykey')
display_side_by_side(ndf3, x, y)

Unnamed: 0,xkey,xdata,ykey,ydata
0,a,0,,
1,a,1,,
2,a,2,,
3,b,3,b,10.0
4,b,3,b,11.0
5,b,3,b,12.0
6,b,4,b,10.0
7,b,4,b,11.0
8,b,4,b,12.0
9,c,5,,

Unnamed: 0,xkey,xdata
0,a,0
1,a,1
2,a,2
3,b,3
4,b,4
5,c,5

Unnamed: 0,ykey,ydata
0,b,10
1,b,11
2,b,12


## A Right Join
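Returns a DataFrame using all keys found in the right table.<br>
If a key in the right table has no corresponding match in the left table, the left table's columns are filled with NaN.<br>
Every row of the right DataFrame is kept (once per matching left row), so the result is not generally the same length as the left DataFrame; here it has 6 rows only because key 'a' matches 3 left rows, key 'b' matches 2, and 'd' is kept unmatched.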

In [143]:
# Right join of x and y on xkey == ykey, shown next to its two inputs.
ndf4 = x.merge(y, how='right', left_on='xkey', right_on='ykey')
display_side_by_side(ndf4, x, y)

Unnamed: 0,xkey,xdata,ykey,ydata
0,a,0.0,a,10
1,a,1.0,a,10
2,a,2.0,a,10
3,b,3.0,b,11
4,b,4.0,b,11
5,,,d,12

Unnamed: 0,xkey,xdata
0,a,0
1,a,1
2,a,2
3,b,3
4,b,4
5,c,5

Unnamed: 0,ykey,ydata
0,a,10
1,b,11
2,d,12


## Note what happens when you have duplicates in the column you are joining on
Joins are many-to-many, so if you have duplicates in both key columns,<br>
say 2 b's in x and 3 b's in y, the result of an inner join will be the Cartesian product of the two, i.e. 2x3=6 rows.

In [150]:
# Rebuild y so that all three rows carry the key 'b' (ydata 10, 11, 12).
# NOTE(review): this rebinds the name y, replacing the earlier frame with keys
# a, b, d. Join cells that are re-run after this cell will use this version,
# so their results depend on execution order - consider a separate name.
y = pd.DataFrame({'ykey':['b', 'b', 'b'],
               'ydata': range(10,13)})