[Reference](https://medium.com/@akaivdo/pandas-data-combination-1-merge-81fbdf506c85)

In [7]:
import pandas as pd

In [8]:
# Exam results, written row by row so each record reads like the rendered
# table, then transposed into one list per column. "id" values are strings.
df_scores = pd.DataFrame(
    {
        column: list(values)
        for column, values in zip(
            ("id", "fname", "lname", "gender", "subject", "score"),
            zip(
                ("1003", "Mike", "Brown", "M", "Math", 85),
                ("1004", "Tom", "Davis", "M", "Art", 78),
                ("1005", "Mary", "Clark", "F", "Engilish", 88),
                ("1006", "Bob", "Lopez", "M", "Physics", 90),
                ("1007", "Kevin", "Wilson", "M", "Music", 80),
            ),
        )
    }
)
df_scores

Unnamed: 0,id,fname,lname,gender,subject,score
0,1003,Mike,Brown,M,Math,85
1,1004,Tom,Davis,M,Art,78
2,1005,Mary,Clark,F,Engilish,88
3,1006,Bob,Lopez,M,Physics,90
4,1007,Kevin,Wilson,M,Music,80


In [9]:
# Student roster with string "id" keys 1001-1005. Only 1003-1005 also appear
# in df_scores, which is what makes the join types below differ.
# pandas is already imported as `pd` in the first code cell, so it is not
# re-imported here.
df_students = pd.DataFrame({
    "id": ["1001", "1002", "1003", "1004", "1005"],
    "first_name": ["John", "Sarah", "Mike", "Tom", "Mary"],
    "last_name": ["Doe", "Smith", "Brown", "Davis", "Clark"],
    "gender": ["M", "F", "M", "M", "F"]
})
df_students

Unnamed: 0,id,first_name,last_name,gender
0,1001,John,Doe,M
1,1002,Sarah,Smith,F
2,1003,Mike,Brown,M
3,1004,Tom,Davis,M
4,1005,Mary,Clark,F


# JOIN
## INNER JOIN


In [10]:
# With no `how` argument, merge performs an inner join: only ids present in
# both frames are kept. The shared "gender" column gets _x / _y suffixes.
df_scores.merge(df_students, on="id")

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
0,1003,Mike,Brown,M,Math,85,Mike,Brown,M
1,1004,Tom,Davis,M,Art,78,Tom,Davis,M
2,1005,Mary,Clark,F,Engilish,88,Mary,Clark,F


In [11]:
# Equivalent spelling with the join type stated explicitly.
pd.merge(left=df_scores, right=df_students, how="inner", on="id")

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
0,1003,Mike,Brown,M,Math,85,Mike,Brown,M
1,1004,Tom,Davis,M,Art,78,Tom,Davis,M
2,1005,Mary,Clark,F,Engilish,88,Mary,Clark,F


## LEFT JOIN


In [12]:
# Keep every row of df_scores; columns coming from df_students are empty for
# ids that have no match there.
pd.merge(left=df_scores, right=df_students, how="left", on="id")

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
0,1003,Mike,Brown,M,Math,85,Mike,Brown,M
1,1004,Tom,Davis,M,Art,78,Tom,Davis,M
2,1005,Mary,Clark,F,Engilish,88,Mary,Clark,F
3,1006,Bob,Lopez,M,Physics,90,,,
4,1007,Kevin,Wilson,M,Music,80,,,


## RIGHT JOIN


In [13]:
# Keep every row of df_students; columns coming from df_scores are empty for
# ids that have no match there (note "score" is shown as float as a result).
pd.merge(left=df_scores, right=df_students, how="right", on="id")

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
0,1001,,,,,,John,Doe,M
1,1002,,,,,,Sarah,Smith,F
2,1003,Mike,Brown,M,Math,85.0,Mike,Brown,M
3,1004,Tom,Davis,M,Art,78.0,Tom,Davis,M
4,1005,Mary,Clark,F,Engilish,88.0,Mary,Clark,F


## OUTER JOIN


In [14]:
# Keep the ids from both frames; whichever side has no match is left empty.
pd.merge(left=df_scores, right=df_students, how="outer", on="id")

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
0,1003,Mike,Brown,M,Math,85.0,Mike,Brown,M
1,1004,Tom,Davis,M,Art,78.0,Tom,Davis,M
2,1005,Mary,Clark,F,Engilish,88.0,Mary,Clark,F
3,1006,Bob,Lopez,M,Physics,90.0,,,
4,1007,Kevin,Wilson,M,Music,80.0,,,
5,1001,,,,,,John,Doe,M
6,1002,,,,,,Sarah,Smith,F


## CROSS JOIN


In [15]:
# Three-row, single-column frame used as the left side of the cross join.
df_left = pd.DataFrame({"col_1": ["left1", "left2", "left3"]})
df_left

Unnamed: 0,col_1
0,left1
1,left2
2,left3


In [16]:
# Two-row, single-column frame used as the right side of the cross join.
df_right = pd.DataFrame({"col_1": ["right1", "right2"]})
df_right

Unnamed: 0,col_1
0,right1
1,right2


In [17]:
# Cartesian product: every row of df_left paired with every row of df_right.
# No key is given; the identically named columns get _x / _y suffixes.
df_left.merge(df_right, how="cross")

Unnamed: 0,col_1_x,col_1_y
0,left1,right1
1,left1,right2
2,left2,right1
3,left2,right2
4,left3,right1
5,left3,right2


## LEFT EXCLUSIVE/ANTI JOIN


In [18]:
# indicator=True adds a "_merge" column labelling each row as
# left_only, right_only or both; the anti joins below filter on it.
df_scores.merge(df_students, how="outer", on="id", indicator=True)

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y,_merge
0,1003,Mike,Brown,M,Math,85.0,Mike,Brown,M,both
1,1004,Tom,Davis,M,Art,78.0,Tom,Davis,M,both
2,1005,Mary,Clark,F,Engilish,88.0,Mary,Clark,F,both
3,1006,Bob,Lopez,M,Physics,90.0,,,,left_only
4,1007,Kevin,Wilson,M,Music,80.0,,,,left_only
5,1001,,,,,,John,Doe,M,right_only
6,1002,,,,,,Sarah,Smith,F,right_only


In [19]:
# Keep only the rows whose id is found in df_scores but not in df_students.
(
    df_scores
    .merge(df_students, how="outer", on="id", indicator=True)
    .query("_merge == 'left_only'")
)

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y,_merge
3,1006,Bob,Lopez,M,Physics,90.0,,,,left_only
4,1007,Kevin,Wilson,M,Music,80.0,,,,left_only


In [20]:
# Left anti join: same filter on the indicator, then drop the helper
# "_merge" column so only the data columns remain.
(
    df_scores
    .merge(df_students, how="outer", on="id", indicator=True)
    .query("_merge == 'left_only'")
    .drop(columns="_merge")
)

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
3,1006,Bob,Lopez,M,Physics,90.0,,,
4,1007,Kevin,Wilson,M,Music,80.0,,,


## RIGHT EXCLUSIVE/ANTI JOIN


In [21]:
# Right anti join: rows whose id is found in df_students but not in df_scores.
(
    df_scores
    .merge(df_students, how="outer", on="id", indicator=True)
    .query("_merge == 'right_only'")
    .drop(columns="_merge")
)

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
5,1001,,,,,,John,Doe,M
6,1002,,,,,,Sarah,Smith,F


## FULL OUTER EXCLUSIVE/ANTI JOIN


In [22]:
# Full outer anti join: rows whose id is found in exactly one of the frames.
(
    df_scores
    .merge(df_students, how="outer", on="id", indicator=True)
    .query("_merge == 'left_only' or _merge == 'right_only'")
    .drop(columns="_merge")
)

Unnamed: 0,id,fname,lname,gender_x,subject,score,first_name,last_name,gender_y
3,1006,Bob,Lopez,M,Physics,90.0,,,
4,1007,Kevin,Wilson,M,Music,80.0,,,
5,1001,,,,,,John,Doe,M
6,1002,,,,,,Sarah,Smith,F


## JOIN by multiple columns


In [23]:
# Match on the (first name, last name) pair. The key columns are named
# differently in each frame, so left_on / right_on are used instead of `on`;
# "id" is then an ordinary shared column and comes out as id_x / id_y.
pd.merge(
    left=df_scores,
    right=df_students,
    how="inner",
    left_on=["fname", "lname"],
    right_on=["first_name", "last_name"],
)

Unnamed: 0,id_x,fname,lname,gender_x,subject,score,id_y,first_name,last_name,gender_y
0,1003,Mike,Brown,M,Math,85,1003,Mike,Brown,M
1,1004,Tom,Davis,M,Art,78,1004,Tom,Davis,M
2,1005,Mary,Clark,F,Engilish,88,1005,Mary,Clark,F


## Removing duplicate columns


In [24]:
# Columns found in df_students but not in df_scores, followed by the "id"
# join key (which both frames share and the merge still needs).
target_cols = [*df_students.columns.difference(df_scores.columns), "id"]
target_cols

['first_name', 'last_name', 'id']

In [25]:
# Restrict the right-hand frame (df_students) to target_cols before merging,
# so the shared "gender" column is not duplicated as gender_x / gender_y.
pd.merge(
    left=df_scores,
    right=df_students.loc[:, target_cols],
    how="inner",
    on="id",
)

Unnamed: 0,id,fname,lname,gender,subject,score,first_name,last_name
0,1003,Mike,Brown,M,Math,85,Mike,Brown
1,1004,Tom,Davis,M,Art,78,Tom,Davis
2,1005,Mary,Clark,F,Engilish,88,Mary,Clark


## Specify suffixes for columns with the same name


In [26]:
# Replace the default _x / _y suffixes on the overlapping "gender" column
# with more descriptive ones.
df_scores.merge(
    df_students,
    how="inner",
    on="id",
    suffixes=("_left", "_right"),
)

Unnamed: 0,id,fname,lname,gender_left,subject,score,first_name,last_name,gender_right
0,1003,Mike,Brown,M,Math,85,Mike,Brown,M
1,1004,Tom,Davis,M,Art,78,Tom,Davis,M
2,1005,Mary,Clark,F,Engilish,88,Mary,Clark,F
