In [2]:
import pandas as pd

## Let's see an example of where we use the merge function

In [10]:
# Staff roster: one row per person, keyed by "Name" so the frame can be
# joined on its index in the merge examples.
staff_df = pd.DataFrame(
    {
        "Name": ["Francisco", "Javier", "Jonathan", "Kristina"],
        "Role": ["Student", "Teacher", "Scholar", "Graduate"],
    }
).set_index("Name")

# Bare last expression so the notebook renders the frame.
staff_df

Unnamed: 0_level_0,Role
Name,Unnamed: 1_level_1
Francisco,Student
Javier,Teacher
Jonathan,Scholar
Kristina,Graduate


In [11]:
# Student roster keyed by "Name"; it only partially overlaps with the staff
# roster, which is what makes the join types below produce different results.
student_df = pd.DataFrame(
    [
        ("Kristina", "Graphic Designer"),
        ("Jonathan", "Computer Science"),
        ("Alex", "Law"),
        ("Francisco", "Physics"),
        ("Sally", "Engineering"),
    ],
    columns=["Name", "School"],
).set_index("Name")

# Bare last expression so the notebook renders the frame.
student_df

Unnamed: 0_level_0,School
Name,Unnamed: 1_level_1
Kristina,Graphic Designer
Jonathan,Computer Science
Alex,Law
Francisco,Physics
Sally,Engineering


In [12]:
# Plain-text dump of both frames in a single call: print() puts a space after
# the first frame, then the two newlines leave a blank line before the second.
print(staff_df, "\n" * 2 + str(student_df))

               Role
Name               
Francisco   Student
Javier      Teacher
Jonathan    Scholar
Kristina   Graduate 

                     School
Name                       
Kristina   Graphic Designer
Jonathan   Computer Science
Alex                    Law
Francisco           Physics
Sally           Engineering


## Union of these DataFrames:

In [13]:
# Outer join on the "Name" index: keeps every person found in either frame.
staff_df.merge(student_df, how="outer", left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alex,,Law
Francisco,Student,Physics
Javier,Teacher,
Jonathan,Scholar,Computer Science
Kristina,Graduate,Graphic Designer
Sally,,Engineering


## Intersection of these DataFrames:

In [14]:
# Inner join on the "Name" index: keeps only people present in both frames.
staff_df.merge(student_df, how="inner", left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Francisco,Student,Physics
Jonathan,Scholar,Computer Science
Kristina,Graduate,Graphic Designer


## Set addition: left and right joins (order is important)

In [15]:
# Left join on the "Name" index: every staff member is kept, with school
# details filled in where a matching student row exists.
staff_df.merge(student_df, how="left", left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Francisco,Student,Physics
Javier,Teacher,
Jonathan,Scholar,Computer Science
Kristina,Graduate,Graphic Designer


In [16]:
# Right join on the "Name" index: every student is kept, with the staff role
# filled in where a matching staff row exists.
staff_df.merge(student_df, how="right", left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kristina,Graduate,Graphic Designer
Jonathan,Scholar,Computer Science
Alex,,Law
Francisco,Student,Physics
Sally,,Engineering


## You don't need to use indices to join on; you can use columns instead

In [17]:
# First, let's move "Name" out of the index and back into a regular column
# in both DataFrames.
# The guard keeps the cell idempotent: calling reset_index() a second time
# (e.g. when the cell is re-run) would otherwise push the default integer
# index into a stray "index" column.
if staff_df.index.name == "Name":
    staff_df = staff_df.reset_index()
if student_df.index.name == "Name":
    student_df = student_df.reset_index()

In [18]:
# Join on the shared "Name" column through the "on" parameter instead of
# the index; a right join keeps every student row.
staff_df.merge(student_df, how="right", on="Name")

Unnamed: 0,Name,Role,School
0,Francisco,Student,Physics
1,Jonathan,Scholar,Computer Science
2,Kristina,Graduate,Graphic Designer
3,Alex,,Law
4,Sally,,Engineering


## What happens when we have conflicts between the DataFrames?

In [22]:
# Staff roster extended with a "Location" column. The student roster carries
# a column of the same name, which creates the conflict demonstrated below.
staff_df = pd.DataFrame(
    {
        "Name": ["Francisco", "Javier", "Jonathan", "Kristina"],
        "Role": ["Student", "Teacher", "Scholar", "Graduate"],
        "Location": ["MEX", "USA", "RUS", "MEX"],
    }
).set_index("Name")

# Bare last expression so the notebook renders the frame.
staff_df

Unnamed: 0_level_0,Role,Location
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Francisco,Student,MEX
Javier,Teacher,USA
Jonathan,Scholar,RUS
Kristina,Graduate,MEX


In [23]:
# Student roster extended with its own "Location" column, which does not
# always agree with the staff roster's value for the same person.
student_df = pd.DataFrame(
    [
        ("Kristina", "Graphic Designer", "USA"),
        ("Jonathan", "Computer Science", "RUS"),
        ("Alex", "Law", "MEX"),
        ("Francisco", "Physics", "USA"),
        ("Sally", "Engineering", "SWE"),
    ],
    columns=["Name", "School", "Location"],
).set_index("Name")

# Bare last expression so the notebook renders the frame.
student_df

Unnamed: 0_level_0,School,Location
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kristina,Graphic Designer,USA
Jonathan,Computer Science,RUS
Alex,Law,MEX
Francisco,Physics,USA
Sally,Engineering,SWE


In [25]:
# Both frames have a "Location" column, so the merged result gets one copy
# per side: Location_x comes from the left DataFrame (staff_df) and
# Location_y from the right DataFrame (student_df). The suffixes passed here
# are pandas' defaults, spelled out to make that naming visible.
staff_df.merge(student_df, how="left", on="Name", suffixes=("_x", "_y"))

Unnamed: 0_level_0,Role,Location_x,School,Location_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Francisco,Student,MEX,Physics,USA
Javier,Teacher,USA,,
Jonathan,Scholar,RUS,Computer Science,RUS
Kristina,Graduate,MEX,Graphic Designer,USA


## Multi-indexing and Multiple columns

In [35]:
# Staff roster with the name split into first and last, so a row can only be
# identified by the combination of both columns.
staff_df = pd.DataFrame(
    {
        "First Name": ["Francisco", "Javier", "Jonathan", "Kristina"],
        "Last Name": ["Morales", "Murguía", "Morales", "Guerrero"],
        "Role": ["Student", "Teacher", "Scholar", "Graduate"],
        "Location": ["MEX", "USA", "RUS", "MEX"],
    }
)

# Bare last expression so the notebook renders the frame.
staff_df

Unnamed: 0,First Name,Last Name,Role,Location
0,Francisco,Morales,Student,MEX
1,Javier,Murguía,Teacher,USA
2,Jonathan,Morales,Scholar,RUS
3,Kristina,Guerrero,Graduate,MEX


In [36]:
# Student roster with first and last names. "Jonathan" has a different last
# name here than in the staff roster, so a two-column join will not match him.
student_df = pd.DataFrame(
    [
        ("Kristina", "Guerrero", "Graphic Designer", "USA"),
        ("Jonathan", "Murguía", "Computer Science", "RUS"),
        ("Alex", "Brooks", "Law", "MEX"),
        ("Francisco", "Morales", "Physics", "USA"),
        ("Sally", "Wilde", "Engineering", "SWE"),
    ],
    columns=["First Name", "Last Name", "School", "Location"],
)

# Bare last expression so the notebook renders the frame.
student_df

Unnamed: 0,First Name,Last Name,School,Location
0,Kristina,Guerrero,Graphic Designer,USA
1,Jonathan,Murguía,Computer Science,RUS
2,Alex,Brooks,Law,MEX
3,Francisco,Morales,Physics,USA
4,Sally,Wilde,Engineering,SWE


In [37]:
# Inner join on the (first name, last name) pair: a row matches only when
# both columns agree in the two frames.
staff_df.merge(student_df, how="inner", on=["First Name", "Last Name"])

Unnamed: 0,First Name,Last Name,Role,Location_x,School,Location_y
0,Francisco,Morales,Student,MEX,Physics,USA
1,Kristina,Guerrero,Graduate,MEX,Graphic Designer,USA


In [38]:
# To suppress some of the Jupyter warning messages and just tell read_csv to ignore bad lines,
# we're going to start the cell with a cell magic called %%capture


In [41]:
%%capture

# Load the three yearly extracts, dropping malformed rows instead of raising.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
# where it raises a TypeError; `on_bad_lines="skip"` is its replacement.
# Use on_bad_lines="warn" instead if each skipped row should be reported.
df_2011 = pd.read_csv("Datasets/MERGED2011_12_PP.csv", on_bad_lines="skip")
df_2012 = pd.read_csv("Datasets/MERGED2012_13_PP.csv", on_bad_lines="skip")
df_2013 = pd.read_csv("Datasets/MERGED2013_14_PP.csv", on_bad_lines="skip")

In [42]:
# Preview the first five rows of the 2011 extract.
df_2011.head(n=5)

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,


In [48]:
# Stack the three yearly extracts on top of each other (row-wise).
# `frames` is kept as its own name because a later cell concatenates it again.
frames = [df_2011, df_2012, df_2013]
concatenated = pd.concat(objs=frames, axis=0)

# Bare last expression so the notebook renders the (truncated) frame.
concatenated

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799,48285703,157107,1571,Georgia Military College-Columbus Campus,Columbus,GA,31909,,,,...,,,,,,,,,,
7800,48285704,157101,1571,Georgia Military College-Valdosta Campus,Valdosta,GA,31605,,,,...,,,,,,,,,,
7801,48285705,157105,1571,Georgia Military College-Warner Robins Campus,Warner Robins,GA,31093,,,,...,,,,,,,,,,
7802,48285706,157100,1571,Georgia Military College-Online,Milledgeville,GA,31061,,,,...,,,,,,,,,,


In [49]:
# Total number of rows across the three yearly extracts.
sum(map(len, (df_2011, df_2012, df_2013)))

23272

In [50]:
# Concatenate again, this time labelling each source frame with its year;
# the labels become the outer level of the resulting row index.
pd.concat(objs=frames, keys=["2011", "2012", "2013"])

Unnamed: 0,Unnamed: 1,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
2011,0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
2011,1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2011,2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
2011,3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
2011,4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,7799,48285703,157107,1571,Georgia Military College-Columbus Campus,Columbus,GA,31909,,,,...,,,,,,,,,,
2013,7800,48285704,157101,1571,Georgia Military College-Valdosta Campus,Valdosta,GA,31605,,,,...,,,,,,,,,,
2013,7801,48285705,157105,1571,Georgia Military College-Warner Robins Campus,Warner Robins,GA,31093,,,,...,,,,,,,,,,
2013,7802,48285706,157100,1571,Georgia Military College-Online,Milledgeville,GA,31061,,,,...,,,,,,,,,,
