In [1]:
import pandas as pd
import numpy as np

In [None]:
### categories of joins

In [7]:
# Two toy tables that share a "group" column. "Engineering" occurs twice in
# each of them, so joining on "group" is a many-to-many join.
df1 = pd.DataFrame(
    dict(
        employee=["Bob", "Jake", "Lisa", "Sue"],
        group=["Accounting", "Engineering", "Engineering", "HR"],
    )
)
df5 = pd.DataFrame(
    dict(
        group=["Accounting", "Engineering", "Engineering", "HR"],
        skills=["math", "coding", "linux", "spreadsheets"],
    )
)

In [9]:
# Display the employee -> group table.
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [10]:
# Display the group -> skills table ("Engineering" is listed twice).
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Engineering,coding
2,Engineering,linux
3,HR,spreadsheets


In [11]:
# No key is given, so the merge joins on the columns the two frames share
# (here only "group"). "Engineering" repeats on both sides, so every
# Engineering employee is paired with every Engineering skill.
print(df1.merge(df5))

  employee        group        skills
0      Bob   Accounting          math
1     Jake  Engineering        coding
2     Jake  Engineering         linux
3     Lisa  Engineering        coding
4     Lisa  Engineering         linux
5      Sue           HR  spreadsheets


In [None]:
### Specifying the merge key

In [12]:
# Same join as before, with the key column named explicitly.
print(df1.merge(df5, on="group"))

  employee        group        skills
0      Bob   Accounting          math
1     Jake  Engineering        coding
2     Jake  Engineering         linux
3     Lisa  Engineering        coding
4     Lisa  Engineering         linux
5      Sue           HR  spreadsheets


In [13]:
# The key column is called "employee" in df1 but "name" in df3, so each
# side's key is named separately with left_on / right_on. Both key columns
# are kept in the result.
df3 = pd.DataFrame(
    dict(
        name=["Bob", "Jake", "Lisa", "Sue"],
        salary=[70000, 80000, 120000, 90000],
    )
)
print(df1.merge(df3, left_on="employee", right_on="name"))

  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000


In [15]:
# Same merge, then discard the "name" column, which duplicates "employee".
print(
    df1.merge(df3, left_on="employee", right_on="name").drop(columns="name")
)

  employee        group  salary
0      Bob   Accounting   70000
1     Jake  Engineering   80000
2     Lisa  Engineering  120000
3      Sue           HR   90000


In [18]:
df2 = pd.DataFrame(
    dict(
        employee=["Lisa", "Bob", "Jake", "Sue"],
        hire_date=[2004, 2008, 2012, 2014],
    )
)

# Move the shared key into the index of both frames, then merge on the
# indexes (left_index / right_index) instead of on a column.
df1a, df2a = df1.set_index("employee"), df2.set_index("employee")

print(df1a.merge(df2a, left_index=True, right_index=True))

                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [19]:
# DataFrame.join aligns on the index; how="left" is its default, spelled out
# here for the reader.
print(df1a.join(df2a, how="left"))

                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [20]:
# Mixed form: match df1a's index against df3's "name" column.
print(df1a.merge(df3, left_index=True, right_on="name"))

         group  name  salary
0   Accounting   Bob   70000
1  Engineering  Jake   80000
2  Engineering  Lisa  120000
3           HR   Sue   90000


In [None]:
### Set arithmetic for joins: the `how` keyword

In [21]:
# Two frames whose "name" columns overlap only on "Mary".
df6 = pd.DataFrame(
    dict(name=["Peter", "Paul", "Mary"], food=["fish", "beans", "bread"]),
    columns=["name", "food"],
)
df7 = pd.DataFrame(
    dict(name=["Mary", "Joseph"], drink=["wine", "beer"]),
    columns=["name", "drink"],
)

# Inner join: keep only the keys present in both frames.
print(df6.merge(df7, how="inner"))

   name   food drink
0  Mary  bread  wine


In [22]:
# Outer join: keep the keys from either frame; cells with no match are NaN.
print(df6.merge(df7, how="outer"))

     name   food drink
0  Joseph    NaN  beer
1    Mary  bread  wine
2    Paul  beans   NaN
3   Peter   fish   NaN


In [23]:
# Left join: keep every row of df6; "drink" is NaN where df7 has no match.
print(df6.merge(df7, how="left"))

    name   food drink
0  Peter   fish   NaN
1   Paul  beans   NaN
2   Mary  bread  wine


In [None]:
### Overlapping column names: the `suffixes` keyword

In [24]:
# Both frames carry a non-key "rank" column; `suffixes` names the two copies
# in the merged result.
df8 = pd.DataFrame(
    dict(name=["Bob", "Jake", "Lisa", "Sue"], rank=[1, 2, 3, 4])
)
df9 = pd.DataFrame(
    dict(name=["Bob", "Jake", "Lisa", "Sue"], rank=[3, 1, 4, 2])
)

print(df8.merge(df9, on="name", suffixes=["_L", "_R"]))

   name  rank_L  rank_R
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2


In [None]:
### Example: US states data

In [27]:
# Load the three state tables; the CSV files are expected in the working
# directory.
pop = pd.read_csv("state-population.csv")
areas = pd.read_csv("state-areas.csv")
abbrevs = pd.read_csv("state-abbrevs.csv")

# Preview the first rows of each table, in load order.
for table in (pop, areas, abbrevs):
    print(table.head())

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


In [28]:
# Attach the full state name to every population row by matching the
# "state/region" code against "abbreviation". An outer join keeps rows
# that have no counterpart on the other side. The "abbreviation" column is
# dropped afterwards, since "state/region" already carries the code.
merged = pop.merge(
    abbrevs, how="outer", left_on="state/region", right_on="abbreviation"
).drop(columns="abbreviation")
print(merged.head())

  state/region     ages  year  population   state
0           AK    total  1990    553290.0  Alaska
1           AK  under18  1990    177502.0  Alaska
2           AK    total  1992    588736.0  Alaska
3           AK  under18  1991    182180.0  Alaska
4           AK  under18  1992    184878.0  Alaska


In [29]:
# Which columns contain at least one missing value?
merged.isna().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [30]:
# First rows whose population is missing.
merged.loc[merged["population"].isna()].head()

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,
1873,PR,total,1990,,
1874,PR,total,1991,,
1875,PR,under18,1991,,
1876,PR,total,1993,,


In [31]:
# Region codes that did not pick up a state name in the merge.
merged.loc[merged["state"].isna(), "state/region"].unique()

array(['PR', 'USA'], dtype=object)

In [32]:
# Fill in the state name for the two region codes that had no match in the
# abbreviations table, then re-check for missing values.
for code, full_name in {"PR": "Puerto Rico", "USA": "United States"}.items():
    merged.loc[merged["state/region"] == code, "state"] = full_name
merged.isna().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [33]:
# Add each state's area; a left join keeps every row of `merged` even when
# `areas` has no entry for that state.
final = merged.merge(areas, how="left", on="state")
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [34]:
# Which columns of the combined table contain missing values?
final.isna().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [35]:
# States for which no area was found.
final.loc[final["area (sq. mi)"].isna(), "state"].unique()

array(['United States'], dtype=object)

In [36]:
# Drop every row that still has a missing value (missing population or
# missing area, per the null check on `final`). The result is reassigned
# rather than using inplace=True, so the statement is an ordinary
# expression and the frame is not mutated behind the reader's back.
final = final.dropna()
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [37]:
# Population density (people per square mile) of each state in 2010, using
# the all-ages ("total") rows. Each step returns a new object instead of
# mutating with inplace=True, so the frame produced by `query` is never
# modified in place and the cell can be re-run safely.
data2010 = final.query("year == 2010 & ages == 'total'").set_index("state")
density = (data2010["population"] / data2010["area (sq. mi)"]).sort_values(
    ascending=False
)

# Most densely populated first.
density.head()

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [38]:
# `density` is sorted in descending order, so the tail holds the lowest values.
density.tail()

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64