In [2]:
import pandas as pd
import numpy as np

I think this lecture will answer my previous question about how to add a column to a dataframe. Apparently you can use concatenation to "glue" two dataframes together. This reminds me a lot of joining and relating tables in ArcGIS, because it's done using the indexes.

In [3]:
# Build two small frames that share the default 0-3 integer index
# but have completely different column names.
data_one = {
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}
data_two = {
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}
one, two = pd.DataFrame(data_one), pd.DataFrame(data_two)
# Show the first frame as the cell output.
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [4]:
# Show the second frame (columns C and D) for comparison with `one`.
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


I can go ahead and guess that the best move is going to be to combine based on the indexes, with C, D becoming new columns

I can already see that if both tables have a column with the same name (e.g. C in both), combining them could get tricky: you might have to find a way to delete duplicates, sort, etc., but I assume there are calls for that

In [5]:
# Glue the frames side by side: rows are matched on the index and
# the columns of `two` are appended after the columns of `one`.
pd.concat(
    [one, two],
    axis='columns',
)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


This ^ places the two tables side by side (axis = 1), matching up the rows by their indexes

In [6]:
# Stack the frames vertically: the rows of `two` go below the rows of `one`,
# and cells for columns a frame does not have are left as NaN.
pd.concat(
    [one, two],
    axis='index',
)

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


Doing it the other way does produce a result, but it contains a lot of holes. With axis = 0 the rows of two are stacked below the rows of one and the columns are lined up by name; one has no C or D column and two has no A or B column, so those cells have nothing to put in them and are filled with NaN. The original index labels (0-3) are also kept, so they appear twice

If for some reason you want to do it this way, you should just rename the columns:

This is assuming that you know that A is equal to C and B is equal to D, i.e. that they represent the same thing, and you can therefore just insert the information lower in the column

In [7]:
# Re-display `one` before renaming the columns of `two`.
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [8]:
# Re-display `two`; its columns are still C and D at this point.
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [9]:
# Relabel the columns of `two` in place so they match `one` (C -> A, D -> B).
# The mapping is purely positional, and no reassignment of `two` is needed.
two.columns = one.columns

In [10]:
# Confirm that `two` now carries the column labels A and B.
two

Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


Now, the column names for two have been renamed to match one

In [11]:
# `one` is unchanged by the rename.
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [12]:
# `two` keeps its new column labels across cells.
two

Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


The naming is permanent, which is nice

In [13]:
# With matching column names, stacking the rows no longer produces NaN cells.
# The result is only displayed here, not stored.
pd.concat(
    [one, two],
    axis='index',
)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


With this call though, you have to actually set it to a new variable for it to be permanent, i.e.

In [14]:
# concat returns a new frame, so bind it to a name to keep the stacked result.
myjoin = pd.concat(
    [one, two],
    axis='index',
)

In [15]:
# Display the stored result; the index labels 0-3 appear twice.
myjoin

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


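I tried to use the name "myconcat" and it didn't work for me. That said, "myconcat" is a perfectly valid Python variable name (it doesn't clash with pd.concat), so the failure was most likely a typo or some other mistake on my end rather than having "concat" in the name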

In [16]:
# Inspect the index of the stacked frame: the labels from both inputs are kept as-is.
myjoin.index

Int64Index([0, 1, 2, 3, 0, 1, 2, 3], dtype='int64')

In [17]:
# Replace the repeated 0-3 labels with one consecutive label per row (0 .. n-1).
# This assigns to the index in place.
myjoin.index = range(0, myjoin.shape[0])

In [18]:
# Display the frame again to check the new consecutive index.
myjoin

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,C0,D0
5,C1,D1
6,C2,D2
7,C3,D3


That was a wildly fast and convenient way of resetting the index so it covers the entire range, kinda like autofilling a list in Excel

The column passed as `on`: it should be present in both tables, should not contain any repeating values within either original table, and should represent the same thing in both tables.

In [19]:
# Registration records: string ids paired with the registrant's name.
reg_data = {
    'reg_id': ['1', '2', '3', '4'],
    'name': ['Andrew', 'Bob', 'Charlie', 'David'],
}
# Creating a new dataframe called reg
reg = pd.DataFrame(data=reg_data)
# Show only the name column (a Series) as the cell output.
reg['name']

0     Andrew
1        Bob
2    Charlie
3      David
Name: name, dtype: object

In [20]:
# Display the full registrations frame.
reg

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bob
2,3,Charlie
3,4,David


In [21]:
# Login records: string ids paired with the name that logged in.
log_data = {
    'log_id': ['1', '2', '3', '4'],
    'name': ['Xavier', 'Andrew', 'Yolanda', 'Bob'],
}
# Creating a new dataframe called log
log = pd.DataFrame(data=log_data)
# Show only the name column (a Series) as the cell output.
log['name']

0     Xavier
1     Andrew
2    Yolanda
3        Bob
Name: name, dtype: object

In [22]:
# Display the full logins frame.
log

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bob


In [23]:
# Inner merge on the shared 'name' column: only names found in both frames are kept.
pd.merge(
    reg,
    log,
    on='name',
    how='inner',
)

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bob,4


Only Andrew and Bob are present in both tables, so they are shown here. This is the only fully complete data in the set

------------------

In [24]:
# Same idea as reg/log above, but with integer ids and slightly different names.
registrations = pd.DataFrame(
    {
        'reg_id': [1, 2, 3, 4],
        'name': ['Andrew', 'Bobo', 'Claire', 'David'],
    }
)
logins = pd.DataFrame(
    {
        'log_id': [1, 2, 3, 4],
        'name': ['Xavier', 'Andrew', 'Yolanda', 'Bobo'],
    }
)

In [25]:
# Display the registrations frame (left table in the merges below).
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [26]:
# Display the logins frame (right table in the merges below).
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [27]:
# Inner merge: keep only the names that occur in both registrations and logins.
pd.merge(
    registrations,
    logins,
    on='name',
    how='inner',
)

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bobo,4


^ This only adds things to the new table which are present in both tables

In [28]:
# Outer merge: keep every name from either frame; ids with no match become NaN.
pd.merge(
    registrations,
    logins,
    on='name',
    how='outer',
)

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


^ This combines everything.
Basically, inner is an 'and' and outer is an 'or', i.e. it's in A & B or A | B

_______________

Left and right merging:

The first table passed in is the left, the second is the right.

In [29]:
# Left merge: keep every row of registrations (the first argument);
# names that never logged in get NaN for log_id.
pd.merge(
    registrations,
    logins,
    on='name',
    how='left',
)

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


In [30]:
# Right merge: keep every row of logins (the second argument);
# names that never registered get NaN for reg_id.
pd.merge(
    registrations,
    logins,
    on='name',
    how='right',
)

Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bobo,4


In [31]:
# Same left merge as before, but naming the left/right tables explicitly by keyword.
pd.merge(
    left=registrations,
    right=logins,
    on='name',
    how='left',
)

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


As shown by these examples, you don't have to specify left = "item"; you can simply pass in the tables positionally as (left, right), but it might help with clarity to say left = "item" etc.

------------------

Outer merge option:

Setting how = 'outer' allows you to grab all names from both tables, even if there will be NaN values in the resulting joined table

In [32]:
# Outer merge again: all names from both frames, with NaN where an id is missing.
pd.merge(
    registrations,
    logins,
    on='name',
    how='outer',
)

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


Everything is included, including NaN. In an inner/outer merge, the order of (registrations, logins) only affects the final ordering of the table, not which names end up in it

---------

Joining on an index instead of a column

In [33]:
# Display registrations before changing its index.
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [34]:
# Display logins; it keeps its default integer index throughout this section.
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


Note: In the lecture, he has registrations with name as the index and no numbered index

In [36]:
# Promote the 'name' column to the row index. This rebinds `registrations`, so the
# cell cannot be re-run as-is: 'name' is no longer a column after the first run.
registrations = registrations.set_index(keys='name')

In [37]:
# Display registrations, now indexed by name with reg_id as the only column.
registrations

Unnamed: 0_level_0,reg_id
name,Unnamed: 1_level_1
Andrew,1
Bobo,2
Claire,3
David,4


There we go, now it matches what he has

In [38]:
# Display logins again: 'name' is still an ordinary column here.
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [41]:
# Match the index of the left frame (the names) against the 'name' column of the right frame.
pd.merge(
    registrations,
    logins,
    how='inner',
    left_index=True,
    right_on='name',
)

Unnamed: 0,reg_id,log_id,name
1,1,2,Andrew
3,2,4,Bobo


The 'name' index has turned into a column, and the index from the logins table (the right table) has been retained

In [43]:
# Move the 'name' index back into an ordinary column and restore the default integer index.
registrations = registrations.reset_index(drop=False)

In [44]:
# Display registrations: 'name' is a column again, and now comes before reg_id.
registrations

Unnamed: 0,name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [45]:
# Rename the key column so the two frames no longer share a column name.
# The assignment is positional: it relies on the current order being (name, reg_id).
registrations.columns = ['reg_name', 'reg_id']

In [46]:
# Display registrations with the renamed key column.
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [47]:
# Display logins; its key column is still called 'name'.
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [48]:
# The key column has a different name in each frame, so name both explicitly.
# Both key columns are kept in the result.
pd.merge(
    registrations,
    logins,
    how='inner',
    left_on='reg_name',
    right_on='name',
)

Unnamed: 0,reg_name,reg_id,log_id,name
0,Andrew,1,2,Andrew
1,Bobo,2,4,Bobo


Because there's two columns representing the same thing, it would be a good idea to drop the unneeded column:

In [49]:
# Merge on the differently named key columns, then remove the redundant copy of the key.
results = pd.merge(registrations, logins, how = 'inner', left_on = 'reg_name', right_on = 'name')
# pop() removes the column from `results` in place and returns the removed column
# as a Series (shown as the cell output). Its return value must NOT be assigned back
# to `results`, otherwise the merged frame is replaced by that single Series.
results.pop('reg_name')

0    Andrew
1      Bobo
Name: reg_name, dtype: object

In [50]:
# Display the merged frame after the redundant key column has been removed.
results

Unnamed: 0,reg_id,log_id,name
0,1,2,Andrew
1,2,4,Bobo


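Another way to do this is results.drop('reg_name', axis = 1). Unlike pop, drop returns a new dataframe and leaves results itself unchanged, so the output has to be assigned to a variable if you want to keep it.

This is the method my instructor used. It probably doesn't use less memory (drop builds a new dataframe), but it avoids changing results in place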

In [51]:
# Display registrations before renaming its columns again.
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [52]:
# Give both frames identically named columns to see how merge handles the clash.
# Positional rename: relies on the current order being (reg_name, reg_id).
registrations.columns = ['name', 'id']

In [53]:
# Positional rename: relies on the current order being (log_id, name).
logins.columns = ['id', 'name']

In [54]:
# Display registrations with its columns now called name and id.
registrations

Unnamed: 0,name,id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [55]:
# Display logins with its columns now called id and name.
logins

Unnamed: 0,id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [57]:
# Both frames have an 'id' column that is not the merge key,
# so the result distinguishes them with suffixes.
pd.merge(
    registrations,
    logins,
    on='name',
    how='outer',
)

Unnamed: 0,name,id_x,id_y
0,Andrew,1.0,2.0
1,Bobo,2.0,4.0
2,Claire,3.0,
3,David,4.0,
4,Xavier,,1.0
5,Yolanda,,3.0


Pandas automatically tags the duplicate-named columns with a suffix: _x for the column from the left table and _y for the column from the right table.

In [59]:
# Same outer merge, but with custom suffixes for the clashing 'id' columns
# (first suffix for the left frame, second for the right frame).
pd.merge(
    registrations,
    logins,
    on='name',
    how='outer',
    suffixes=('_reg', '_log'),
)

Unnamed: 0,name,id_reg,id_log
0,Andrew,1.0,2.0
1,Bobo,2.0,4.0
2,Claire,3.0,
3,David,4.0,
4,Xavier,,1.0
5,Yolanda,,3.0


The chosen suffixes are now included.