# 04. Manipulating the data


In [1]:
import pandas as pd
import numpy as np

## 02. Concatenating

In [2]:
df1 = pd.DataFrame(np.full((2,3),'x', dtype=object), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,x,x,x
1,x,x,x


In [3]:
df2 = pd.DataFrame(np.full((3,3),'o', dtype=object), columns=['A', 'B', 'C'])
df2 

Unnamed: 0,A,B,C
0,o,o,o
1,o,o,o
2,o,o,o


In [4]:
df3 = pd.DataFrame(np.full((2,2),'v', dtype=object), columns=['D', 'E'])
df3

Unnamed: 0,D,E
0,v,v
1,v,v


### Concatenation along axis 0

In [5]:
pd.concat([df1,df2])

Unnamed: 0,A,B,C
0,x,x,x
1,x,x,x
0,o,o,o
1,o,o,o
2,o,o,o


In [6]:
# reset index
pd.concat([df1,df2]).reset_index(drop=True)

Unnamed: 0,A,B,C
0,x,x,x
1,x,x,x
2,o,o,o
3,o,o,o
4,o,o,o


In [7]:
pd.concat([df1,df3])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,A,B,C,D,E
0,x,x,x,,
1,x,x,x,,
0,,,,v,v
1,,,,v,v


### The keys parameter

In [8]:
df4 = pd.concat([df1,df2], keys=['df1', 'df2'])
df4

Unnamed: 0,Unnamed: 1,A,B,C
df1,0,x,x,x
df1,1,x,x,x
df2,0,o,o,o
df2,1,o,o,o
df2,2,o,o,o


In [9]:
df4.loc['df2',:]

Unnamed: 0,A,B,C
0,o,o,o
1,o,o,o
2,o,o,o


### Concatenating along axis 1

In [10]:
pd.concat([df1, df3], axis=1)

Unnamed: 0,A,B,C,D,E
0,x,x,x,v,v
1,x,x,x,v,v


In [11]:
pd.concat([df1,df2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,x,x,x,o,o,o
1,x,x,x,o,o,o
2,,,,o,o,o


# The join parameter

In [13]:
pd.concat([df1,df2], axis=1, join='inner')

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,x,x,x,o,o,o
1,x,x,x,o,o,o


In [14]:
pd.concat([df1,df3], join='inner')

0
1
0
1


##  03. Merging and joining

### Merging on a single column

In [16]:
users = pd.DataFrame( {'userID': [5672, 3452, 2878, 3234],
                'First Name': ['Christopher', 'Johnnie', 'Debbie', 'Teri'],
                'Last Name': ['Boyd','Baldwin', 'Alvarez', 'Gill']})
users

Unnamed: 0,userID,First Name,Last Name
0,5672,Christopher,Boyd
1,3452,Johnnie,Baldwin
2,2878,Debbie,Alvarez
3,3234,Teri,Gill


In [17]:
scores = pd.DataFrame( {'userID': [2878, 5672, 3234, 5672, 2878],
                'Score': [84,56,72,77,88]})
scores


Unnamed: 0,userID,Score
0,2878,84
1,5672,56
2,3234,72
3,5672,77
4,2878,88


In [18]:
merged_df = pd.merge(users, scores) 
merged_df

Unnamed: 0,userID,First Name,Last Name,Score
0,5672,Christopher,Boyd,56
1,5672,Christopher,Boyd,77
2,2878,Debbie,Alvarez,84
3,2878,Debbie,Alvarez,88
4,3234,Teri,Gill,72


In [19]:
scores2 = pd.DataFrame( {'studentID': [2878, 5672, 3234, 5672, 2878],
                'Score': [84,56,72,77,88]})

In [21]:
pd.merge(users, scores2, left_on='userID', right_on='studentID')

Unnamed: 0,userID,First Name,Last Name,studentID,Score
0,5672,Christopher,Boyd,5672,56
1,5672,Christopher,Boyd,5672,77
2,2878,Debbie,Alvarez,2878,84
3,2878,Debbie,Alvarez,2878,88
4,3234,Teri,Gill,3234,72


In [None]:
pd.merge(users, scores2, left_on='userID', right_on='studentID')

### Merging on multiple columns

In [23]:
gold = pd.DataFrame({'Code': ['CAN', 'GER', 'USA', 'NOR'],
               'Country': ['Canada', 'Germany', 'United States', 'Norway'],
               'Total': [ 14, 10, 9, 9]})
gold

Unnamed: 0,Code,Country,Total
0,CAN,Canada,14
1,GER,Germany,10
2,USA,United States,9
3,NOR,Norway,9


In [24]:
bronze= pd.DataFrame({'Code': ['USA','GER', 'NOR', 'AUS'],
               'Country': ['United States', 'Germany', 'Norway', 'Austria'],
               'Total': [ 13, 7, 7, 6]})
bronze

Unnamed: 0,Code,Country,Total
0,USA,United States,13
1,GER,Germany,7
2,NOR,Norway,7
3,AUS,Austria,6


In [25]:
pd.merge(gold,bronze)

Unnamed: 0,Code,Country,Total


In [27]:
pd.merge(gold, bronze, on=['Code', 'Country'], suffixes=['_gold', '_bronze'])

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,GER,Germany,10,7
1,USA,United States,9,13
2,NOR,Norway,9,7


### Different types of joins

In [29]:
pd.merge(gold, bronze, on=['Code', 'Country'], suffixes=['_gold', '_bronze'], how='outer')

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,CAN,Canada,14.0,
1,GER,Germany,10.0,7.0
2,USA,United States,9.0,13.0
3,NOR,Norway,9.0,7.0
4,AUS,Austria,,6.0


In [30]:
pd.merge(gold, bronze, on=['Code', 'Country'], suffixes=['_gold', '_bronze'], how='left')

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,CAN,Canada,14,
1,GER,Germany,10,7.0
2,USA,United States,9,13.0
3,NOR,Norway,9,7.0


In [31]:
pd.merge(gold, bronze, on=['Code', 'Country'], suffixes=['_gold', '_bronze'], how='right')

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,GER,Germany,10.0,7
1,USA,United States,9.0,13
2,NOR,Norway,9.0,7
3,AUS,Austria,,6


In [34]:
df1 = pd.DataFrame({'key': [1,2,3,4], 'val1': [1,2,3,4]})
df2 = pd.DataFrame({'key': [1,2,3,5], 'val2': [1,2,3,4]})
df2

Unnamed: 0,key,val2
0,1,1
1,2,2
2,3,3
3,5,4


In [35]:
df_in = df1.merge(df2, how='inner')
df_in

Unnamed: 0,key,val1,val2
0,1,1,1
1,2,2,2
2,3,3,3


In [36]:
df_out = df1.merge(df2, how='outer')
df_out

Unnamed: 0,key,val1,val2
0,1,1.0,1.0
1,2,2.0,2.0
2,3,3.0,3.0
3,4,4.0,
4,5,,4.0
