# PANDAS - in depth (part1)


Data manipulation comprises the following three stages:

   * Data preparation : we looked at various functions such as merge(), concat, combine, pivot etc for data preparation.
   * Data transformation
   * Data aggregation
   
In this lecture we will be looking at how we can perform the above operations using Pandas library.   

In [1]:
# Setting up working environment

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Use the fully qualified option name: in newer pandas the bare pattern
# 'max_columns' also matches 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', 50)
%matplotlib inline

In [2]:
# First table: one row per item together with its price.
frame1 = pd.DataFrame(
    data=dict(
        id=['ball', 'pencil', 'pen', 'mug', 'ashtray'],
        price=[12.33, 11.44, 33.21, 13.23, 33.62],
    )
)

In [3]:
# Second table: item ids (with repeats) together with a color.
frame2 = pd.DataFrame(
    data=dict(
        id=['pencil', 'pencil', 'ball', 'pen'],
        color=['white', 'red', 'red', 'black'],
    )
)

In [4]:
frame1

Unnamed: 0,id,price
0,ball,12.33
1,pencil,11.44
2,pen,33.21
3,mug,13.23
4,ashtray,33.62


In [5]:
frame2

Unnamed: 0,id,color
0,pencil,white
1,pencil,red
2,ball,red
3,pen,black


In [6]:
# Join the two dataframes. No key is given, so pandas joins on the column
# name the two frames have in common (here only 'id').
merged_frame = pd.merge(left=frame1, right=frame2)

In [7]:
merged_frame

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [8]:
# Example 2: more than one column name is shared between the dataframes
frame1 = pd.DataFrame(
    data=dict(
        id=['ball', 'pencil', 'pen', 'mug', 'ashtray'],
        color=['white', 'red', 'red', 'black', 'green'],
        brand=['OMG', 'ABC', 'ABC', 'POD', 'POD'],
    )
)

In [9]:
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD


In [10]:
# Second frame shares both the 'id' and the 'brand' column names with frame1.
frame2 = pd.DataFrame(
    data=dict(
        id=['pencil', 'pencil', 'ball', 'pen'],
        brand=['OMG', 'POD', 'ABC', 'POD'],
    )
)

In [11]:
frame2

Unnamed: 0,id,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [12]:
# No key given: every column name shared by both frames ('id' and 'brand')
# is used as the join key.
result = pd.merge(left=frame1, right=frame2)

In [13]:
result

Unnamed: 0,id,color,brand


In [14]:
# With no 'on' given, merge() uses every shared column ('id' and 'brand') as the key.
# No row of frame1 matches a row of frame2 on both columns, so the result is an empty dataframe.

In [15]:
result.columns

Index(['id', 'color', 'brand'], dtype='object')

### Use ‘on’ option to explicitly define the criterion of merging that pandas must follow

In [16]:
# Join on 'id' only; the other shared column, 'brand', is kept from both
# frames and shows up as brand_x / brand_y.
pd.merge(left=frame1, right=frame2, on='id')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


In [17]:
# Join on 'brand' only; the other shared column, 'id', is kept from both
# frames and shows up as id_x / id_y.
pd.merge(left=frame1, right=frame2, on='brand')

Unnamed: 0,id_x,color,brand,id_y
0,ball,white,OMG,pencil
1,pencil,red,ABC,ball
2,pen,red,ABC,ball
3,mug,black,POD,pencil
4,mug,black,POD,pen
5,ashtray,green,POD,pencil
6,ashtray,green,POD,pen


### What if key columns in two DataFrames do not have the same name?

Use the left_on and right_on options to specify the key column of the first and of the second DataFrame, respectively.

In [18]:
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD


In [19]:
# Rebuild frame2 with its key column named 'sid' instead of 'id'
frame2 = pd.DataFrame(
    data=dict(
        sid=['pencil', 'pencil', 'ball', 'pen'],
        brand=['OMG', 'POD', 'ABC', 'POD'],
    )
)

In [20]:
frame2

Unnamed: 0,sid,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [21]:
# Key columns have different names: 'id' in frame1, 'sid' in frame2.
pd.merge(
    left=frame1,
    right=frame2,
    left_on='id',
    right_on='sid',
)

Unnamed: 0,id,color,brand_x,sid,brand_y
0,ball,white,OMG,ball,ABC
1,pencil,red,ABC,pencil,OMG
2,pencil,red,ABC,pencil,POD
3,pen,red,ABC,pen,POD


By default, the merge() function performs an inner join: only keys present in both DataFrames appear in the result (the intersection of the keys).


## Using 'how' option to specify type of join operation

In [22]:
# Give frame2 an 'id' key column again so that both dataframes share it
frame2 = pd.DataFrame(
    data=dict(
        id=['pencil', 'pencil', 'ball', 'pen'],
        brand=['OMG', 'POD', 'ABC', 'POD'],
    )
)

In [23]:
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD


In [24]:
frame2

Unnamed: 0,id,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [25]:
# Outer join of frame1 and frame2 on 'id': rows from both frames are kept
# even when the other frame has no matching 'id'.
pd.merge(left=frame1, right=frame2, how='outer', on='id')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


In [26]:
# Left join: every row of frame1, plus the rows of frame2 whose 'id' matches.
pd.merge(left=frame1, right=frame2, how='left', on='id')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


In [27]:
# Right join: every row of frame2, plus the rows of frame1 whose 'id' matches.
pd.merge(left=frame1, right=frame2, how='right', on='id')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


### To merge on multiple keys, simply pass a list to the on option

In [28]:
frame1   # lets just print the contents of frame1 again

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD


In [29]:
frame2   # lets just print the contents of frame2 again

Unnamed: 0,id,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [30]:
# Outer join using two key columns at once: 'id' and 'brand'
pd.merge(
    left=frame1,
    right=frame2,
    how='outer',
    on=['id', 'brand'],
)

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD
5,pencil,,OMG
6,pencil,,POD
7,ball,,ABC
8,pen,,POD


In [31]:
# TODO

# Look into the tutorials provided and copy paste code segment and play around

# Careful with the copy paste... especially with the single quote '' characters

# Concatenating


In [32]:
# Two 3x3 integer arrays to feed to NumPy's concatenate function.
array1 = np.array(
    [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
    ]
)
array2 = array1 + 6  # same shape, every element shifted by 6


In [33]:
array1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [34]:
array2

array([[ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [35]:
# axis=1 joins the arrays horizontally (left to right), giving a 3x6 array
array3 = np.concatenate((array1, array2), axis=1)

In [36]:
array3

array([[ 0,  1,  2,  6,  7,  8],
       [ 3,  4,  5,  9, 10, 11],
       [ 6,  7,  8, 12, 13, 14]])

In [37]:
# axis=0 joins the arrays vertically (top to bottom), giving a 6x3 array
np.concatenate((array1, array2), axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [38]:
# The Pandas concat() function

In [39]:
# Two random series whose index labels do not overlap (1-4 and 5-8)
ser1 = pd.Series(data=np.random.rand(4), index=[1, 2, 3, 4])
ser2 = pd.Series(data=np.random.rand(4), index=[5, 6, 7, 8])

In [40]:
ser1

1    0.276526
2    0.642716
3    0.936159
4    0.883937
dtype: float64

In [41]:
ser2

5    0.966579
6    0.144375
7    0.112783
8    0.495908
dtype: float64

In [42]:
# Concatenate the two series with the default settings (axis=0):
# ser2 is appended below ser1.
ser3 = pd.concat(objs=[ser1, ser2])

In [43]:
ser3

1    0.276526
2    0.642716
3    0.936159
4    0.883937
5    0.966579
6    0.144375
7    0.112783
8    0.495908
dtype: float64

   * By default, the concat() function works on axis = 0, returning a series object.
   * If you set the axis = 1, then the result will be a DataFrame.

In [44]:
# axis=1 places the series side by side (left to right) as columns
ser3 = pd.concat(objs=[ser1, ser2], axis=1)

In [45]:
ser3

Unnamed: 0,0,1
1,0.276526,
2,0.642716,
3,0.936159,
4,0.883937,
5,,0.966579
6,,0.144375
7,,0.112783
8,,0.495908


This has performed an outer join. It can be changed by setting the join option to 'inner':

In [46]:
# Inner join keeps only index labels present in both series
pd.concat(objs=[ser1, ser2], axis=1, join='inner')

Unnamed: 0,0,1


In [47]:
# what will be the output? 

#pd.concat([ser1,ser3],axis=1,join='inner')   # uncomment and run this cell
                                               # analyse what is happening here !!

In [48]:
# The keys option adds an outer index level on the axis of concatenation,
# which produces a hierarchical index: key 1 labels ser1, key 2 labels ser2.
pd.concat(objs=[ser1, ser2], keys=[1, 2])

1  1    0.276526
   2    0.642716
   3    0.936159
   4    0.883937
2  5    0.966579
   6    0.144375
   7    0.112783
   8    0.495908
dtype: float64

# Concatenating dataframes

In [49]:
# Two random 3x3 frames with the same columns and non-overlapping row labels
frame1 = pd.DataFrame(
    data=np.random.rand(9).reshape(3, 3),
    columns=['A', 'B', 'C'],
    index=[1, 2, 3],
)
frame2 = pd.DataFrame(
    data=np.random.rand(9).reshape(3, 3),
    columns=['A', 'B', 'C'],
    index=[4, 5, 6],
)

In [50]:
frame1

Unnamed: 0,A,B,C
1,0.476934,0.775173,0.988255
2,0.613056,0.865429,0.935006
3,0.050764,0.459149,0.367554


In [51]:
frame2

Unnamed: 0,A,B,C
4,0.126117,0.564177,0.072605
5,0.744772,0.899783,0.878953
6,0.037482,0.886418,0.321815


In [52]:
# Default axis is the row axis: frame2's rows are appended below frame1's
pd.concat(objs=[frame1, frame2])

Unnamed: 0,A,B,C
1,0.476934,0.775173,0.988255
2,0.613056,0.865429,0.935006
3,0.050764,0.459149,0.367554
4,0.126117,0.564177,0.072605
5,0.744772,0.899783,0.878953
6,0.037482,0.886418,0.321815


In [53]:
# Concatenate along the column axis, i.e. horizontally from left to right
pd.concat(objs=[frame1, frame2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.476934,0.775173,0.988255,,,
2,0.613056,0.865429,0.935006,,,
3,0.050764,0.459149,0.367554,,,
4,,,,0.126117,0.564177,0.072605
5,,,,0.744772,0.899783,0.878953
6,,,,0.037482,0.886418,0.321815


# Combine()

combine_first() combines two datasets whose indexes overlap entirely or only partially: values from the calling
object take precedence, and the other object supplies the values for labels that the calling object lacks.

In [54]:
# Two random series whose index labels partially overlap (2, 4 and 5 are shared)
ser1 = pd.Series(data=np.random.rand(5), index=[1, 2, 3, 4, 5])
ser2 = pd.Series(data=np.random.rand(4), index=[2, 4, 5, 6])

In [55]:
ser1

1    0.787886
2    0.536139
3    0.940740
4    0.526812
5    0.485319
dtype: float64

In [56]:
ser2

2    0.164325
4    0.022346
5    0.948824
6    0.107943
dtype: float64

In [57]:
# ser1's values win wherever both series have a label;
# ser2 only contributes the labels ser1 does not have.
ser1.combine_first(other=ser2)

1    0.787886
2    0.536139
3    0.940740
4    0.526812
5    0.485319
6    0.107943
dtype: float64

In [58]:
# For a partial overlap, combine only the portions of the series you want:
# here the first three entries (by position) of each series.
ser1.iloc[:3].combine_first(ser2.iloc[:3])

1    0.787886
2    0.536139
3    0.940740
4    0.022346
5    0.948824
dtype: float64

# Pivoting with Hierarchical Indexing

In the context of pivoting there are two basic operations:
  * Stacking: rotates or pivots the data structure converting columns to rows
  * Unstacking: converts rows into columns

In [59]:
# 3x3 frame of the integers 0-8: colors as row labels, items as columns
frame1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    columns=['ball', 'pen', 'pencil'],
    index=['white', 'black', 'red'],
)

In [60]:
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [61]:
# stack() pivots the columns into rows; the result is a series with a
# two-level index (row label, former column name).
ser5 = frame1.stack(level=-1)

In [62]:
ser5

white  ball      0
       pen       1
       pencil    2
black  ball      3
       pen       4
       pencil    5
red    ball      6
       pen       7
       pencil    8
dtype: int64

In [63]:
# unstack() reassembles a DataFrame from the hierarchically indexed series.
# Unstacking the innermost level restores the original structure and shape.
ser5.unstack(level=-1)
                

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [64]:
# unstack can act on a different index level, given by its number or name.
# Level 0 of ser5's index (white, black, red) becomes the columns of the
# resulting dataframe.
ser5.unstack(level=0)
    

Unnamed: 0,white,black,red
ball,0,3,6
pen,1,4,7
pencil,2,5,8


In [65]:
# Try this
#ser5.unstack(1)         #uncomment and run this

# Further check
#ser5.unstack(2)

## Removing columns and rows

In [66]:
# 3x3 frame of the integers 0-8: colors as row labels, items as columns
frame1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    columns=['ball', 'pen', 'pencil'],
    index=['white', 'black', 'red'],
)

In [67]:
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [68]:
# To remove a column, use the del statement on the DataFrame with the column name specified.
# This changes frame1 itself: the 'ball' column is gone afterwards.

del frame1['ball']

In [69]:
frame1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [70]:
# Recreate the full 3x3 frame (the 'ball' column was deleted above)
frame1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    columns=['ball', 'pen', 'pencil'],
    index=['white', 'black', 'red'],
)

In [71]:
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [72]:
# To remove an unwanted row, pass its index label to drop().
# drop() returns a new DataFrame without that row; frame1 itself is not modified.
frame1.drop(labels='white', axis=0)

Unnamed: 0,ball,pen,pencil
black,3,4,5
red,6,7,8
