# How to create a Python pandas DataFrame

In [2]:
import pandas as pd

In [4]:
# Calling the constructor without data yields a frame with no rows and no columns.
empty_df = pd.DataFrame(data=None)
print(empty_df)

Empty DataFrame
Columns: []
Index: []


In [None]:
# Creating dataframe from list

In [5]:
# Plain Python list of three one-character strings to turn into a frame.
lst = list('abc')
print(lst)

['a', 'b', 'c']


In [7]:
# A flat list becomes a single column (labelled 0) with one row per element.
df1 = pd.DataFrame(data=lst)
print(df1)

   0
0  a
1  b
2  c


In [8]:
df1 # the rich notebook display looks different than the print(df1) output

Unnamed: 0,0
0,a
1,b
2,c


In [9]:
# Nested list: each inner list will become one row of the frame.
ls_of_ls = [
    [1, 2, 3],
    [2, 3, 4],
    [4, 5, 6],
]
print(ls_of_ls)

[[1, 2, 3], [2, 3, 4], [4, 5, 6]]


In [11]:
# Each inner list becomes a row; the columns get default integer labels.
df1 = pd.DataFrame(data=ls_of_ls)
df1

Unnamed: 0,0,1,2
0,1,2,3
1,2,3,4
2,4,5,6


In [None]:
# Creating dataframe from dictionary

In [15]:
# Each dictionary key becomes a column name and its list supplies that column's values.
dict1 = {'ID': [11,22,33,44],'SN': [1,2,3,4]}
df = pd.DataFrame(dict1)
# Only the last expression of a cell is displayed, so the frame is the single trailing expression.
df

Unnamed: 0,ID,SN
0,11,1
1,22,2
2,33,3
3,44,4


In [18]:
# Creating a dataframe from a list of dictionaries: one row per dict, one column per key
ls_dict = [dict(a=1, b=2), dict(a=100, b=200)]
df = pd.DataFrame(data=ls_dict)
df

Unnamed: 0,a,b
0,1,2
1,100,200


In [19]:
# List of dictionaries whose keys differ: the columns are the union of all keys,
# and a row holds NaN wherever its dictionary lacks that key.
ls_dict = [dict(a=1, b=2), dict(a=100, c=200)]
df = pd.DataFrame(data=ls_dict)
df

Unnamed: 0,a,b,c
0,1,2.0,
1,100,,200.0


In [21]:
# Creating a dataframe from Series: each Series in the dictionary becomes one column
dict_str = dict(
    ID=pd.Series([1, 2, 3]),
    SN=pd.Series([101, 20, 30]),
)
df = pd.DataFrame(data=dict_str)
df

Unnamed: 0,ID,SN
0,1,101
1,2,20
2,3,30


# Merge

In [1]:
# Now we'll learn how to merge data sets by linking rows by keys.

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [6]:
# Left-hand frame: 'key' holds repeated labels, 'data_set_1' holds 0..5
dframe1 = DataFrame({
    'key': list('XZYZXX'),
    'data_set_1': np.arange(6),
})

# Display it
dframe1

Unnamed: 0,data_set_1,key
0,0,X
1,1,Z
2,2,Y
3,3,Z
4,4,X
5,5,X


In [13]:
# Right-hand frame: every key appears exactly once
dframe2 = DataFrame({
    'key': list('QYZ'),
    'data_set_2': [1, 2, 3],
})

# Display it
dframe2

Unnamed: 0,data_set_2,key
0,1,Q
1,2,Y
2,3,Z


In [14]:
# Merge the two frames; this is a "many-to-one" situation
# (keys repeat in dframe1 but are unique in dframe2).

# With no `on` argument, merge joins on the columns the two frames have in common
pd.merge(left=dframe1, right=dframe2)

# Note: the result has no 'X' rows, because 'X' never appears in dframe2

Unnamed: 0,data_set_1,key,data_set_2
0,1,Z,3
1,3,Z,3
2,2,Y,2


In [16]:
# The column to merge on can also be named explicitly
pd.merge(left=dframe1, right=dframe2, on='key')

Unnamed: 0,data_set_1,key,data_set_2
0,1,Z,3
1,3,Z,3
2,2,Y,2


In [17]:
# `how` picks whose keys are kept; 'left' keeps every key of the left frame (dframe1)
pd.merge(left=dframe1, right=dframe2, how='left', on='key')

Unnamed: 0,data_set_1,key,data_set_2
0,0,X,
1,1,Z,3.0
2,2,Y,2.0
3,3,Z,3.0
4,4,X,
5,5,X,


In [18]:
# 'right' keeps every key of the right frame (dframe2) instead
pd.merge(left=dframe1, right=dframe2, how='right', on='key')

Unnamed: 0,data_set_1,key,data_set_2
0,1.0,Z,3
1,3.0,Z,3
2,2.0,Y,2
3,,Q,1


In [19]:
# The "outer" method keeps the union of the keys from both frames
pd.merge(left=dframe1, right=dframe2, how='outer', on='key')

Unnamed: 0,data_set_1,key,data_set_2
0,0.0,X,
1,4.0,X,
2,5.0,X,
3,1.0,Z,3.0
4,3.0,Z,3.0
5,2.0,Y,2.0
6,,Q,1.0


In [30]:
# Many-to-many merge

# Note that the key values repeat in BOTH frames here
dframe3 = DataFrame({
    'key': list('XXXYZZ'),
    'data_set_3': range(6),
})
dframe4 = DataFrame({
    'key': list('YYXXZ'),
    'data_set_4': range(5),
})

# Show the merge
pd.merge(left=dframe3, right=dframe4)


Unnamed: 0,data_set_3,key,data_set_4
0,0,X,2
1,0,X,3
2,1,X,2
3,1,X,3
4,2,X,2
5,2,X,3
6,3,Y,0
7,3,Y,1
8,4,Z,4
9,5,Z,4


So what happened? A many-to-many merge produces, for each key, the Cartesian product of the matching rows. Because there were 3 'X's in dframe3 and 2 'X's in dframe4, there ended up being a total of 6 'X' rows in the result (3*2=6)! Note how each of dframe3's 'X' values (0, 1, 2) is paired in turn with both of dframe4's 'X' values (2 and 3).

In [33]:
# Merging on more than one key at a time

# Frame on the left
df_left = DataFrame({
    'key1': ['SF', 'SF', 'LA'],
    'key2': ['one', 'two', 'one'],
    'left_data': [10, 20, 30],
})

# Frame on the right
df_right = DataFrame({
    'key1': ['SF', 'SF', 'LA', 'LA'],
    'key2': ['one', 'one', 'one', 'two'],
    'right_data': [40, 50, 60, 70],
})

# Outer merge on the (key1, key2) pair
pd.merge(left=df_left, right=df_right, how='outer', on=['key1', 'key2'])

Unnamed: 0,key1,key2,left_data,right_data
0,SF,one,10.0,40.0
1,SF,one,10.0,50.0
2,SF,two,20.0,
3,LA,one,30.0,60.0
4,LA,two,,70.0


In [32]:
# Now using the above you can check multiple data sets for multiple key combos, for instance what did the left data set have for LA,one?
# Answer = 30 (the right data set had 60)

In [35]:
# Merging on 'key1' alone leaves 'key2' present in both frames as a non-key column;
# pandas tells the two copies apart by appending suffixes (key2_x and key2_y)

pd.merge(left=df_left, right=df_right, on='key1')

Unnamed: 0,key1,key2_x,left_data,key2_y,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


In [36]:
# The suffixes themselves can be chosen: first entry for the left frame, second for the right
pd.merge(left=df_left, right=df_right, on='key1', suffixes=('_lefty', '_righty'))

Unnamed: 0,key1,key2_lefty,left_data,key2_righty,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


In [37]:
# For more info on merge parameters check out the DataFrame.merge API reference
# (updated from the legacy 'pandas-docs/dev/generated/' location).
url = 'https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html'

# Next we'll learn how to merge on Index!

# Merge on Index

In [12]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
#Now we'll learn how to merge on an index

In [4]:
# Two frames to work with: the left one carries its join key as a column,
# the right one carries it as the index

df_left = DataFrame({
    'key': list('XYZXY'),
    'data': range(5),
})
df_right = DataFrame({'group_data': [10, 20]}, index=list('XY'))

In [5]:
# Show the left DataFrame
df_left

Unnamed: 0,data,key
0,0,X
1,1,Y
2,2,Z
3,3,X
4,4,Y


In [7]:
# Show the right DataFrame (its row labels are 'X' and 'Y')
df_right

Unnamed: 0,group_data
X,10
Y,20


In [8]:
# Merge using the 'key' column of the left frame and the index of the right frame
pd.merge(left=df_left, right=df_right, left_on='key', right_index=True)

Unnamed: 0,data,key,group_data
0,0,X,10
3,3,X,10
1,1,Y,20
4,4,Y,20


In [10]:
# how='outer' keeps the union of the keys, so the unmatched 'Z' row survives with NaN
pd.merge(left=df_left, right=df_right, how='outer', left_on='key', right_index=True)

Unnamed: 0,data,key,group_data
0,0,X,10.0
3,3,X,10.0
1,1,Y,20.0
4,4,Y,20.0
2,2,Z,


In [13]:
# Something a little more complicated: remember the hierarchical index?
# The left frame keeps both key levels as ordinary columns ...
df_left_hr = DataFrame({
    'key1': ['SF', 'SF', 'SF', 'LA', 'LA'],
    'key2': [10, 20, 30, 20, 30],
    'data_set': np.arange(5, dtype=float),
})
# ... while the right frame carries them as a two-level row index
df_right_hr = DataFrame(
    np.arange(10).reshape(5, 2),
    index=[['LA', 'LA', 'SF', 'SF', 'SF'],
           [20, 10, 10, 10, 20]],
    columns=['col_1', 'col_2'],
)


In [14]:
# Show the left frame, whose key levels are ordinary columns
df_left_hr

Unnamed: 0,data_set,key1,key2
0,0,SF,10
1,1,SF,20
2,2,SF,30
3,3,LA,20
4,4,LA,30


In [15]:
# Show the right frame; its rows are labelled by a two-level (hierarchical) index
df_right_hr

Unnamed: 0,Unnamed: 1,col_1,col_2
LA,20,0,1
LA,10,2,3
SF,10,4,5
SF,10,6,7
SF,20,8,9


In [16]:
# Merge the left frame by its two key columns against the right frame's two-level index
pd.merge(left=df_left_hr, right=df_right_hr, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,data_set,key1,key2,col_1,col_2
0,0,SF,10,4,5
0,0,SF,10,6,7
1,1,SF,20,8,9
3,3,LA,20,0,1


In [17]:
# We can also keep the union of the keys by choosing the 'outer' method
pd.merge(left=df_left_hr, right=df_right_hr, how='outer',
         left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,data_set,key1,key2,col_1,col_2
0,0.0,SF,10,4.0,5.0
0,0.0,SF,10,6.0,7.0
1,1.0,SF,20,8.0,9.0
2,2.0,SF,30,,
3,3.0,LA,20,0.0,1.0
4,4.0,LA,30,,
4,,LA,10,2.0,3.0


In [23]:
# We can also use .join()

# Shown on our first two DataFrames. join() matches on the index by default, so
# df_left's integer row labels find no partner among df_right's 'X'/'Y' labels and
# group_data comes back all NaN; pass on='key' to match on the key column instead.
df_left.join(df_right, how='left')

Unnamed: 0,data,key,group_data
0,0,X,
1,1,Y,
2,2,Z,
3,3,X,
4,4,Y,


# Concatenate

In [2]:
# Now we'll learn about concatenating along an axis
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# First in just Numpy

In [4]:
# Build a 3x3 matrix holding 0..8
arr1 = np.arange(9).reshape(3, 3)

In [5]:
# Show the matrix
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [6]:
# Concatenate along axis 1: the two copies sit side by side (3 rows x 6 columns)
np.concatenate((arr1, arr1), axis=1)

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

In [7]:
# The other axis option: axis 0 stacks the copies on top of each other (6 rows x 3 columns)
np.concatenate((arr1, arr1), axis=0)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [8]:
# Now let's see how this works in pandas

In [9]:
# Two Series whose index labels do not overlap
ser1 = Series([0, 1, 2], index=list('TUV'))

ser2 = Series([3, 4], index=list('XY'))

# concat works along axis=0 by default, appending one Series after the other
pd.concat(objs=[ser1, ser2])

T    0
U    1
V    2
X    3
Y    4
dtype: int64

In [10]:
# Concatenating along axis=1 instead yields a DataFrame with one column per Series
pd.concat(objs=[ser1, ser2], axis=1)

Unnamed: 0,0,1
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [17]:
# We can specify which index labels to keep in the result.
# The `join_axes` argument once used for this was removed from pd.concat in pandas 1.0;
# reindexing the concatenated frame selects the same rows.
pd.concat([ser1,ser2],axis=1).reindex(['U','V','Y'])

Unnamed: 0,0,1
U,1.0,
V,2.0,
Y,,4.0


In [11]:
# Say we want to mark which input each row of the concatenation came from

# `keys` does this by adding an outer level to the index (a hierarchical index)
pd.concat(objs=[ser1, ser2], keys=['cat1', 'cat2'])

cat1  T    0
      U    1
      V    2
cat2  X    3
      Y    4
dtype: int64

In [12]:
# With axis=1 the same keys become the column headers instead
pd.concat(objs=[ser1, ser2], axis=1, keys=['cat1', 'cat2'])

Unnamed: 0,cat1,cat2
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [14]:
# Lastly, everything works similarly in DataFrames

# Fix the seed so that re-running the notebook draws the same random values.
np.random.seed(0)
dframe1 = DataFrame(np.random.randn(4,3), columns=['X', 'Y', 'Z'])
dframe2 = DataFrame(np.random.randn(3, 3), columns=['Y', 'Q', 'X'])

In [16]:
# Concatenate the two DataFrames; columns missing from one frame are filled with NaN
pd.concat(objs=[dframe1, dframe2])

Unnamed: 0,Q,X,Y,Z
0,,1.09604,-1.36698,0.546707
1,,-1.406425,0.484748,-1.156143
2,,1.155464,1.166407,-0.245477
3,,-0.15333,2.185743,0.307704
0,0.789881,1.616933,-0.96183,
1,0.201265,0.29321,-0.277847,
2,-0.121395,0.959849,-1.360611,


In [17]:
# If we don't care about the original index labels and just want one complete DataFrame,
# ignore_index renumbers the rows from 0
pd.concat(objs=[dframe1, dframe2], ignore_index=True)

Unnamed: 0,Q,X,Y,Z
0,,1.09604,-1.36698,0.546707
1,,-1.406425,0.484748,-1.156143
2,,1.155464,1.166407,-0.245477
3,,-0.15333,2.185743,0.307704
4,0.789881,1.616933,-0.96183,
5,0.201265,0.29321,-0.277847,
6,-0.121395,0.959849,-1.360611,


In [18]:
# For more info see the pandas.concat API reference
# (updated from the legacy 'pandas-docs/stable/generated/' location).
url='https://pandas.pydata.org/docs/reference/api/pandas.concat.html'

# Combining DataFrames

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [6]:
# Let's make some Series to work with

# First Series: every other value is missing
ser1 = Series([2,np.nan,4,np.nan,6,np.nan],
           index=['Q','R','S','T','U','V'])

# Second Series: 0.0 .. len(ser1)-1 as floats, on the same labels as ser1
ser2 = Series(np.arange(len(ser1), dtype=np.float64),
           index=['Q','R','S','T','U','V'])

# Blank out the last element by POSITION. The index holds string labels, so .iloc
# makes the positional access explicit; a bare ser2[-1] relies on the deprecated
# integer-position fallback and would otherwise be read as the label -1.
ser2.iloc[-1] = np.nan

In [7]:
# Show the first Series
ser1

Q     2
R   NaN
S     4
T   NaN
U     6
V   NaN
dtype: float64

In [8]:
# Show the second Series
ser2

Q     0
R     1
S     2
T     3
U     4
V   NaN
dtype: float64

In [14]:
# Build a Series that takes the value from ser2 wherever ser1 is NaN,
# and keeps the value from ser1 everywhere else
Series(np.where(ser1.isnull(), ser2, ser1), index=ser1.index)

Q     2
R     1
S     4
T     3
U     6
V   NaN
dtype: float64

In [11]:
#Take a moment to really understand how the above worked

In [21]:
# combine_first does the same thing in a single pandas call
ser1.combine_first(other=ser2)

# The calling Series' values win; the argument only fills in where the caller is NaN

Q     2
R     1
S     4
T     3
U     6
V   NaN
dtype: float64

In [22]:
#Now let's see how this works on a DataFrame!

In [34]:
# Two frames with complementary gaps; they differ in both row count and columns
dframe_odds = DataFrame({
    'X': [1., np.nan, 3., np.nan],
    'Y': [np.nan, 5., np.nan, 7.],
    'Z': [np.nan, 9., np.nan, 11.],
})
dframe_evens = DataFrame({
    'X': [2., 4., np.nan, 6., 8.],
    'Y': [np.nan, 10., 12., 14., 16.],
})


In [35]:
# Show the odds frame
dframe_odds

Unnamed: 0,X,Y,Z
0,1.0,,
1,,5.0,9.0
2,3.0,,
3,,7.0,11.0


In [36]:
# Show the evens frame
dframe_evens

Unnamed: 0,X,Y
0,2.0,
1,4.0,10.0
2,,12.0
3,6.0,14.0
4,8.0,16.0


In [38]:
# Combine, preferring the odds values; the evens values only fill in where odds is NaN or has no row
dframe_odds.combine_first(other=dframe_evens)

Unnamed: 0,X,Y,Z
0,1,,
1,4,5.0,9.0
2,3,12.0,
3,6,7.0,11.0
4,8,16.0,
