# Operations

There are lots of pandas operations that will be really useful to you but don't fall into any distinct category. Let's go through them in this lecture:

In [52]:
import pandas as pd
import numpy as np
# Small demo frame: two integer columns and one string column.
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### Info on Unique Values

In [53]:
# Distinct values of col2, in order of first appearance.
df.loc[:, 'col2'].unique()

array([444, 555, 666])

In [54]:
# Number of distinct values in col2.
df.col2.nunique()

3

In [55]:
# How many times each distinct col2 value occurs (most frequent first).
df.loc[:, 'col2'].value_counts()

444    2
666    1
555    1
Name: col2, dtype: int64

### Selecting Data

In [56]:
# Filter rows with a boolean mask built from two columns:
# keep rows where col1 is greater than 2 AND col2 equals 444.
newdf = df.loc[df['col1'].gt(2) & df['col2'].eq(444)]

In [57]:
# Display the rows that satisfied both conditions.
newdf

Unnamed: 0,col1,col2,col3
3,4,444,xyz


### Applying Functions

In [58]:
# DataFrame reductions take an axis: "index" (axis=0, the default) collapses the rows
# and yields one value per column; "columns" (axis=1) yields one value per row.
# numeric_only=True leaves out the string column 'col3'. Without it, pandas >= 2.0
# (where numeric_only defaults to False) raises TypeError on the non-numeric column.
df.mean(axis=0, numeric_only=True)

col1      2.50
col2    527.25
dtype: float64

In [59]:
# Mean of a single column (a Series reduction returns a scalar).
df.col1.mean()

2.5

In [60]:
# Row-wise mean over the numeric columns only. numeric_only=True is required because
# 'col3' holds strings: pandas >= 2.0 no longer drops non-numeric columns silently
# and would raise TypeError here.
df.mean(axis=1, numeric_only=True)

0    222.5
1    278.5
2    334.5
3    224.0
dtype: float64

In [67]:
# All-numeric variant of the demo frame, so row-wise means use every column.
df_mean = pd.DataFrame(
    data={
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': [17, 16, 16, 15],
    }
)
df_mean

Unnamed: 0,col1,col2,col3
0,1,444,17
1,2,555,16
2,3,666,16
3,4,444,15


In [70]:
# Mean across the columns, i.e. one value per row.
df_mean.mean(axis='columns')

0    154.000000
1    191.000000
2    228.333333
3    154.333333
dtype: float64

In [69]:
# Cross-check: the mean of the row labelled 1 equals the matching row-wise mean entry.
df_mean.loc[1, :].mean()

191.0

In [9]:
def times2(x):
    """Return ``x * 2`` for any ``x`` that supports multiplication by an int."""
    doubled = x * 2
    return doubled

In [10]:
# Call times2 on every element of col1.
df.col1.apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [11]:
# Built-in functions work with apply too: length of each string in col3.
df.loc[:, 'col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [12]:
# Total of a single column.
df.loc[:, 'col1'].sum()

10

In [13]:
# Reductions such as sum() accept a skipna flag that controls whether missing
# values are excluded (True by default). Here it is switched off explicitly.
df.sum(axis='index', skipna=False)

col1              10
col2            2109
col3    abcdefghixyz
dtype: object

In [14]:
# Same column-wise sum with skipna left at its default; df has no missing values,
# so the result matches the skipna=False call.
df.sum(axis='index', skipna=True)

col1              10
col2            2109
col3    abcdefghixyz
dtype: object

In [15]:
# Row-wise sum of the numeric columns. numeric_only=True excludes the string column
# 'col3'; without it pandas >= 2.0 tries to add ints and strs within a row and
# raises TypeError instead of silently dropping the column.
df.sum(axis=1, skipna=False, numeric_only=True)

0    445
1    557
2    669
3    448
dtype: int64

In [16]:
# Series
# idxmin() / idxmax() exist on both Series and DataFrame and return the index
# label at which the minimum / maximum value occurs.
# Five standard-normal draws serve as sample data (unseeded, so values differ per run).
series_1 = pd.Series(data=np.random.randn(5))
series_1

0    2.021723
1   -1.266987
2   -0.145123
3   -0.081085
4    0.745047
dtype: float64

In [17]:
# (label of the smallest value, label of the largest value)
(series_1.idxmin(), series_1.idxmax())

(1, 0)

In [18]:
# 5x3 frame of standard-normal draws with columns A, B and C.
df_1 = pd.DataFrame(data=np.random.randn(5, 3), columns=list('ABC'))
df_1

Unnamed: 0,A,B,C
0,-0.905266,-0.187899,-1.589914
1,1.157729,0.010683,0.874847
2,0.035893,0.069673,0.361455
3,-1.888611,1.252942,1.094449
4,0.647308,0.196285,-0.318056


In [19]:
# For each column, the row label holding that column's minimum.
df_1.idxmin(axis='index')

A    3
B    0
C    0
dtype: int64

In [20]:
# For each row, the column label holding that row's maximum.
df_1.idxmax(axis='columns')

0    B
1    A
2    C
3    B
4    A
dtype: object

In [21]:
# Single-column frame with a tied minimum (two 1s) and a trailing NaN,
# indexed by the letters e, d, c, b, a.
df_3 = pd.DataFrame({"A": [2, 1, 1, 3, np.nan]}, index=list("edcba"))
df_3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [22]:
# With tied minima the label of the first occurrence is returned; NaN is skipped.
df_3.A.idxmin()

'd'

In [23]:
#Note: idxmin and idxmax are called argmin and argmax in NumPy.

In [24]:
# value_counts()
# Sample data: 50 random integers drawn from 0..6 (the upper bound 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)
data

array([2, 0, 4, 1, 3, 5, 2, 2, 4, 0, 5, 3, 1, 1, 0, 6, 6, 2, 6, 3, 4, 1,
       3, 5, 2, 2, 1, 4, 5, 0, 4, 0, 3, 4, 6, 6, 4, 2, 4, 3, 5, 1, 1, 6,
       5, 6, 3, 1, 5, 0])

In [25]:
# Wrap the random integers in a Series and tally how often each value occurs
# (most frequent first).
s = pd.Series(data=data)
s.value_counts()

1    8
4    8
2    7
3    7
5    7
6    7
0    6
dtype: int64

**Permanently Removing a Column**

In [26]:
# Remove 'col1' from df in place; nothing is returned and df itself is modified.
# Re-running this cell raises KeyError because the column is already gone.
del df['col1']

In [27]:
# Display df after the deletion: only col2 and col3 remain.
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


**Get column and index names:**

In [28]:
# Column labels, returned as a pandas Index.
df.columns

Index(['col2', 'col3'], dtype='object')

In [29]:
# Row labels; this frame was built without an explicit index, so it is a RangeIndex.
df.index

RangeIndex(start=0, stop=4, step=1)

**Sorting and Ordering a DataFrame:**

In [30]:
# Display the frame in its current row order, before sorting.
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


In [31]:
# Returns a new frame ordered by col2 (ascending). inplace=False is the default,
# spelled out here to make clear that df itself is left unchanged.
df.sort_values(by=['col2'], inplace=False)

Unnamed: 0,col2,col3
0,444,abc
3,444,xyz
1,555,def
2,666,ghi


**Find Null Values or Check for Null Values**

In [32]:
# Boolean frame marking missing cells (isnull is an alias of isna).
df.isna()

Unnamed: 0,col2,col3
0,False,False
1,False,False
2,False,False
3,False,False


In [33]:
# Drop every row (axis=0) that contains at least one NaN (how='any');
# both arguments are the defaults, written out for clarity.
df.dropna(axis=0, how='any')

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


**Filling in NaN values with something else:**

In [34]:
import numpy as np

In [35]:
# Rebuild df with one missing value in each numeric column.
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3, np.nan],
        'col2': [np.nan, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [36]:
# Return a copy of df in which every missing cell is replaced by the string 'FILL'.
df.fillna(value='FILL')

Unnamed: 0,col1,col2,col3
0,1.0,FILL,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,FILL,444.0,xyz


In [37]:
# Source records for the pivot-table example: three key columns (A, B, C)
# and one numeric value column (D).
data = {
    'A': ['foo'] * 3 + ['bar'] * 3,
    'B': ['one', 'one', 'two', 'two', 'one', 'one'],
    'C': ['x', 'y'] * 3,
    'D': [1, 3, 2, 5, 4, 1],
}

df = pd.DataFrame(data=data)

In [38]:
# Display the frame built from the `data` dict.
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [39]:
# Reshape: (A, B) pairs become the row index, the distinct values of C become columns,
# and each cell holds the mean of D for that combination (aggfunc='mean' is the default).
# Combinations that do not occur in the data show up as NaN.
df.pivot_table(
    values='D',
    index=['A', 'B'],
    columns=['C'],
    aggfunc='mean',
)

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,


### Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions can be used with no issues on Series and DataFrame, assuming the data within are numeric:

In [40]:
# 10x4 frame of standard-normal draws used for the arithmetic examples below.
df_1 = pd.DataFrame(data=np.random.randn(10, 4), columns=list("ABCD"))
df_1

Unnamed: 0,A,B,C,D
0,2.115034,-0.230244,1.681294,-1.460312
1,-0.840818,0.041727,-0.168213,0.102716
2,-1.944614,1.380868,1.617119,-0.629407
3,-0.122395,-0.283371,1.770551,-0.349495
4,-0.760554,1.85685,-0.290618,-0.965851
5,-0.023317,0.131732,-1.145243,0.647802
6,0.248575,0.343963,-1.890602,0.059719
7,-0.364705,-1.48463,-1.307817,1.09969
8,1.024633,-1.226077,-0.288475,1.032031
9,-0.943817,-0.239915,1.599701,-1.30905


In [41]:
# Apply NumPy's exp ufunc elementwise to the whole frame; the result is again
# a DataFrame with the same row and column labels.
np.exp(df_1)

Unnamed: 0,A,B,C,D
0,8.289866,0.79434,5.372504,0.232164
1,0.431357,1.04261,0.845174,1.108177
2,0.143042,3.978355,5.038551,0.532908
3,0.884798,0.753241,5.874092,0.705044
4,0.467408,6.403532,0.747802,0.380659
5,0.976953,1.140802,0.318147,1.911335
6,1.282197,1.410526,0.150981,1.061539
7,0.694402,0.226586,0.27041,3.003236
8,2.786074,0.293442,0.749405,2.806762
9,0.38914,0.786694,4.951551,0.270077


In [42]:
# Elementwise addition of the frame to itself (every value is doubled).
df_1 + df_1

Unnamed: 0,A,B,C,D
0,4.230068,-0.460488,3.362588,-2.920624
1,-1.681637,0.083454,-0.336425,0.205432
2,-3.889228,2.761737,3.234237,-1.258814
3,-0.244791,-0.566741,3.541103,-0.698991
4,-1.521108,3.713699,-0.581235,-1.931702
5,-0.046634,0.263463,-2.290485,1.295604
6,0.49715,0.687925,-3.781204,0.119439
7,-0.729409,-2.96926,-2.615634,2.199381
8,2.049267,-2.452153,-0.576951,2.064063
9,-1.887634,-0.479831,3.199402,-2.6181


In [43]:
# Elementwise reciprocal: the scalar 1 is broadcast against every cell.
1/df_1

Unnamed: 0,A,B,C,D
0,0.472806,-4.343217,0.59478,-0.684785
1,-1.189317,23.96519,-5.944857,9.735573
2,-0.514241,0.724182,0.618384,-1.588797
3,-8.170241,-3.528946,0.564796,-2.861268
4,-1.314831,0.538547,-3.440948,-1.035356
5,-42.886896,7.591193,-0.873177,1.543682
6,4.02293,2.907292,-0.528932,16.744985
7,-2.741945,-0.673569,-0.764633,0.909347
8,0.975959,-0.81561,-3.466502,0.968963
9,-1.059527,-4.168138,0.625117,-0.763913


In [44]:
# Subtract the first row from every row: the row Series is aligned on the column
# labels and broadcast down the rows, so row 0 of the result is all zeros.
df_1 - df_1.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-2.955852,0.271971,-1.849507,1.563028
2,-4.059648,1.611113,-0.064175,0.830905
3,-2.237429,-0.053127,0.089257,1.110817
4,-2.875588,2.087094,-1.971912,0.494461
5,-2.138351,0.361976,-2.826537,2.108114
6,-1.866459,0.574207,-3.571896,1.520032
7,-2.479738,-1.254386,-2.989111,2.560003
8,-1.090401,-0.995833,-1.969769,2.492344
9,-3.058851,-0.009671,-0.081593,0.151262


In [45]:
# Logical operators are applied elementwise to boolean frames as well.
# Two 3x2 boolean frames with matching labels:
df_bool1 = pd.DataFrame(
    {'a': [True, False, False], 'b': [True, True, False]},
    dtype=bool,
)
df_bool2 = pd.DataFrame(
    {'a': [False, True, True], 'b': [True, False, True]},
    dtype=bool,
)

# Elementwise AND: True only where both frames are True.
df_bool1 & df_bool2

Unnamed: 0,a,b
0,False,True
1,False,False
2,False,False


In [46]:
# Elementwise logical OR of the two boolean frames.
df_bool1 | df_bool2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [47]:
# Convert the frame to a plain 2-D NumPy array; the row and column labels are dropped.
np.asarray(df_1)

array([[ 2.11503386, -0.23024409,  1.681294  , -1.46031219],
       [-0.84081842,  0.04172719, -0.16821262,  0.10271609],
       [-1.94461385,  1.38086846,  1.61711853, -0.62940706],
       [-0.12239541, -0.28337071,  1.77055148, -0.34949541],
       [-0.76055378,  1.85684969, -0.29061759, -0.96585099],
       [-0.02331715,  0.13173161, -1.1452426 ,  0.64780194],
       [ 0.24857507,  0.34396275, -1.89060191,  0.05971937],
       [-0.36470461, -1.48462995, -1.30781688,  1.09969041],
       [ 1.02463335, -1.22607671, -0.28847528,  1.0320315 ],
       [-0.94381708, -0.23991531,  1.59970081, -1.30904995]])

# Great Job!