## Pandas

In [1]:
import numpy as np
import pandas as pd

### Pandas Series

In [2]:
# Five single-character labels; used below both as Series data and as an index.
x = list('abcde')

In [3]:
# The integers 1..5; used below both as Series data and as an index.
y = list(range(1, 6))

In [4]:
# Mapping of the integers 1..5 to the letters 'a'..'e'.
z = dict(enumerate('abcde', start=1))

In [5]:
# With no index given, pandas assigns the default integer labels 0..4.
pd.Series(data= x)

0    a
1    b
2    c
3    d
4    e
dtype: object

In [6]:
# Explicit index: the values of x are labelled with the integers in y.
pd.Series(data = x, index = y)

1    a
2    b
3    c
4    d
5    e
dtype: object

In [7]:
# Roles swapped: the integers in y are the data, the letters in x are the labels.
pd.Series(data = y, index = x)

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [8]:
# Keep a string-valued Series (labels 1..5) for the addition example below.
a = pd.Series(data = x, index = y)
a

1    a
2    b
3    c
4    d
5    e
dtype: object

In [9]:
# A second Series built from the same data and index as `a`.
b = pd.Series(data = x, index = y)
b

1    a
2    b
3    c
4    d
5    e
dtype: object

In [10]:
# Values are paired by index label; for strings, + concatenates ('a' + 'a' -> 'aa').
a+b

1    aa
2    bb
3    cc
4    dd
5    ee
dtype: object

In [11]:
# Rebind a and b to integer-valued Series (labels 'a'..'e');
# now + adds the numbers label by label.
a = pd.Series(data = y, index = x)
b = pd.Series(data = y, index = x)
a+b

a     2
b     4
c     6
d     8
e    10
dtype: int64

In [12]:
# Five rows of four integers each; they become the rows of the DataFrame below.
A, B, C, D, E = (
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 0, 1, 2],
    [3, 4, 5, 6],
    [7, 8, 9, 0],
)


In [13]:
# Build the frame from the five row lists, naming each argument explicitly:
#   data    -> the values, one inner list per row
#   index   -> the row labels
#   columns -> the column names
df = pd.DataFrame(
    data=[A, B, C, D, E],
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['w', 'x', 'y', 'z'],
)

In [14]:
df

Unnamed: 0,w,x,y,z
a,1,2,3,4
b,5,6,7,8
c,9,0,1,2
d,3,4,5,6
e,7,8,9,0


### Adding a new column 

In [17]:
# Assigning to a new column label creates the column; p is the row-wise sum y + z.
df['p'] = df['y'] + df['z']

In [18]:
df

Unnamed: 0,w,x,y,z,P,p
a,1,2,3,4,7,7
b,5,6,7,8,15,15
c,9,0,1,2,3,3
d,3,4,5,6,11,11
e,7,8,9,0,9,9


In [23]:
# Remove the stray upper-case column 'P' (distinct from the 'p' column added above).
# No cell shown in this notebook creates 'P', so on a fresh kernel a plain drop
# would raise KeyError. errors='ignore' makes the drop a no-op when the column
# is absent, and columns= states the axis explicitly instead of axis=1.
df.drop(columns='P', inplace=True, errors='ignore')

In [24]:
df

Unnamed: 0,w,x,y,z,p
a,1,2,3,4,7
b,5,6,7,8,15
c,9,0,1,2,3
d,3,4,5,6,11
e,7,8,9,0,9


In [25]:
# With the default axis, drop() removes a row: this returns a new frame without
# row 'e' and leaves df itself unchanged.
df.drop('e')

Unnamed: 0,w,x,y,z,p
a,1,2,3,4,7
b,5,6,7,8,15
c,9,0,1,2,3
d,3,4,5,6,11


In [26]:
# Row 'e' is still present: the drop above returned a new frame.
df

Unnamed: 0,w,x,y,z,p
a,1,2,3,4,7
b,5,6,7,8,15
c,9,0,1,2,3
d,3,4,5,6,11
e,7,8,9,0,9


In [35]:
# Remove row 'e' from df itself (inplace=True mutates df and returns None).
# errors='ignore' keeps the cell safe to re-run: without it, a second
# execution raises KeyError because 'e' has already been removed.
df.drop(index='e', inplace=True, errors='ignore')

## Accessing elements in a DataFrame

In [38]:
# Accessing a single column: df[<column name>] returns that column as a Series.
print(df)
df['y']

   w  x  y  z   p
a  1  2  3  4   7
b  5  6  7  8  15
c  9  0  1  2   3
d  3  4  5  6  11


a    3
b    7
c    1
d    5
Name: y, dtype: int64

In [39]:
# Access a single row by its index label with .loc;
# the result is a Series keyed by column name.
df.loc['a']

w    1
x    2
y    3
z    4
p    7
Name: a, dtype: int64

In [41]:
# Access rows by integer position (0-based) with .iloc:
# position 1 is row 'b' and position 2 is row 'c'.
print(df.iloc[1])
print(df.iloc[2])

w     5
x     6
y     7
z     8
p    15
Name: b, dtype: int64
w    9
x    0
y    1
z    2
p    3
Name: c, dtype: int64


In [46]:
# Access a single element.
print(df)
# To read the value 5 we need its row label and column name: df.loc[<row label>, <column name>]
print(f"\n Element found {df.loc['d','y']}")

   w  x  y  z   p
a  1  2  3  4   7
b  5  6  7  8  15
c  9  0  1  2   3
d  3  4  5  6  11

 Element found 5


### Conditional accessing

In [49]:
# Comparing a DataFrame with a scalar is applied element by element.
print(df > 3)
# The statement above prints a True/False table: True where the element is greater than 3,
# False where it is 3 or less.
df == 3 # True only where the element equals 3.

       w      x      y      z      p
a  False  False  False   True   True
b   True   True   True   True   True
c   True  False  False  False  False
d  False   True   True   True   True


Unnamed: 0,w,x,y,z,p
a,False,False,True,False,False
b,False,False,False,False,False
c,False,False,False,False,True
d,True,False,False,False,False


In [50]:
# Use the True/False table as a mask to get the values themselves.
df[df > 3]
# Elements greater than 3 are kept (shown as floats, since the columns now contain NaN);
# every element that is 3 or less is replaced by NaN.

Unnamed: 0,w,x,y,z,p
a,,,,4.0,7.0
b,5.0,6.0,7.0,8.0,15.0
c,9.0,,,,
d,,4.0,5.0,6.0,11.0


In [52]:
# Select the rows that satisfy a condition on one column.
df[df['w'] > 3]

# Returns the rows in which column w has a value greater than 3.

Unnamed: 0,w,x,y,z,p
b,5,6,7,8,15
c,9,0,1,2,3


In [55]:
# Keep only selected columns of the rows where w > 3.
# .loc takes the row mask and the list of column names in a single step.
print(df.loc[df['w'] > 3, ['w', 'x']])  # 'w' and 'x' are column names
print()
print(df.loc[df['w'] > 3, ['w']])  # a one-item list keeps the result a DataFrame


   w  x
b  5  6
c  9  0

   w
b  5
c  9


### Combining conditions with `&` (and) and `|` (or)

In [60]:
print(df)
# & keeps the rows where both conditions hold. Each comparison needs its own
# parentheses because & binds more tightly than > in Python.
df[(df['w']>3) & (df['z'] > 2)]

   w  x  y  z   p
a  1  2  3  4   7
b  5  6  7  8  15
c  9  0  1  2   3
d  3  4  5  6  11


Unnamed: 0,w,x,y,z,p
b,5,6,7,8,15


In [63]:
print(df)
# | keeps the rows where at least one of the conditions holds.
df[(df['w']>3) | (df['z'] > 4)]

   w  x  y  z   p
a  1  2  3  4   7
b  5  6  7  8  15
c  9  0  1  2   3
d  3  4  5  6  11


Unnamed: 0,w,x,y,z,p
b,5,6,7,8,15
c,9,0,1,2,3
d,3,4,5,6,11


## Handling Missing data

In [65]:
import numpy as np

In [67]:
# Column-oriented sample data with progressively more missing values:
# 'a' is complete, 'b' lacks its last value, and so on until 'e',
# which keeps only its first value.
d = dict(
    a=[1, 2, 3, 4, 5],
    b=[6, 7, 8, 9, np.nan],
    c=[0, 1, 2, np.nan, np.nan],
    d=[3, 4, np.nan, np.nan, np.nan],
    e=[5, np.nan, np.nan, np.nan, np.nan],
)

In [81]:
d

{'a': [1, 2, 3, 4, 5],
 'b': [6, 7, 8, 9, nan],
 'c': [0, 1, 2, nan, nan],
 'd': [3, 4, nan, nan, nan],
 'e': [5, nan, nan, nan, nan]}

In [82]:
# Note: this rebinds df; the w/x/y/z/p frame from the earlier sections is replaced
# by a frame whose columns are the keys of d.
df = pd.DataFrame(d)

In [83]:
df

Unnamed: 0,a,b,c,d,e
0,1,6.0,0.0,3.0,5.0
1,2,7.0,1.0,4.0,
2,3,8.0,2.0,,
3,4,9.0,,,
4,5,,,,


In [84]:
# Drop every row that contains at least one missing value.
df.dropna() # returns a new frame; pass inplace=True to modify df itself.

Unnamed: 0,a,b,c,d,e
0,1,6.0,0.0,3.0,5.0


In [85]:
# df still has all five rows: the dropna() call above did not modify it.
df

Unnamed: 0,a,b,c,d,e
0,1,6.0,0.0,3.0,5.0
1,2,7.0,1.0,4.0,
2,3,8.0,2.0,,
3,4,9.0,,,
4,5,,,,


In [86]:
# Using a threshold to select rows.
# thresh sets the minimum number of non-null values a row must have in order to be kept.
df.dropna(thresh=3) # keeps the rows that have at least 3 non-null values.

Unnamed: 0,a,b,c,d,e
0,1,6.0,0.0,3.0,5.0
1,2,7.0,1.0,4.0,
2,3,8.0,2.0,,


In [87]:
# Same idea with a stricter threshold.
# The frame has five columns, so thresh=5 keeps only the rows with no missing value.
df.dropna(thresh=5) # keeps the rows that have at least 5 non-null values.

Unnamed: 0,a,b,c,d,e
0,1,6.0,0.0,3.0,5.0


### To fill a missing value

In [89]:
df.fillna(1) # returns a copy in which every missing value is replaced by 1

Unnamed: 0,a,b,c,d,e
0,1,6.0,0.0,3.0,5.0
1,2,7.0,1.0,4.0,1.0
2,3,8.0,2.0,1.0,1.0
3,4,9.0,1.0,1.0,1.0
4,5,1.0,1.0,1.0,1.0


In [None]:
# To fill using average (mean) values:
# df.mean() gives one mean per column (missing values are skipped), and fillna
# matches that Series to the columns by name, so each gap is filled with the
# mean of its own column. Like fillna(1) above, this returns a new frame.
df.fillna(df.mean())
