In [66]:
import pandas as pd
import numpy as np

In [67]:
df = pd.DataFrame(np.arange(0, 15).reshape(5, 3),
                  index=['a', 'b', 'c', 'd', 'e'],
                  columns=['c1', 'c2', 'c3'])

df

Unnamed: 0,c1,c2,c3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


In [68]:
df.isna()

Unnamed: 0,c1,c2,c3
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False
e,False,False,False


In [69]:
df.isnull()

Unnamed: 0,c1,c2,c3
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False
e,False,False,False


In [70]:
df['c4'] = np.nan # create a new column of nan values
df.loc['f'] = np.arange(15, 19) # create a new row of some values
df.loc['g'] = np.nan # create a new row of nan values
df['c5'] = np.nan # create a new column of nan values

df['c4'][0] = 20 # modify one of the column values
df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['c4'][0] = 20 # modify one of the column values
  df['c4'][0] = 20 # modify one of the column values


Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [71]:
df.isnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [72]:
df.isna()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [73]:
df.isna().sum() # count the number of nan in each column

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [74]:
# gets the total number of nan values in the dataframe
df.isnull().sum().sum()

np.int64(15)

In [75]:
df.count() # number of non NA values in each column

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [76]:
df.notnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,True,True,True,True,False
b,True,True,True,False,False
c,True,True,True,False,False
d,True,True,True,False,False
e,True,True,True,False,False
f,True,True,True,True,False
g,False,False,False,False,False


#### Selecting out or dropping the NaN values from the dataframe

In [77]:
df.c4[df.c4.notnull()] # selects only non null values from column - C4

a    20.0
f    18.0
Name: c4, dtype: float64

In [78]:
df.c4.dropna() # drop the NA values from the column

a    20.0
f    18.0
Name: c4, dtype: float64

In [79]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [80]:
# drops all NA values from the dataframe, both rows and column even if there 
# is one NA value
df.dropna() 

Unnamed: 0,c1,c2,c3,c4,c5


In [81]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [82]:
# drops the rows, only when all values of that row is NaN
df.dropna(axis=0, how='all')

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,


In [83]:
# drops the column, only when all values of that column is NaN
df.dropna(axis=1, how='all')

Unnamed: 0,c1,c2,c3,c4
a,0.0,1.0,2.0,20.0
b,3.0,4.0,5.0,
c,6.0,7.0,8.0,
d,9.0,10.0,11.0,
e,12.0,13.0,14.0,
f,15.0,16.0,17.0,18.0
g,,,,


In [84]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [85]:
# only drop columns with atleast 3 Non-NaN values
df.dropna(thresh=3, axis=1)

Unnamed: 0,c1,c2,c3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,12.0,13.0,14.0
f,15.0,16.0,17.0
g,,,


#### Handling of missing values

In [86]:
arr = np.array([4, 2, 3, np.nan])
s = pd.Series(arr)

print(arr.sum()) # numpy does not ignore nan in mathematical operations
print(s.sum()) # pandas usually ignores nan in mathematical operations

nan
9.0


#### Filling of missing values 

In [87]:
filled = df.fillna(1)
filled

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,1.0
b,3.0,4.0,5.0,1.0,1.0
c,6.0,7.0,8.0,1.0,1.0
d,9.0,10.0,11.0,1.0,1.0
e,12.0,13.0,14.0,1.0,1.0
f,15.0,16.0,17.0,18.0,1.0
g,1.0,1.0,1.0,1.0,1.0


In [88]:
df.c4.fillna(method='ffill')

  df.c4.fillna(method='ffill')


a    20.0
b    20.0
c    20.0
d    20.0
e    20.0
f    18.0
g    18.0
Name: c4, dtype: float64

#### Handling duplicate data

In [89]:
dict = {'a': ['x'] * 3 + ['y'] * 4, 'b': [1, 1, 2, 3, 3, 4, 4]}
data = pd.DataFrame(dict)
data

Unnamed: 0,a,b
0,x,1
1,x,1
2,x,2
3,y,3
4,y,3
5,y,4
6,y,4


In [90]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [91]:
data.drop_duplicates(ignore_index=True)

Unnamed: 0,a,b
0,x,1
1,x,2
2,y,3
3,y,4


In [92]:
data.drop_duplicates(keep='last')

Unnamed: 0,a,b
1,x,1
2,x,2
4,y,3
6,y,4


In [93]:
data['c'] = range(7)
data

Unnamed: 0,a,b,c
0,x,1,0
1,x,1,1
2,x,2,2
3,y,3,3
4,y,3,4
5,y,4,5
6,y,4,6


In [94]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [95]:
data.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,c
0,x,1,0
2,x,2,2
3,y,3,3
5,y,4,5
