In [1]:
import pandas as pd
import numpy as np

# Load the raw grade sheet exactly as stored on disk and display it.
df = pd.read_csv(filepath_or_buffer="grades.csv")
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57,69,52
1,2,95,67,68
2,3,83,na,49
3,4,81,49,
4,5,,95,80
5,6,95,93,69
6,7,95,56,50
7,8,72,60,56
8,9,84,47,50
9,10,90,51,63


In [2]:
import pandas as pd
import numpy as np

# isnull() returns a boolean frame of the same shape: True marks a missing (NaN) cell.
# Only real NaN cells are flagged; the text "na" in row 2 of Midterm still shows False.
df = pd.read_csv(filepath_or_buffer="grades.csv")
df.isnull()

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,True
4,False,True,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv(filepath_or_buffer="grades.csv")
df.isna()

# isna() also checks for null values: isnull() and isna() do exactly the same thing.
# Then why are there two different functions?
# Because pandas' DataFrames are modelled on R's data frames, and in R na and null are two separate things.
# pandas itself is built on top of numpy, which has neither na nor null values;
# numpy has NaN ("Not a Number") instead, so pandas uses NaN for missing values as well.

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,True
4,False,True,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [4]:
import pandas as pd
import numpy as np

# any() collapses each column of the isnull() mask to a single boolean:
# True when the column holds at least one null value, False otherwise.
df = pd.read_csv(filepath_or_buffer="grades.csv")
df.isnull().any(axis=0)

Roll No.      False
Assignment     True
Midterm        True
Final          True
dtype: bool

In [5]:
import pandas as pd
import numpy as np

# sum() counts the True entries of the isnull() mask, i.e. the NaN cells per column.
# Placeholder text such as "na" or "N/a" was read in as ordinary strings, so it is not counted here.
df = pd.read_csv(filepath_or_buffer="grades.csv")
df.isnull().sum(axis=0)

Roll No.      0
Assignment    1
Midterm       1
Final         1
dtype: int64

## To handle the above problem, we can do the following:

In [6]:
import pandas as pd
import numpy as np

# Extra markers that read_csv should treat as missing, on top of truly empty cells.
# Later cells reuse this list, so it must be defined before they run.
missing_values = [
    "N/a",
    "na",
    np.nan,
]
df = pd.read_csv("grades.csv", na_values=missing_values)

# With the extra markers declared, the placeholder strings are counted as nulls too.
df.isnull().sum(axis=0)

Roll No.      0
Assignment    2
Midterm       2
Final         2
dtype: int64

# How to clean this data: the following are some ways

## Removing Rows - using dropna() we can remove every row that contains an empty cell.

In [7]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# dropna() hands back a new DataFrame without the rows that contain a null;
# df itself is left untouched.
df2 = df.dropna(axis=0, how="any")
df2

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0
11,12,86.0,74.0,77.0
12,13,97.0,72.0,63.0
14,15,84.0,44.0,130.0


In [8]:
# If you want to change the original DataFrame, use the inplace = True argument.

import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# With inplace=True the incomplete rows are removed from df itself,
# so the result is not assigned to a new name.
df.dropna(axis=0, how="any", inplace=True)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0
11,12,86.0,74.0,77.0
12,13,97.0,72.0,63.0
14,15,84.0,44.0,130.0


In [9]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# subset restricts the null check to the listed columns: only rows whose
# Final mark is missing are dropped; nulls in other columns are kept.
df.dropna(axis=0, subset=["Final"], inplace=True)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,,49.0
4,5,,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0
10,11,,75.0,39.0


In [10]:
import pandas as pd
import numpy as np

# Reload the full table (nothing dropped) with the placeholder markers from
# cell In [6] parsed as NaN, ready for the fillna() examples below.
df = pd.read_csv(filepath_or_buffer="grades.csv", na_values=missing_values)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,,49.0
3,4,81.0,49.0,
4,5,,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0


## Replace Empty Cells - using fillna() we can insert a new value in place of empty cells. This function replaces NA/NaN cells with the value we pass in, for example 0.

In [11]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# Calling fillna() on the whole frame replaces every empty cell, in every column, with 0.
df.fillna(value=0, inplace=True)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,0.0,49.0
3,4,81.0,49.0,0.0
4,5,0.0,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0


In [12]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# Assign the filled column back rather than calling
# df["Final"].fillna(33, inplace=True): that chained in-place call operates on
# a temporary Series, which recent pandas versions warn about and which does
# not update df once Copy-on-Write is active.
df["Final"] = df["Final"].fillna(33)
df

# This replaces the empty cells of the specified column only;
# empty cells in the other columns are left as NaN.

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,,49.0
3,4,81.0,49.0,33.0
4,5,,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0


## We can also replace empty cells using mean(), median() and mode(). 

In [13]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# Mean of the Assignment marks that are present (NaN cells are skipped by mean()).
x = df["Assignment"].mean()

# Assign the filled column back instead of using a chained
# df["Assignment"].fillna(x, inplace=True), which acts on a temporary Series
# and does not reliably update df (warning in recent pandas, no effect under
# Copy-on-Write).
df["Assignment"] = df["Assignment"].fillna(x)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,,49.0
3,4,81.0,49.0,
4,5,83.166667,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0


In [14]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# Median of the Midterm marks that are present (NaN cells are skipped by median()).
x = df["Midterm"].median()

# Assign the filled column back instead of using a chained
# df["Midterm"].fillna(x, inplace=True), which acts on a temporary Series
# and does not reliably update df (warning in recent pandas, no effect under
# Copy-on-Write).
df["Midterm"] = df["Midterm"].fillna(x)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,68.0,49.0
3,4,81.0,49.0,
4,5,,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0


In [15]:
import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# mode() returns a Series because several values can tie for most frequent;
# [0] takes the first of them.
x = df["Final"].mode()[0]

# Assign the filled column back instead of using a chained
# df["Final"].fillna(x, inplace=True), which acts on a temporary Series
# and does not reliably update df (warning in recent pandas, no effect under
# Copy-on-Write).
df["Final"] = df["Final"].fillna(x)
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,,49.0
3,4,81.0,49.0,50.0
4,5,,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0


## Fixing wrong data - wrong data doesn't have to be an empty cell or a wrong format; it can simply be an incorrect value that we can spot by looking at it, because we know what the data should look like.

### Replacing values 

In [17]:
# Here the Final column contains a mark of 130 (row 14), which looks wrong.

import pandas as pd
import numpy as np

# missing_values is the marker list defined in cell In [6].
df = pd.read_csv("grades.csv", na_values=missing_values)

# Overwrite the single cell at row label 14, column "Final", with 33.
df.loc[14, "Final"] = 33
df

Unnamed: 0,Roll No.,Assignment,Midterm,Final
0,1,57.0,69.0,52.0
1,2,95.0,67.0,68.0
2,3,83.0,,49.0
3,4,81.0,49.0,
4,5,,95.0,80.0
5,6,95.0,93.0,69.0
6,7,95.0,56.0,50.0
7,8,72.0,60.0,56.0
8,9,84.0,47.0,50.0
9,10,90.0,51.0,63.0
