In [1]:
# Working with Missing Data in Pandas
# Missing data occurs when no information is provided for one or more
# items, or for a whole unit. Missing data is a very big problem in real-life scenarios.
# Missing data is also referred to as NA (Not Available) values in pandas.
# Many datasets simply arrive with missing data,
# either because the data exists but was not collected or because it never existed.
# For example, some users being surveyed may choose not to share their income,
# and others may choose not to share their address; in this way, many values end up missing.
 
# Dataset 1 : https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv
 
# Dataset 2 : https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv



In [2]:
import warnings

import pandas as pd

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Dataset 1 (ufo.csv): load it and preview the first ten rows.
UFO_CSV_URL = 'https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv'
ufo = pd.read_csv(UFO_CSV_URL)
ufo.head(10)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
7,Alma,,DISK,MI,7/15/1936 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
9,Hubbard,,CYLINDER,OR,6/15/1937 0:00


In [3]:
# Summarise the frame: per-column non-null counts and dtypes. A column whose
# non-null count is below the number of entries contains missing values.
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18241 entries, 0 to 18240
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   City             18215 non-null  object
 1   Colors Reported  2882 non-null   object
 2   Shape Reported   15597 non-null  object
 3   State            18241 non-null  object
 4   Time             18241 non-null  object
dtypes: object(5)
memory usage: 712.7+ KB


In [4]:
# Check for missing values: count the NA/NaN entries in each column.
# isnull() flags every missing cell as True; sum() adds the flags up per column.
ufo.isnull().sum()



In [5]:
# Removing missing values with dropna()
# Drop every row that contains at least one missing value, then report the
# (rows, columns) shape of what remains. `ufo` itself is left unchanged.

ufo.dropna(axis=0, how='any').shape


(2486, 5)

In [6]:
# fillna()
# Fill NA/NaN values with user-supplied placeholder values.
#
# A {column: value} mapping is passed to DataFrame.fillna and the result is
# bound back to `ufo`. The previous form, `ufo['City'].fillna(..., inplace=True)`,
# fills an intermediate Series (chained assignment); under pandas Copy-on-Write
# that does not propagate to `ufo`, so the frame would keep its NaNs.

ufo = ufo.fillna({
    'City': 'Unknown City',
    'Colors Reported': 'Unknown Color',
    'Shape Reported': 'Unknown Shape',
})

In [7]:
# Preview the first five rows (head() defaults to n=5) after the fill.
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,Unknown Color,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,Unknown Color,OTHER,NJ,6/30/1930 20:00
2,Holyoke,Unknown Color,OVAL,CO,2/15/1931 14:00
3,Abilene,Unknown Color,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,Unknown Color,LIGHT,NY,4/18/1933 19:00


In [8]:
# Dataset 2 (sample11.csv): load it and summarise non-null counts and dtypes.
SAMPLE_CSV_URL = 'https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv'
data = pd.read_csv(SAMPLE_CSV_URL)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          7 non-null      int64  
 1   Name        7 non-null      object 
 2   Marks       5 non-null      float64
 3   Percentage  3 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 356.0+ bytes


In [9]:
# Boolean mask of the frame: True wherever a value is missing.
# isna() is the alias of isnull().
data.isna()

Unnamed: 0,Id,Name,Marks,Percentage
0,False,False,False,False
1,False,False,False,True
2,False,False,True,False
3,False,False,False,True
4,False,False,True,True
5,False,False,False,True
6,False,False,False,False


In [10]:
# Backward fill (bfill)
# Replace each missing value with the next valid value that follows it.
#
# Series.bfill() is used instead of fillna(method='bfill'): the `method`
# argument of fillna is deprecated. The result is assigned back to the column
# rather than filled with inplace=True on an intermediate Series, which does
# not update `data` under pandas Copy-on-Write.

data['Marks'] = data['Marks'].bfill()

In [11]:
# Forward fill (ffill)
# Replace each missing value with the last valid value that precedes it.
#
# Series.ffill() is used instead of fillna(method='ffill'): the `method`
# argument of fillna is deprecated. The result is assigned back to the column
# rather than filled with inplace=True on an intermediate Series, which does
# not update `data` under pandas Copy-on-Write.

data['Percentage'] = data['Percentage'].ffill()

In [12]:
# Display the frame after the backward fill on 'Marks' and the forward fill on 'Percentage'.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,78.0
2,3,Alex,12.0,67.0
3,4,Alex,12.0,67.0
4,5,Alex,54.0,67.0
5,6,Alex,54.0,67.0
6,7,Alex,65.0,66.0


In [13]:
# Reload the sample dataset from the CSV so `data` holds the original values
# again; the earlier cells filled in its missing entries.
data = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv')

In [14]:
# Fill missing values with the column mean.
# mean() skips NaN entries by default, so each mean is taken over the values
# that are present; it is rounded to one decimal place.
mean_val_marks = data['Marks'].mean().round(1)
mean_percentage = data['Percentage'].mean().round(1)

# Fill both columns in one frame-level call and bind the result back to `data`.
# Filling `data['Marks']` with inplace=True acts on an intermediate Series
# (chained assignment) and does not update `data` under pandas Copy-on-Write.
data = data.fillna({'Marks': mean_val_marks, 'Percentage': mean_percentage})

In [15]:
# Display the frame after filling the missing values with the rounded column means.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,70.3
2,3,Alex,46.4,67.0
3,4,Alex,12.0,70.3
4,5,Alex,46.4,70.3
5,6,Alex,54.0,70.3
6,7,Alex,65.0,66.0


In [17]:
# Reload the sample dataset once more to restore the original missing values,
# then display it.
data = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv')
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [18]:
# Filling missing values using interpolate()
# interpolate() estimates each missing value from its neighbours (linear by
# default), which is only meaningful for numeric data. It is therefore applied
# to the numeric columns only: calling DataFrame.interpolate() on a frame that
# still holds an object-dtype column such as 'Name' is deprecated in pandas.
# The result goes into a copy, so `data` keeps its original missing values.

numeric_cols = data.select_dtypes(include='number').columns
data_interpolated = data.copy()
data_interpolated[numeric_cols] = data[numeric_cols].interpolate()
data_interpolated

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,72.5
2,3,Alex,17.5,67.0
3,4,Alex,12.0,66.75
4,5,Alex,33.0,66.5
5,6,Alex,54.0,66.25
6,7,Alex,65.0,66.0


In [None]:
# Dealing with duplicate data: handling and removing duplicate records

# Build a small demo DataFrame. The record ('Mark', 45) appears twice,
# in the first row and in the last row.
student_names = ['Mark', 'Ali', 'Bob', 'John', 'Johny', 'Mark']
scores = [45, 65, 76, 44, 39, 45]

data = {'StudentName': student_names, 'Score': scores}
df = pd.DataFrame(data=data)
df
 