In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,0.658546,3,X,0.546451
1,1,0.091238,3,Y,0.071254
2,2,0.310355,2,Y,-2.024788
3,3,0.618982,3,Y,1.499762
4,4,0.923229,1,Z,-0.980191


In [4]:
data.isnull().sum()

Unnamed: 0    0
A             0
B             0
C             0
D             0
dtype: int64

# Handling missing data
# You can fill missing values with a specific value, like 0

In [12]:
df_filled = data.fillna(0)

# Or you can drop rows with missing values


In [13]:
df_dropped = data.dropna()

# Forward Fill (ffill) and Backward Fill (bfill):
Forward fill (ffill) replaces missing values with the previous non-missing value in the column. (10,20,nan,30) 20,20...                               
Backward fill (bfill) replaces missing values with the next non-missing value in the column. (10,20,nan,30)  30,30...

In [15]:
f_filled_df = data.fillna(method='ffill')

In [16]:
b_filled_df = data.fillna(method='bfill')

# Mean/Median/Mode Imputation:
Fill missing values with the mean,                                 
median,                           
mode.

In [51]:

# Select only numeric columns
numeric_columns = df.select_dtypes(include=[np.number])

# Calculate the mean for each numeric column
mean_values = numeric_columns.median()

# Fill missing values in numeric columns with their respective means
numeric_columns.fillna(mean_values, inplace=True)

# Custom Imputation:
You can fill missing values with a custom value or a value based on your domain knowledge.

In [28]:
# custom filling
data.fillna(-1)
# Fill with a Value Based on Domain Knowledge
a = data['A'].fillna(10,inplace=True)

# Using fillna with method='pad':
Similar to forward fill, this method replaces missing values with the previous non-missing value in the column.

In [30]:
# Fill missing values using 'pad' method (same as forward fill)
df_pad_filled = data.fillna(method='pad')

# Using fillna with a Function:
You can create a custom function to fill missing values based on your criteria.

In [31]:
data.fillna(lambda x: x.mean() if x.name=="A" else x.mode.loc[0])

Unnamed: 0,A,B,C,D
0,0.658546,3.0,X,0.546451
1,0.091238,3.0,Y,0.071254
2,0.310355,2.0,Y,-2.024788
3,10.000000,<function <lambda> at 0x000001AE9DB67820>,<function <lambda> at 0x000001AE9DB67820>,1.499762
4,0.923229,1.0,Z,-0.980191
...,...,...,...,...
95,0.182990,2.0,Z,-1.660145
96,0.556930,3.0,Y,0.385849
97,0.820939,3.0,X,-0.813855
98,0.326961,3.0,Y,0.795704


# Interpolation:
Interpolation methods estimate missing values based on the values before and after the missing data point.                                          
Linear interpolation is a common method, but you can use others like polynomial or spline interpolation.

# Linear Interpolation

In [18]:
linear_df = data.interpolate(method='linear')

# Polynomial Interpolation (e.g., degree 2)

In [20]:
poly_df = data.interpolate(method='polynomial',order=2)

In [21]:
poly_df.isnull().sum()

A     0
B     0
C    20
D     0
dtype: int64

# Spline Interpolation

In [22]:
spline = data.interpolate(method="spline",order=2)

# Custom python code

In [45]:
import pandas as pd
import numpy as np

df = pd.read_csv('data1.csv')
# Custom function to fill missing values in column 'A' with the mean of the column
def fill_missing_with_mean(df, column_name):
    mean_value = df[column_name].mean()
    df[column_name].fillna(mean_value, inplace=True)

# Apply the custom function to fill missing values in column 'A'
fill_missing_with_mean(df, 'A')


   Unnamed: 0    A    B  C
0           0  1.0 -1.0  1
1           1  2.0  2.0  2
2           2  3.0  3.0  3
3           3  4.0  4.0  4
4           4  5.0  5.0  5


In [46]:
# Custom function to fill missing values in column 'B' with a specific value (e.g., -1)
def fill_missing_with_value(df, column_name, value):
    df[column_name].fillna(value, inplace=True)
# Apply the custom function to fill missing values in column 'B'
fill_missing_with_value(df, 'B', -1)

In [42]:
# Custom function to drop rows with any missing values
def drop_rows_with_missing(df):
    df.dropna(axis=0, how='any', inplace=True)

# Apply the custom function to drop rows with missing values
drop_rows_with_missing(df)

A    0
B    0
C    0
D    0
dtype: int64

In [47]:
# Custom function to fill missing values in column 'C' with the mode of the column
def fill_missing_with_mode(df, column_name):
    mode_value = df[column_name].mode().iloc[0]
    df[column_name].fillna(mode_value, inplace=True)

# Apply the custom function to fill missing values in column 'C'
fill_missing_with_mode(df, 'C')
