# 1. Deletion method

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [None]:
# Fetch the California housing dataset from scikit-learn
house = fetch_california_housing()


In [None]:
# Wrap the feature matrix in a pandas DataFrame, one column per feature name,
# and preview the first five rows
df = pd.DataFrame(data=house.data, columns=house.feature_names)
df.head(5)

In [None]:
# Preprocessing
# Step 1: count the missing values in each column
df.isna().sum()

In [None]:
# Simulate missing data: overwrite every 10th row (all columns) with NaN
import numpy as np
df.iloc[::10, :] = np.nan

In [None]:
# Count the missing values per column again to confirm the NaNs are present
df.isna().sum()

In [None]:
# Method 1 (deletion): drop every row that contains at least one missing value.
# dropna() returns a new DataFrame, so rebind df to keep the result.
df = df.dropna()

In [None]:
# Count missing values per column (all zeros once rows with NaN are removed)
df.isna().sum()

In [None]:
# Task: load the Iris dataset, insert some missing values, and handle them using method 1 (deletion)

# 2. Mean/Median Imputation method

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer 

In [None]:
# Fetch the California housing dataset from scikit-learn
data = fetch_california_housing()

In [None]:
# Wrap the feature matrix in a DataFrame labelled by feature name and
# preview the first five rows
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df.head(5)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Simulate missing data: overwrite every 10th row (all columns) with NaN
df.iloc[::10, :] = np.nan

In [None]:
# Count the missing values per column after the NaN injection
df.isna().sum()

In [None]:
# Mean imputation: learn each column's mean from the observed values,
# then replace every NaN with the mean of its column
imputer = SimpleImputer(strategy='mean')
imputer.fit(df)
df_imputed = imputer.transform(df)

In [None]:
# Rebuild a DataFrame from the imputed values, reusing the original column labels
df_imputed = pd.DataFrame(data=df_imputed, columns=df.columns)

In [None]:
# Count the missing values per column in the imputed DataFrame
df_imputed.isna().sum()

# 3. Regression imputation method

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

In [None]:
# Fetch the California housing dataset, wrap its feature matrix in a
# DataFrame labelled by feature name, and preview the first five rows
data = fetch_california_housing()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df.head(5)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Simulate missing data: set rows 10-19 of the first column to NaN
df.iloc[10:20, 0] = np.nan

In [None]:
# Count the missing values per column after the NaN injection
df.isna().sum()

In [None]:
# Split the dataset into rows with and without missing values.
# .copy() makes x_missing an independent DataFrame rather than a slice of df,
# so assigning the predicted column to it later does not raise
# SettingWithCopyWarning.
x_missing = df[df.isna().any(axis=1)].copy()
x_no_missing = df.dropna()

In [None]:
# MedInc is the regression target; every other column is a predictor
x_train = x_no_missing.drop('MedInc', axis=1)  # predictors from complete rows
y_train = x_no_missing.loc[:, 'MedInc']        # target from complete rows
x_test = x_missing.drop('MedInc', axis=1)      # predictors of the rows to impute

In [None]:
# Create a linear regression model and fit it on the complete rows,
# with the remaining features as inputs and MedInc as the target
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# Predict MedInc for the incomplete rows and store the predictions
# in place of the missing values
predicted_medinc = model.predict(x_test)
x_missing['MedInc'] = predicted_medinc


In [None]:
# Concatenate the two datasets back together, then sort by index so the rows
# return to their original order (concat alone would place all the imputed
# rows ahead of the complete ones)
x_imputed = pd.concat([x_missing, x_no_missing], axis=0).sort_index()

In [None]:
# Print the per-column count of missing values in the recombined data
print(x_imputed.isna().sum())

# 4. Using the interpolation method


In [None]:
from sklearn.datasets import load_iris
import pandas as pd

In [None]:
# Load the Iris dataset and separate the features from the target
data = load_iris()
X = pd.DataFrame(data=data.data, columns=data.feature_names)  # feature matrix
y = data.target  # target values

In [None]:
# Simulate missing data in two different columns
X.iloc[10:20, 0] = None  # rows 10-19 of the first column
X.iloc[20:30, 2] = None  # rows 20-29 of the third column

In [None]:
# Count the missing values in each column
X.isna().sum()

In [None]:
# Fill the gaps by linear interpolation down each column
X = X.interpolate(method='linear', axis=0)

In [None]:
# Count the missing values per column after interpolation
X.isna().sum()

# 5. Using multiple interpolation methods

In [None]:
from sklearn.datasets import load_iris
import pandas as pd

In [None]:
# Load the Iris dataset; keep the target now, because x is rebound
# to a DataFrame of the features afterwards
x = load_iris()
y = x.target

In [None]:
# Replace the loaded dataset object with a DataFrame of its features,
# labelled by feature name
x = pd.DataFrame(data=x.data, columns=x.feature_names)

In [None]:
# Preview the first five rows
x.head(5)

In [None]:
# Count the missing values in each column
x.isna().sum()

In [None]:
# Put some missing values into both columns that are interpolated below,
# so that each interpolation method actually has gaps to fill
x.iloc[10:20, 0] = None  # sepal length (cm): rows 10-19
x.iloc[20:30, 1] = None  # sepal width (cm): rows 20-29

In [None]:
# Count the missing values per column after the injection
x.isna().sum()

In [None]:
# Use a different interpolation method per column:
# linear for sepal length, quadratic for sepal width
sepal_length = x['sepal length (cm)']
sepal_width = x['sepal width (cm)']
x['sepal length (cm)'] = sepal_length.interpolate(method='linear')
x['sepal width (cm)'] = sepal_width.interpolate(method='quadratic')

In [None]:
# Count the missing values per column after interpolation
x.isna().sum()

# 6. Multiple Imputation technique


In [None]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_iterative_imputer # must be imported before IterativeImputer, which is experimental
from sklearn.impute import IterativeImputer # multiple (iterative) imputer
import pandas as pd

In [None]:
# Load the Iris dataset and wrap its features in a DataFrame
# labelled by feature name
data = load_iris()
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [None]:
# Add some missing values: blank out rows 10-19 of the first column
X.iloc[10:20, 0] = None

# Impute missing values using IterativeImputer
# (at most 10 imputation rounds; fixed random_state for reproducible results)
imp = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imp.fit_transform(X)  # a NumPy array by default, so column labels are lost

# Check if there are any missing values left; restore the column names so
# the report is labelled by feature name rather than by position
pd.DataFrame(X_imputed, columns=X.columns).isnull().sum()