# Inverse coding/ Inverse transform 
Inverse encoding is the process of converting the encoded data back to its original form. This is done by using the inverse_transform() method. This method is available for the following encoders:

LabelEncoder\
OrdinalEncoder\
OneHotEncoder

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
df=sns.load_dataset('titanic')
df.rename(columns={'sex':'gender'},inplace=True)
df.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# impute missing values
df.drop("deck", axis=1, inplace=True)
df["age"]=df["age"].fillna(df["age"].mean())
df["embarked"]=df["embarked"].fillna(df["embarked"].mode()[0])
df["embark_town"]=df["embark_town"].fillna(df["embark_town"].mode()[0])

In [4]:
df.isnull().sum()

survived       0
pclass         0
gender         0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [5]:
le_gender=LabelEncoder()
le_class=LabelEncoder()
le_embark_town=LabelEncoder()
df["gender"]=le_gender.fit_transform(df["gender"])
df["class"]=le_class.fit_transform(df["class"])
df["embark_town"]=le_embark_town.fit_transform(df["embark_town"])

In [6]:
df.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,S,2,man,True,2,no,False
1,1,1,0,38.0,1,0,71.2833,C,0,woman,False,0,yes,False
2,1,3,0,26.0,0,0,7.925,S,2,woman,False,2,yes,True
3,1,1,0,35.0,1,0,53.1,S,0,woman,False,2,yes,False
4,0,3,1,35.0,0,0,8.05,S,2,man,True,2,no,True


In [7]:
# inverse transoform the encoded data
df["gender"]=le_gender.inverse_transform(df["gender"])
df["embark_town"]=le_embark_town.inverse_transform(df["embark_town"])
df["class"]=le_class.inverse_transform(df["class"])

In [8]:
df.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


# One hot encoding

In [9]:
df.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [10]:
encoder=OneHotEncoder()

In [11]:
df = sns.load_dataset('titanic')

cat_columns = ['sex', 'embarked']

encoder = OneHotEncoder(sparse=False)
encoded_df = pd.DataFrame(encoder.fit_transform(df[cat_columns]))
encoded_df.head()
# concatenate the dataframes 
df = pd.concat([df, encoded_df], axis=1)
# df.drop(cat_columns, axis=1, inplace=True)
df.head()




Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,deck,embark_town,alive,alone,0,1,2,3,4,5
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,,Southampton,no,False,0.0,1.0,0.0,0.0,1.0,0.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,C,Cherbourg,yes,False,1.0,0.0,1.0,0.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,,Southampton,yes,True,1.0,0.0,0.0,0.0,1.0,0.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,C,Southampton,yes,False,1.0,0.0,0.0,0.0,1.0,0.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,,Southampton,no,True,0.0,1.0,0.0,0.0,1.0,0.0


In [12]:
# We need to extract the original categories for each encoded column
original_categories = {col: encoder.categories_[i] for i, col in enumerate(cat_columns)}
# Manual creation of feature names
feature_names = []
for i, col in enumerate(cat_columns):
    for category in encoder.categories_[i]:
        feature_names.append(f"{col}_{category}")

encoded_df = pd.DataFrame(encoded_df, columns=feature_names)

df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,deck,embark_town,alive,alone,0,1,2,3,4,5
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,,Southampton,no,False,0.0,1.0,0.0,0.0,1.0,0.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,C,Cherbourg,yes,False,1.0,0.0,1.0,0.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,,Southampton,yes,True,1.0,0.0,0.0,0.0,1.0,0.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,C,Southampton,yes,False,1.0,0.0,0.0,0.0,1.0,0.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,,Southampton,no,True,0.0,1.0,0.0,0.0,1.0,0.0


# Ordinal encoding

In [13]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [14]:
ordinal_encoder=OrdinalEncoder()
ordinal_encoded_data = ordinal_encoder.fit_transform(df[['class']])
ordinal_encoded_data=pd.DataFrame(ordinal_encoded_data)

In [15]:
pd.concat([df, ordinal_encoded_data], axis=1).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,0
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,2.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,2.0


In [16]:
inverse_transformed_data = ordinal_encoder.inverse_transform(ordinal_encoded_data)
pd.DataFrame(inverse_transformed_data).head()

Unnamed: 0,0
0,Third
1,First
2,Third
3,First
4,Third


# Five importnat ways for Imputing Missing Values

You can impute missing values using machine learning models. This process is known as data imputation and is commonly used in data preprocessing to handle missing or incomplete data. There are several methods and models you can use, depending on the nature of your data and the missing values:

**Simple Imputation Techniques:**

Mean/Median Imputation: Replace missing values with the mean or median of the column. Suitable for numerical data.

**Mode Imputation:** Replace missing values with the mode (most frequent value) of the column. Useful for categorical data.

**K-Nearest Neighbors (KNN)**: This algorithm can be used to impute missing values based on the similarity of rows.

**Regression Imputation:** Use a regression model to predict the missing values based on other variables in your dataset.

**Decision Trees and Random Forests:** These can handle missing values inherently. They can also be used to predict missing values based on the patterns learned from the other data.

**Advanced Techniques:**

**Multiple Imputation by Chained Equations (MICE)**: This is a more sophisticated technique that models each variable with missing values as a function of other variables in a round-robin fashion.

**Deep Learning Methods**: Neural networks, especially autoencoders, can be effective in imputing missing values in complex datasets.
**Time Series Specific Methods:** For time-series data, you might use techniques like interpolation, forward-fill, or backward-fill.

It's important to choose the right method based on the type of data, the pattern of missingness (e.g., at random, completely at random, or not at random), and the amount of missing data. Additionally, it's crucial to understand that imputation can introduce bias or affect the distribution of your data, so it should be done with caution and an understanding of the potential implications.

**1.1. Mean/Median Imputation**

Mean/median imputation replaces missing values with the mean or median of the column. This is a simple and effective method, but it has some limitations. For example, it reduces variance in the dataset, and it can lead to biased estimates if the missing values are not missing at random.

Let's see how to implement mean/median imputation in Python using the Titanic dataset.

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# load the Titanic dataset
data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [18]:
# check the number of missing values in each column
data.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [19]:
# impute missing values with mean
data['age'] = data['age'].fillna(data['age'].mean())

# check the number of missing values in each column
data.isnull().sum().sort_values(ascending=False)

deck           688
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

**1.1.2. Median Imputation**

Let's load the dataset and replace the missing values in the age column with the median of the column:

In [20]:
df = sns.load_dataset('titanic')
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [21]:
# impute missing values with median
df['age'] = df['age'].fillna(df['age'].median())

# check the number of missing values in each column
df.isnull().sum().sort_values(ascending=False)

deck           688
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

**1.2. Mode Imputation**

Mode imputation replaces missing values with the mode (most frequent value) of the column. This is useful for imputing categorical columns, such as Embarked and embark_town in the Titanic dataset.

In [22]:
# load the dataset
df = sns.load_dataset('titanic')

# check the number of missing values in each column
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [23]:
# impute missing values with mode
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# check the number of missing values in each column
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
embark_town      0
alive            0
alone            0
dtype: int64

**2. K-Nearest Neighbors (KNN)**
   
KNN is a machine learning algorithm that can be used for imputing missing values. It works by finding the most similar data points to the one with the missing value based on other available features. The missing value is then imputed with the mean or median of the most similar data points.

In [24]:
# load the dataset
df = sns.load_dataset('titanic')

# check the number of missing values in each column
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [25]:
from sklearn.impute import KNNImputer
imputer=KNNImputer(n_neighbors=4)
df["age"]=imputer.fit_transform(df[["age"]])

df.isnull().sum().sort_values(ascending=False)

deck           688
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

**3. Regression Imputation**

Regression imputation uses a regression model to predict the missing values based on other variables in the dataset. It works well for both categorical and numerical data.

In [26]:
# load the dataset
df = sns.load_dataset('titanic')

# check the number of missing values in each column
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [27]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer=IterativeImputer()
df["age"]=imputer.fit_transform(df[["age"]])
# check the number of missing values in each column
df.isnull().sum().sort_values(ascending=False)

deck           688
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

**4. Random Forests for Imputing Missing Values**
   
Random forests can handle missing values inherently. They can also be used to predict missing values based on the patterns learned from the other data.

In [30]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [32]:
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [141]:
df=sns.load_dataset('titanic')
df.drop("deck", axis=1, inplace=True)
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [143]:
df.isnull().sum().sort_values(ascending=False)

age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [144]:
columns_to_encode =df[["sex","embarked","class","embark_town","who","alive"]]

In [145]:
Label_encoders={}
for col in columns_to_encode:
    le=LabelEncoder()
    df[col]=le.fit_transform(df[col])
    Label_encoders[col]=le

In [146]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,2,0,True


In [147]:
df_with_missing=df[df["age"].isna()]
df_without_missing=df[~df["age"].isna()]

In [148]:
print(f"The shape of missing values data is : ", df_with_missing.shape)
print(f"The shape of data without missing values is : ", df_without_missing.shape)

The shape of missing values data is :  (177, 14)
The shape of data without missing values is :  (714, 14)


In [149]:
df_without_missing.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,2,0,True


In [150]:
# Regression imputation

X = df_without_missing.drop(['age'], axis=1)
y = df_without_missing['age']

In [151]:
X.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,0,0,8.05,2,2,1,True,2,0,True


In [152]:
# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [153]:
from sklearn.tree import DecisionTreeRegressor
# Create a regression model
reg_model = DecisionTreeRegressor()

# Fit the regression model
reg_model.fit(X_train, y_train)


In [154]:
# evaluate the model
y_pred = reg_model.predict(X_test)
print("RMSE for Random Forest Imputation: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score for Random Forest Imputation: ", r2_score(y_test, y_pred))
print("MAE for Random Forest Imputation: ", mean_absolute_error(y_test, y_pred))


RMSE for Random Forest Imputation:  12.068394207808263
R2 Score for Random Forest Imputation:  0.20934780826016208
MAE for Random Forest Imputation:  8.876615334505896


In [156]:
# Predict missing values
y_pred = reg_model.predict(df_with_missing.drop(['age'], axis=1))

In [157]:
# remove warning
import warnings
warnings.filterwarnings('ignore')

# replace the missing values with the predicted values
df_with_missing['age'] = y_pred



In [158]:
# check the missing values
df_with_missing.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [159]:
# concatenate the two dataframes
df_complete = pd.concat([df_with_missing, df_without_missing], axis=0)
# print the shape of the complete dataframe
print("The shape of the complete dataframe is: ", df_complete.shape)

#check the first 5 rows of the complete dataframe
df_complete.head()

The shape of the complete dataframe is:  (891, 14)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,1,21.0,0,0,8.4583,1,2,1,True,1,0,True
17,1,2,1,36.5,0,0,13.0,2,1,1,True,2,1,True
19,1,3,0,16.0,0,0,7.225,0,2,2,False,0,1,True
26,0,3,1,36.833333,0,0,7.225,0,2,1,True,0,0,True
28,1,3,0,19.0,0,0,7.8792,1,2,2,False,1,1,True


In [160]:
for col in columns_to_encode:
    # Retrieve the corresponding LabelEncoder for the column
    le = Label_encoders[col]
    # Inverse transform the data
    df_complete[col] = le.inverse_transform(df[col])  
# check the first 5 rows of the complete dataframe
df_complete.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,male,21.0,0,0,8.4583,S,Third,man,True,Southampton,no,True
17,1,2,female,36.5,0,0,13.0,C,First,woman,True,Cherbourg,yes,True
19,1,3,female,16.0,0,0,7.225,S,Third,woman,False,Southampton,yes,True
26,0,3,female,36.833333,0,0,7.225,S,First,woman,True,Southampton,yes,True
28,1,3,male,19.0,0,0,7.8792,S,Third,man,False,Southampton,no,True


In [None]:
df_complete.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,1,21.0,0,0,8.4583,2,2,1,True,,2,0,True
17,1,2,0,36.5,0,0,13.0,0,0,2,True,,0,1,True
19,1,3,0,16.0,0,0,7.225,2,2,2,False,,2,1,True
26,0,3,0,36.833333,0,0,7.225,2,0,2,True,,2,1,True
28,1,3,1,19.0,0,0,7.8792,2,2,1,False,,2,0,True


# 5. Advanced Techniques

**5.1. Multiple Imputation by Chained Equations (MICE)**

Multiple Imputation by Chained Equations (MICE) is a more sophisticated technique that models each variable with missing values as a function of other variables in a round-robin fashion. It works well for both categorical and numerical data.

To demonstrate Multiple Imputation by Chained Equations (MICE) in Python, we can use the IterativeImputer class from the sklearn.impute module. MICE is a sophisticated method of imputation that models each feature with missing values as a function of other features, and it uses that estimate for imputation. It does this in a round-robin fashion: each feature is modeled in turn. The MICE algorithm is implemented in the IterativeImputer class.

In [161]:
# imoprt libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# laod the dataset
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [162]:
# check the missing values
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [163]:
from sklearn.preprocessing import LabelEncoder

# create a LabelEncoder object using LabelEncoder() in for loop for categorical columns
# Columns to encode
columns_to_encode = ['sex', 'embarked', 'who', 'deck', 'class', 'embark_town', 'alive']

# Dictionary to store LabelEncoders for each column
label_encoders = {}

# Loop to apply LabelEncoder to each column for encoding
for col in columns_to_encode:
    # Create a new LabelEncoder for the column
    le = LabelEncoder()
    # Fit and transform the data
    df[col] = le.fit_transform(df[col])
    # Store the encoder in the dictionary
    label_encoders[col] = le
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,7,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,2,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,7,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,7,2,0,True


In [164]:
# impute the missing values with IterativeImputer
# call the IterativeImputer class with max_iter = 10
imputer = IterativeImputer(max_iter=10)

#impute missing values using IterativeImputer in a for loop for age, embark_town,embarked columns and deck

# Columns to impute
columns_to_impute = ['age', 'embark_town', 'embarked', 'deck']

# Loop to impute each column
for col in columns_to_impute:
    df[col] = imputer.fit_transform(df[[col]])    
# check the missing values
df.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [165]:
# Inverse transform for encoded columns
for col in columns_to_encode:
    # Retrieve the corresponding LabelEncoder for the column
    le = label_encoders[col]
    # Inverse transform the data and convert to integer type
    df[col] = le.inverse_transform(df[col].astype(int))

df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Important Models

| **Category**                          | **Algorithms**                                                                                                                                    |
|---------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
| **Supervised Learning**               | Linear Regression, Multiple Linear Regression, Polynomial Regression, Ridge Regression, Lasso Regression, Elastic Net, Logistic Regression, Decision Trees, Random Forest, Gradient Boosting, XGBoost, LightGBM, CatBoost, Naive Bayes, Support Vector Machines (SVM), k-Nearest Neighbors (k-NN), Linear Discriminant Analysis, Quadratic Discriminant Analysis |
| **Unsupervised Learning**             | K-Means Clustering, Hierarchical Clustering, DBSCAN, Principal Component Analysis (PCA), Singular Value Decomposition (SVD), Gaussian Mixture Models, Self-Organizing Maps |
| **Reinforcement Learning**            | Q-Learning, SARSA (State-Action-Reward-State-Action), Deep Q Network (DQN), Policy Gradients, Actor-Critic Methods, Monte Carlo Methods |
| **Neural Networks and Deep Learning** | Perceptron, Multi-layer Perceptrons (MLP), Convolutional Neural Networks (CNNs), Recurrent Neural Networks (RNNs), Long Short-Term Memory Networks (LSTMs), Generative Adversarial Networks (GANs), Transformer Networks, Autoencoders |
| **Ensemble Methods**                  | Bagging, Boosting, Stacking, Random Forest, Gradient Boosting Machines (GBM), AdaBoost, XGBoost, LightGBM, CatBoost |
| **Dimensionality Reduction**          | Principal Component Analysis (PCA), Linear Discriminant Analysis (LDA), t-Distributed Stochastic Neighbor Embedding (t-SNE), Uniform Manifold Approximation and Projection (UMAP) |