# **Data Cleaning Notebook**

### Objectives

* Evaluate missing data.
* Clean data.

### Inputs

* outputs/datasets/collection/house-prices.csv (the collected dataset loaded by this notebook)
* inputs/datasets/raw/house-price-20211124T154130Z-001/house-price/house_prices_records.csv
* inputs/datasets/raw/house-price-20211124T154130Z-001/house-price/inherited_houses.csv

### Outputs

* Train set: outputs/datasets/cleaned/TrainSetCleaned.csv 
* Test set: outputs/datasets/cleaned/TestSetCleaned.csv

### Conclusions 

* Data Cleaning Pipeline.

### Additional Comments

* This file and its contents were inspired by and adapted from the Churnometer Walkthrough Project 2.  

---

### Change working directory

* We access the current directory with os.getcwd()

In [None]:
import os
# Capture the directory the notebook is currently running in; a later cell
# uses this value to switch to its parent directory.
current_dir = os.getcwd()
current_dir  # bare expression so the notebook displays the path

We want to make the parent of the current directory the new current directory

* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Step up one level: the parent of the directory stored in `current_dir`
# becomes the working directory.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory after the change and display it for confirmation.
current_dir = os.getcwd()
current_dir  # bare expression so the notebook displays the path

---

### Load Collected Data

In [None]:
import pandas as pd

# Load the collected house-price dataset and preview its first three rows.
df = pd.read_csv("outputs/datasets/collection/house-prices.csv")
df.head(3)

### Data Exploration

* Identifying Columns with Missing Data:

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = df.isna().any()
vars_with_missing_data = df.columns[has_missing].to_list()
vars_with_missing_data

In [None]:
from ydata_profiling import ProfileReport

# Profile only the columns that have gaps; print a note when there are none.
if not vars_with_missing_data:
    print("There are no variables with missing data")
else:
    profile = ProfileReport(df=df[vars_with_missing_data], minimal=True)
    profile.to_notebook_iframe()

### Correlation and PPS Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore as pps
%matplotlib inline

# Function to generate a heatmap based on correlation

def heatmap_corr(df, threshold, figsize=(20, 12), font_annot=8):
    """Draw an annotated heatmap of a correlation matrix.

    The upper triangle (diagonal included) and every cell whose absolute
    value is below ``threshold`` are hidden. Nothing is drawn when ``df``
    has one column or fewer.

    Args:
        df: Matrix of scores to plot (presumably square, e.g. the output
            of ``DataFrame.corr()``).
        threshold: Cells with ``abs(value) < threshold`` are masked.
        figsize: Figure size passed to ``plt.subplots``.
        font_annot: Font size of the cell annotations.
    """
    if len(df.columns) <= 1:
        return

    hidden = np.zeros_like(df, dtype=bool)
    # Hide the upper triangle so each pair of variables appears only once.
    hidden[np.triu_indices_from(hidden)] = True
    # Hide weak relationships.
    hidden[abs(df) < threshold] = True

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        df,
        annot=True,
        xticklabels=True,
        yticklabels=True,
        mask=hidden,
        cmap='viridis',
        annot_kws={"size": font_annot},
        ax=ax,
        linewidth=0.5,
    )
    ax.set_yticklabels(df.columns, rotation=0)
    # Explicit y-limits: one unit per column, first row at the top.
    plt.ylim(len(df.columns), 0)
    plt.show()

# Function to generate a heatmap based on PPS

def heatmap_pps(df, threshold, figsize=(20, 12), font_annot=8):
    """Draw an annotated heatmap of a PPS matrix.

    Every cell whose absolute value is below ``threshold`` is hidden.
    Nothing is drawn when ``df`` has one column or fewer.

    Args:
        df: Matrix of scores to plot.
        threshold: Cells with ``abs(value) < threshold`` are masked.
        figsize: Figure size passed to ``plt.subplots``.
        font_annot: Font size of the cell annotations.
    """
    if len(df.columns) > 1:
        mask = np.zeros_like(df, dtype=bool)
        mask[abs(df) < threshold] = True
        fig, ax = plt.subplots(figsize=figsize)
        # Pass the axes explicitly (as heatmap_corr does) instead of relying
        # on whichever axes happen to be current.
        sns.heatmap(df, annot=True, xticklabels=True, yticklabels=True,
                    mask=mask, cmap='rocket_r', annot_kws={"size": font_annot},
                    ax=ax, linewidth=0.05, linecolor='grey')
        # Explicit y-limits: one unit per column, first row at the top.
        plt.ylim(len(df.columns), 0)
        plt.show()

# Function to calculate both correlation and PPS (Predictive Power Score)

def CalculateCorrAndPPS(df):
    """Compute Pearson and Spearman correlations and the PPS matrix of ``df``.

    Also prints descriptive statistics of the PPS scores below 1, as a
    guide for choosing the heatmap threshold.

    Args:
        df: DataFrame whose columns are compared pairwise.

    Returns:
        Tuple ``(df_corr_pearson, df_corr_spearman, pps_matrix)``. The PPS
        matrix is pivoted with ``x`` as columns and ``y`` as index.
    """
    pearson = df.corr(method="pearson")
    spearman = df.corr(method="spearman")

    # Long-format PPS result, one row per (x, y) pair.
    pps_long = pps.matrix(df)
    pps_wide = (
        pps_long
        .filter(['x', 'y', 'ppscore'])
        .pivot(columns='x', index='y', values='ppscore')
    )

    # Scores equal to 1 are left out of the statistics.
    below_one = pps_long.query("ppscore < 1")
    score_summary = below_one.filter(['ppscore']).describe().T
    print("PPS threshold - check PPS score IQR to decide threshold for heatmap \n")
    print(score_summary.round(3))

    return pearson, spearman, pps_wide

# Function to display all three heatmaps: Spearman, Pearson, PPS

def DisplayCorrAndPPS(df_corr_pearson, df_corr_spearman, pps_matrix, CorrThreshold, PPS_Threshold,
                      figsize=(20, 12), font_annot=8):
    """Print guidance text and show the Spearman, Pearson and PPS heatmaps.

    Args:
        df_corr_pearson: Pearson correlation matrix.
        df_corr_spearman: Spearman correlation matrix.
        pps_matrix: PPS matrix.
        CorrThreshold: Threshold passed to ``heatmap_corr`` for both
            correlation heatmaps.
        PPS_Threshold: Threshold passed to ``heatmap_pps``.
        figsize: Figure size forwarded to every heatmap.
        font_annot: Annotation font size forwarded to every heatmap.
    """
    print("\n")
    print("* Analyse how the target variable for your ML models are correlated with other variables (features and target)")
    print("* Analyse multi-colinearity, that is, how the features are correlated among themselves")

    # Both correlation heatmaps share the same layout; Spearman is shown first.
    corr_sections = (
        ("*** Heatmap: Spearman Correlation ***",
         "It evaluates monotonic relationship \n",
         df_corr_spearman),
        ("*** Heatmap: Pearson Correlation ***",
         "It evaluates the linear relationship between two continuous variables \n",
         df_corr_pearson),
    )
    for title, description, corr_matrix in corr_sections:
        print("\n")
        print(title)
        print(description)
        heatmap_corr(df=corr_matrix, threshold=CorrThreshold, figsize=figsize, font_annot=font_annot)

    print("\n")
    print("*** Heatmap: Power Predictive Score (PPS) ***")
    print("PPS detects linear or non-linear relationships between two columns.\n"
          "The score ranges from 0 (no predictive power) to 1 (perfect predictive power) \n")
    heatmap_pps(df=pps_matrix, threshold=PPS_Threshold, figsize=figsize, font_annot=font_annot)



We first identify all categorical columns in the dataset using df.select_dtypes() and print them out. Then, we apply pd.get_dummies() to convert specific categorical columns ('BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual') into numerical one-hot encoded columns, while dropping the first category to avoid the dummy variable trap.

In [None]:
# List the object-dtype columns present in the dataset.
categorical_cols = df.select_dtypes(include=['object']).columns
print(categorical_cols)

# One-hot encode the selected columns; drop_first removes the first
# category of each so the dummy columns are not redundant.
columns_to_encode = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)


Calculate Correlations and Predictive Power Score (PPS)

In [None]:
# Compute the Pearson and Spearman correlation matrices and the PPS matrix
# on the one-hot encoded data.
df_corr_pearson, df_corr_spearman, pps_matrix = CalculateCorrAndPPS(df_encoded)

Display Heatmaps

In [None]:
# Display the correlation and PPS heatmaps
# Render the Spearman, Pearson and PPS heatmaps from the matrices computed in the previous cell
DisplayCorrAndPPS(df_corr_pearson=df_corr_pearson,
                  df_corr_spearman=df_corr_spearman,
                  pps_matrix=pps_matrix,
                  CorrThreshold=0.4,  # correlations with absolute value below this are masked out
                  PPS_Threshold=0.2,   # PPS scores below this are masked out
                  figsize=(12, 10),    # figure size used for each heatmap
                  font_annot=10)       # font size of the cell annotations


### Data Cleaning

### Assessing Missing Data Levels

* Custom function to display missing data levels in a DataFrame. It shows the absolute level (number of rows), the relative level (percentage of the dataset) and the data type of each affected column.

In [None]:
def EvaluateMissingData(df):
    """Summarise the columns of ``df`` that contain missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name, with one row per column that has
        at least one missing value, sorted by ``PercentageOfDataset`` in
        descending order. Columns: ``RowsWithMissingData`` (count),
        ``PercentageOfDataset`` (percentage rounded to 2 decimals) and
        ``DataType`` (the column's dtype).
    """
    missing_data_absolute = df.isnull().sum()
    missing_data_percentage = round(missing_data_absolute / len(df) * 100, 2)
    df_missing_data = (
        pd.DataFrame(
            data={"RowsWithMissingData": missing_data_absolute,
                  "PercentageOfDataset": missing_data_percentage,
                  "DataType": df.dtypes}
        )
        .sort_values(by=['PercentageOfDataset'], ascending=False)
        # Filter on the raw count, not the rounded percentage: a column with
        # very few gaps in a large dataset rounds to 0.00 % and would
        # otherwise be dropped from the report.
        .query("RowsWithMissingData > 0")
    )

    return df_missing_data

Check missing data levels for the collected dataset.

In [24]:
# Report missing-data levels for the full collected dataset (before any split).
EvaluateMissingData(df)

Unnamed: 0,RowsWithMissingData,PercentageOfDataset,DataType
EnclosedPorch,1324,90.68,float64
WoodDeckSF,1305,89.38,float64
LotFrontage,259,17.74,float64
GarageFinish,235,16.1,object
BsmtFinType1,145,9.93,object
BedroomAbvGr,99,6.78,float64
2ndFlrSF,86,5.89,float64
GarageYrBlt,81,5.55,float64
BsmtExposure,38,2.6,object
MasVnrArea,8,0.55,float64


### Data Cleaning Spreadsheet Summary

| Column          | Type of Cleaning                | Description of Action                                                   | Justification                          |
|-----------------|---------------------------------|-------------------------------------------------------------------------|----------------------------------------|
| `Working Directory` | Change Working Directory     | Set current directory to project root                                  | Ensure all file paths are relative and consistent |
| `house-prices.csv`  | Load Data                    | Loaded data from CSV using Pandas                                      | Begin data exploration and processing  |
| `Missing Values`    | Identify Missing Values      | Identified columns with missing values using `.isna()`                 | Understand data quality and plan for handling missing data |
| `BsmtExposure`, `BsmtFinType1`, `GarageFinish`, `KitchenQual` | One-Hot Encoding | Converted categorical columns to numerical representation using `pd.get_dummies()` | Machine Learning models require numerical data  |
| `Pearson and Spearman Correlation` | Calculate Correlations | Used `.corr()` to calculate Pearson and Spearman correlations          | Understand relationships between features for feature selection |
| `PPS Matrix`     | Calculate PPS                  | Used `ppscore.matrix()` to evaluate predictive power between features  | Detect both linear and non-linear relationships |


### Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Only the full rows are needed at this stage, so split the DataFrame on its
# own instead of also splitting the target into throwaway variables (assigning
# to `_` additionally overwrites IPython's last-output variable).
# random_state=0 keeps the 80/20 split reproducible.
TrainSet, TestSet = train_test_split(df, test_size=0.2, random_state=0)

print(f"TrainSet shape: {TrainSet.shape} \nTestSet shape: {TestSet.shape}")

In [None]:
# Summarise the missing data found in the training split.
df_missing_data = EvaluateMissingData(TrainSet)
n_vars_missing = len(df_missing_data)
print(f"* There are {n_vars_missing} variables with missing data \n")
df_missing_data

### Drop Variables

First, we identify the variables with more than 80% missing data:

In [None]:
threshold = 0.8
# Fraction of missing values per column, measured on the training split.
missing_fraction = TrainSet.isna().mean()
variables_to_drop = missing_fraction[missing_fraction > threshold].index.tolist()

# The percentage in the message is derived from `threshold`, so the text and
# the filter cannot drift apart if the threshold is changed.
print(f"Variables with more than {threshold:.0%} missing data: {variables_to_drop}")

### Dropping Variables with High Missing Values
After analyzing the dataset, we found that the following variables had more than 80% missing data:
- `EnclosedPorch`
- `WoodDeckSF`

These variables are unlikely to add significant value to our model due to the high proportion of missing data. Therefore, we decided to drop them from both the training and test datasets.

We then re-evaluate the dataset to check whether any variables still have missing data, and proceed accordingly.

In [None]:
variables_to_drop = ['EnclosedPorch', 'WoodDeckSF']

# Remove the same columns from both splits so their columns stay aligned.
TrainSet = TrainSet.drop(columns=variables_to_drop)
TestSet = TestSet.drop(columns=variables_to_drop)

print(f"* {len(variables_to_drop)} variables have been dropped: {variables_to_drop}")

# Per-column missing counts in the training split, and the subset still above zero.
df_missing_data = TrainSet.isna().sum()
still_missing = df_missing_data[df_missing_data > 0]
print(f"* There are still {len(still_missing)} variables with missing data \n")

if still_missing.empty:
    print("No variables with missing data remaining.")
else:
    print("Remaining variables with missing data:\n", still_missing)

### Handling Missing Data
After dropping columns with over 80% missing values, we still identified several columns with missing data:
- Numerical Columns: `2ndFlrSF`, `BedroomAbvGr`, `GarageYrBlt`, `LotFrontage`, `MasVnrArea`
- Categorical Columns: `BsmtExposure`, `BsmtFinType1`, `GarageFinish`

To handle the missing data, we decided to:
- Impute numerical columns using the median value, as it is less affected by outliers compared to the mean.
- Impute categorical columns using the most frequent value (mode), ensuring that the data remains consistent.

After performing imputations, we rechecked for missing data to ensure no missing values remain.


In [None]:
from sklearn.impute import SimpleImputer

# Numerical gaps are filled with the median. The imputer is fitted on the
# training split only and then applied unchanged to the test split.
numerical_cols = ['2ndFlrSF', 'BedroomAbvGr', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea']
num_imputer = SimpleImputer(strategy='median')
TrainSet[numerical_cols] = num_imputer.fit_transform(TrainSet[numerical_cols])
TestSet[numerical_cols] = num_imputer.transform(TestSet[numerical_cols])

# Categorical gaps are filled with the most frequent value, again fitted on
# the training split only.
categorical_cols = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish']
cat_imputer = SimpleImputer(strategy='most_frequent')
TrainSet[categorical_cols] = cat_imputer.fit_transform(TrainSet[categorical_cols])
TestSet[categorical_cols] = cat_imputer.transform(TestSet[categorical_cols])

# Re-check the training split.
missing_data_after_imputation = TrainSet.isna().sum()
remaining_train = missing_data_after_imputation[missing_data_after_imputation > 0]
print(f"* After imputation, there are still {remaining_train.shape[0]} variables with missing data \n")

if remaining_train.empty:
    print("No variables with missing data remaining.")
else:
    print("Remaining variables with missing data:\n", remaining_train)

# Only the columns listed above are imputed, so the test split can still hold
# gaps in any other column. Report them instead of letting them reach the
# saved file unnoticed.
test_missing = TestSet.isna().sum()
remaining_test = test_missing[test_missing > 0]
if not remaining_test.empty:
    print("Warning: TestSet still has variables with missing data:\n", remaining_test)


### Push cleaned data to Repo

In [None]:
import os

# Create the outputs/datasets/cleaned folder. exist_ok=True makes re-running
# this cell a no-op when the folder already exists, while genuine failures
# (e.g. missing permissions) are raised here instead of being printed and
# resurfacing later as a confusing error when the CSV files are written.
os.makedirs(name='outputs/datasets/cleaned', exist_ok=True)

### Train Set

In [None]:
# Save the cleaned training split as CSV, without writing the DataFrame index.
TrainSet.to_csv("outputs/datasets/cleaned/TrainSetCleaned.csv", index=False)

### Test Set

In [None]:
# Save the cleaned test split as CSV, without writing the DataFrame index.
TestSet.to_csv("outputs/datasets/cleaned/TestSetCleaned.csv", index=False)