In [None]:
# To handle the data
import pandas as pd
import numpy as np

# To Visualize the data
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
import seaborn as sns
%matplotlib inline

# To handle warnings during the execution of the code
import warnings
warnings.filterwarnings("ignore")


# Importing the dataset for analysis


In [None]:
# Read the raw training set from disk and preview its first five rows
data = pd.read_csv("../data/raw/train.csv")
data.head()

# Exploratory Data Analysis

In [None]:
# Check the shape of the dataset: a (number of rows, number of columns) tuple
data.shape

In [None]:
# Overview of the dataset: number of rows, column names, data types and non-null counts per column
data.info()

In [None]:
# List the column labels of the dataset
data.columns

In [None]:
# Drop the "Id" column. Rebinding `data` (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after "Id" is
# already gone no longer raises a KeyError.
data = data.drop(columns=["Id"], errors="ignore")

In [None]:


# Share of missing entries per column, expressed as a percentage
missing_percentage = data.isna().mean().mul(100)

# Retain only the columns that actually have missing entries
missing_percentage = missing_percentage.loc[missing_percentage.gt(0)]

# Show the result, largest share first
print(missing_percentage.sort_values(ascending=False))


In [None]:
# Horizontal bar chart of the missing-value percentage for each affected column
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=missing_percentage.values,
    y=missing_percentage.index,
    palette="viridis",
    ax=ax,
)
ax.set_xlabel("Percentage of Missing Values")
ax.set_ylabel("Columns")
ax.set_title("Missing Value Percentage per Column")
fig.tight_layout()
plt.show()

## observation from the above information
* - The dataset contains 81 columns and 1460 rows
* - There are 35 integers, 3 floats and 43 objects in the dataset
* - Columns like MiscFeature, Fence, PoolQC, FireplaceQu, Alley, MasVnrType and LotFrontage have a high percentage of NaN values
* - Important numerical features like LotFrontage, GarageYrBlt, and MasVnrArea also contain some missing values.
* - Id is just a unique identifier and not useful for analysis.
* - Some columns have incorrect data types.
* - OverallQual, OverallCond are ordinal variables, but currently treated as integers.
* - MSSubClass represents building category, so it should be categorical, not integer

<h2>Analysing and cleaning each column of the dataset</h2>

In [None]:
# Summary statistics from describe(), transposed so that each feature is a row
data.describe().T

In [None]:
# Skewness of every numeric feature, ordered from the largest value to the smallest
skewness = (
    data
    .select_dtypes(include="number")
    .skew()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the skewness of each numerical feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=skewness.values, y=skewness.index, palette='coolwarm', ax=ax)
ax.set_title("Skewness of Numerical Features")
ax.set_xlabel("Skewness")
ax.set_ylabel("Features")
# Dashed reference line at zero skew (a symmetric distribution)
ax.axvline(x=0, color='black', linestyle='--')
fig.tight_layout()
plt.show()

### Categorical Analysis

In [None]:
# Split the columns by dtype family.
# Categorical columns: object and category dtypes
categorical_cols = data.select_dtypes(include=["object", "category"]).columns.tolist()

# Numerical columns: every numeric dtype. include="number" matches the selection
# used for the skewness computation and does not silently skip numeric columns
# stored with a width other than 64 bits (e.g. int32 / float32).
numerical_cols = data.select_dtypes(include="number").columns.tolist()

# Explicit copies of each subset, so later edits to them never touch `data`
data_categorical = data[categorical_cols].copy()
data_numerical = data[numerical_cols].copy()


In [None]:
# Number of missing entries in each categorical column
missing_categorical = data_categorical.isna().sum()

# Report only the columns where at least one entry is missing
print("Missing value in categorical columns: \n", missing_categorical.loc[missing_categorical.gt(0)])

In [None]:
# Percentage of missing values in the categorical columns
# Row count of the categorical subset
total_rows = data_categorical.shape[0]

# Convert the missing counts into percentages of the row count
missing_categorical_percent = missing_categorical.div(total_rows).mul(100)

# Show the percentages, largest first, rounded to two decimals
print("Missing percentage in categorical columns:\n")
ranked_percent = missing_categorical_percent.sort_values(ascending=False)
print(ranked_percent.round(2))

In [None]:
# Build one colour per bar from the "plasma" colormap, scaled to the data range.
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("plasma")  # You can change this to 'viridis', 'magma', etc.
norm = plt.Normalize(missing_categorical_percent.min(), missing_categorical_percent.max())
colors = [cmap(norm(val)) for val in missing_categorical_percent.values]

# Horizontal bar chart of the missing-value percentage per categorical column
plt.figure(figsize=(10, 6))
sns.barplot(
    x=missing_categorical_percent.values,
    y=missing_categorical_percent.index,
    palette=colors  # Apply gradient colors
)

plt.xlabel("Percentage of Missing Values")
plt.ylabel("Categorical Columns")
plt.title("Missing Value Percentage in Categorical Columns")
plt.xlim(0, 100)  # fixed 0-100 % axis
plt.tight_layout()
plt.show()

In [None]:
# Categorical columns removed from the dataset
categorical_cols_to_drop = [
    "MiscFeature", "Fence", "PoolQC", "FireplaceQu", "Alley"
]

# Rebinding `data` (instead of inplace=True) with errors="ignore" keeps this
# cell idempotent: re-running it once the columns are already gone no longer
# raises a KeyError.
data = data.drop(columns=categorical_cols_to_drop, errors="ignore")

In [None]:
# Column labels remaining in `data` after the drop
data.columns

In [None]:
def update_categorical_data(data):
    """Return an independent copy of the categorical columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extract the columns from; it is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Copy holding only the columns with ``object`` or ``category`` dtype.
    """
    categorical_dtypes = ["object", "category"]
    categorical_only = data.select_dtypes(include=categorical_dtypes)
    return categorical_only.copy()

In [None]:
# Columns of the categorical subset. NOTE(review): data_categorical has not been
# refreshed since the high-missing columns were dropped from `data`, so it
# presumably still lists them — confirm whether a refresh is wanted here.
data_categorical.columns

In [None]:
# Count the missing entries in MasVnrType
data_categorical["MasVnrType"].isnull().sum()

In [None]:
# Distinct values present in MasVnrType
data_categorical["MasVnrType"].unique()

In [None]:
# Note: the MasVnrType column has a high number of missing values and only a few unique categories,
# so the missing values are replaced with the string "None".
# Note: "None" here simply means the house has no masonry veneer.
data["MasVnrType"] = data["MasVnrType"].fillna("None")

In [None]:
# How many Electrical entries are missing?
data_categorical["Electrical"].isna().sum()

In [None]:
# Distinct values present in Electrical
data_categorical["Electrical"].unique()

In [None]:
# Fill the missing Electrical entries with the most frequent value (the mode).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on data["Electrical"] is chained assignment, which emits a FutureWarning in
# pandas 2.x and no longer updates `data` once Copy-on-Write is in effect.
data["Electrical"] = data["Electrical"].fillna(data["Electrical"].mode()[0])

In [None]:
# Garage-related categorical columns whose missing entries are filled with "None"
garage_cols = [
    "GarageType", "GarageFinish", "GarageQual", "GarageCond"
]

# Fill all garage columns at once and assign the result back. A per-column
# data[col].fillna(..., inplace=True) is chained assignment: it warns in
# pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data[garage_cols] = data[garage_cols].fillna("None")

In [None]:
# Basement-related categorical columns whose missing entries are filled with "None"
Bsmt_cols = [
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"
]

# Fill all basement columns at once and assign the result back. A per-column
# data[col].fillna(..., inplace=True) is chained assignment: it warns in
# pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data[Bsmt_cols] = data[Bsmt_cols].fillna("None")

In [None]:
# update_categorical_data (defined earlier) returns only the object/category columns.
# Re-derive data_categorical from `data` so that it reflects the updates made to the main data.
data_categorical = update_categorical_data(data)

In [None]:
# Column labels of the refreshed categorical subset
data_categorical.columns

In [None]:
# Missing-value count per column of the categorical subset
data_categorical.isnull().sum()

In [None]:
# Missing-value count per categorical column, computed directly on the main data.
# Zeros here mean the missing values in the categorical columns have been handled,
# and agreement with the data_categorical counts means that subset is in sync with the main data.
data.select_dtypes(include=["object", "category"]).isnull().sum()

In [None]:
# visualization of the null value of the categorical columns using heatmap 

In [None]:
# Heatmap of the null mask of the categorical columns (one cell per row/column pair)
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(
    data_categorical.isnull(),
    cbar=False,
    cmap="viridis",
    yticklabels=False,
    ax=ax,
)
ax.set_title("Missing Values in Categorical Columns (Heatmap)")
ax.set_xlabel("Categorical Columns")
plt.show()

### Numerical columns

In [None]:
# Missing-value count per column of the numerical subset
data_numerical.isnull().sum()

In [None]:
# Dtypes and non-null counts of the numerical subset
data_numerical.info()

In [40]:
# Names of the numerical columns holding at least one null value
has_null = data_numerical.isnull().any()
columns_with_null = data_numerical.columns[has_null].tolist()

# Report them under a header line
print("Columns with missing values:", columns_with_null, sep="\n")


Columns with missing values:
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


In [None]:
# Columns of the numerical subset that contain at least one missing value
columns_with_null = data_numerical.columns[data_numerical.isnull().any()]

# One distinct colour per column from the "husl" palette (other palette names work too)
colors = sns.color_palette("husl", len(columns_with_null))

# Draw a separate boxplot figure for every column
for idx, col in enumerate(columns_with_null):
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.boxplot(y=data_numerical[col], color=colors[idx], ax=ax)
    ax.set_title(f"Boxplot of {col} (with missing values)")
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()


In [None]:
for col in columns_with_null:
    print(f"Column {col}")