# IMPORT LIBS

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IMPORT DATASET

In [32]:
train_set_train = pd.read_csv('../data/cleaned/df_train_cleaned.csv')
train_set_test = pd.read_csv('../data/cleaned/df_test_cleaned.csv')
test_set = pd.read_csv('../data/cleaned/test_cleaned.csv')

In [33]:
print("Train set shape:", train_set_train.shape)
print("Test Train set shape:", train_set_test.shape)
print("Test set shape:", test_set.shape)

Train set shape: (934, 75)
Test Train set shape: (234, 75)
Test set shape: (1459, 74)


In [34]:
def separate_features_by_type(df):
    numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    
    return numerical_cols, categorical_cols

In [35]:
# MSSubClass represent a categorical feature but was encoded as numerical, we will leave as so
num_train,cat_train = separate_features_by_type(train_set_train)
num_test,cat_test = separate_features_by_type(train_set_test)
num, cat = separate_features_by_type(test_set)

In [36]:
print(len(num_train), "numerical features in train set")
print(len(cat_train), "categorical features in train set")
print(len(num_test), "numerical features in test set")
print(len(cat_test), "categorical features in test set")
print(len(num), "numerical features in test set")
print(len(cat), "categorical features in test set")

38 numerical features in train set
37 categorical features in train set
38 numerical features in test set
37 categorical features in test set
37 numerical features in test set
37 categorical features in test set


In [None]:
train_set_train[num_train]

In [38]:
def count_zeros_in_columns(df):
    zero_counts = {}
    for col in df.columns:
        if (df[col] == 0).sum() > 0:
            zero_counts[col] = (df[col] == 0).sum()
    
    return zero_counts

In [None]:
# Assuming you've already defined count_zeros_in_columns()

# Count zeros in each dataset
zero_columns_testset = count_zeros_in_columns(test_set[num])
zero_columns_train = count_zeros_in_columns(train_set_train[num_train])
zero_columns_test = count_zeros_in_columns(train_set_test[num_test])

# Print results with labels
print("🔍 Zero Values in Test Set:")
for col, count in zero_columns_testset.items():
    print(f"{col}: {count}")

print("\n🔍 Zero Values in Train Set (Train Split):")
for col, count in zero_columns_train.items():
    print(f"{col}: {count}")

print("\n🔍 Zero Values in Train Set (Test Split):")
for col, count in zero_columns_test.items():
    print(f"{col}: {count}")


In [None]:
# Step 2: Define the threshold (70% of total rows)
threshold = 0.7 * len(train_set_train)

# Step 3: Find columns with more than 70% zeros
high_zero_cols = [col for col, count in zero_columns_train.items() if count > threshold]

# Step 4: Drop these columns from the training DataFrame
train_set_train.drop(columns=high_zero_cols, inplace=True)
train_set_test.drop(columns=high_zero_cols, inplace=True)
test_set.drop(columns=high_zero_cols, inplace=True)

# Step 5: Remove (pop) those columns from num_train list
# Applying to other dataset too
num_train = [col for col in num_train if col not in high_zero_cols]
num_test = [col for col in num_train if col not in high_zero_cols]
num  = [col for col in num_train if col not in high_zero_cols]



Columns with >70% zeros removed: ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
New shape of train_set_train: (934, 67)
Updated num_train list length: 30


In [47]:
def output_result() : 
    # Output results
    print("Columns with >70% zeros removed:", high_zero_cols)
    print("New shape of train_set_train:", train_set_train.shape)
    print("New shape of train_set_test:", train_set_test.shape)
    print("New shape of test_set:", test_set.shape)
    print("Updated num_train list length:", len(num_train))
    print("Updated num_test list length:", len(num_test))
    print("Updated num list length:", len(num))

In [48]:
output_result()

Columns with >70% zeros removed: ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
New shape of train_set_train: (934, 67)
New shape of train_set_test: (234, 67)
New shape of test_set: (1459, 66)
Updated num_train list length: 30
Updated num_test list length: 30
Updated num list length: 30


### Dropping Strongly Correlated Pairs
We discovered them from eda

In [45]:
strongly_correlated_pairs = [
    ("GarageCars", "GarageArea"),
    ("TotalBsmtSF", "1stFlrSF"),
    ("GrLivArea", "TotRmsAbvGrd"),
    ("YearBuilt", "GarageYrBlt"),
]

In [46]:
def drop_strongly_correlated(df, strongly_correlated_pairs, num_train):
    for col1, col2 in strongly_correlated_pairs:
        if col1 in df.columns and col2 in df.columns:
            print(f"Dropping '{col2}' because it's strongly correlated with '{col1}'")
            df.drop(columns=[col2], inplace=True)
            if col2 in num_train:
                num_train.remove(col2)
    return df, num_train


In [49]:
train_set_train, num_train = drop_strongly_correlated(train_set_train, strongly_correlated_pairs, num_train)
train_set_test, num_test = drop_strongly_correlated(train_set_test, strongly_correlated_pairs, num_test)
test_set, num = drop_strongly_correlated(test_set, strongly_correlated_pairs, num)

Dropping 'GarageArea' because it's strongly correlated with 'GarageCars'
Dropping '1stFlrSF' because it's strongly correlated with 'TotalBsmtSF'
Dropping 'TotRmsAbvGrd' because it's strongly correlated with 'GrLivArea'
Dropping 'GarageYrBlt' because it's strongly correlated with 'YearBuilt'
Dropping 'GarageArea' because it's strongly correlated with 'GarageCars'
Dropping '1stFlrSF' because it's strongly correlated with 'TotalBsmtSF'
Dropping 'TotRmsAbvGrd' because it's strongly correlated with 'GrLivArea'
Dropping 'GarageYrBlt' because it's strongly correlated with 'YearBuilt'
Dropping 'GarageArea' because it's strongly correlated with 'GarageCars'
Dropping '1stFlrSF' because it's strongly correlated with 'TotalBsmtSF'
Dropping 'TotRmsAbvGrd' because it's strongly correlated with 'GrLivArea'
Dropping 'GarageYrBlt' because it's strongly correlated with 'YearBuilt'


In [50]:
output_result()

Columns with >70% zeros removed: ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
New shape of train_set_train: (934, 63)
New shape of train_set_test: (234, 63)
New shape of test_set: (1459, 62)
Updated num_train list length: 26
Updated num_test list length: 26
Updated num list length: 26


In [61]:
train_set_train.to_csv("../data/feature_eng/train.csv", index=False)
train_set_test.to_csv("../data/feature_eng/test.csv", index=False)
test_set.to_csv("../data/feature_eng/sub.csv", index=False)