#### Import Libraries and Load Data

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting: matplotlib's 'default' style sheet and the reversed
# "Blues" seaborn palette are applied to every figure drawn below.
plt.style.use('default')
sns.set_palette("Blues_r")
# IPython magic (not plain Python): draw figures inline under the cell.
%matplotlib inline

# Display options for better output: each pandas display option below is
# switched from its fixed default to None so wide frames and long cell
# contents are not cut off when displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

#### Load the Data

In [None]:
# Read the two CSV splits that live in the shared ../data directory
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Report (rows, columns) of each frame as a quick sanity check on the load
print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

#### Show data

In [None]:
print('First 5 rows of training data')
# Row count spelled out so it visibly matches the caption printed above
train_df.head(n=5)

#### Basic Dataset Information

In [None]:
# Headline numbers: column count of the training frame and row counts of both splits
print(
    "Dataset Info:",
    f"Number of features: {len(train_df.columns)}",
    f"Number of training samples: {len(train_df)}",
    f"Number of test samples: {len(test_df)}",
    sep="\n",
)

# Full list of training column names, in file order
print("\nColumn names:")
print(list(train_df.columns))

#### Analyze the Target Variable (SalePrice)

In [None]:
# Summary statistics of the target variable (SalePrice)
print("Description of SalePrice:")
print(train_df['SalePrice'].describe())

# Histogram (50 bins) with a KDE overlay, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['SalePrice'], kde=True, bins=50, ax=ax)
ax.set_title('Distribution of SalePrice')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()

The SalePrice is heavily right-skewed (positively skewed).

* What the data tells us:
  
      Mean ($180,921) > Median ($163,000): Classic sign of right skew
      Long tail: Few very expensive houses (up to $755,000) pulling the mean higher
      Most houses: Clustered in the $100,000-$250,000 range
      Outliers: Several houses above $400,000

* Why this matters for machine learning:

      Model performance: Many algorithms assume normal distributions
      Outlier sensitivity: Linear models can be heavily influenced by those expensive houses
      Prediction accuracy: Skewed targets often lead to biased predictions



* Next steps we'll need to consider:

        Log transformation: log(SalePrice) often makes price data more normal
        Outlier analysis: Investigate those very expensive houses
        Feature engineering: Create features that capture price ranges

In [None]:
# Per-column count of missing entries, keeping only affected columns,
# ordered from most to least missing
missing_values = (
    train_df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Text listing of the affected columns
print("Missing values in training data:")
print(missing_values)

# Bar chart of the same counts on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
missing_values.plot.bar(ax=ax)
ax.set_title('Features with Missing Values')
ax.set_xlabel('Features')
ax.set_ylabel('Number of Missing Values')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

* Key insight: Most of these missing values indicate absence of features:


      PoolQC missing = No pool
      Alley missing = No alley access
      Fence missing = No fence
      FireplaceQu missing = No fireplace
  
* Notice the patterns:

      Garage features (81 missing each) - same houses missing ALL garage info = No Garage
      Basement features (37-38 missing each) - same houses missing ALL basement info = No Basement

* This is feature engineering gold! Instead of treating these as problems, we can:

      Fill missing values with meaningful categories like "None" or "No Pool"
      Create binary features like "HasPool", "HasGarage", "HasBasement"
      Use the absence/presence as predictive features

#### Look for Patterns in Missing Counts

In [None]:
# Column groups whose missing-value counts are EXACTLY equal within the group;
# the cells below check whether they are missing on the same rows.
garage_features = [
    'GarageType',
    'GarageYrBlt',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
]
basement_features = [
    'BsmtFinType2',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtCond',
    'BsmtQual',
]

In [None]:
# Do the garage columns go missing together, row by row?
garage_missing = train_df[garage_features].isna()
any_house_missing_garage = garage_missing.any(axis='columns').sum()
same_houses_missing_garage = garage_missing.all(axis='columns').sum()

print(f"Houses missing ALL garage features: {same_houses_missing_garage}")
print(f"Houses missing ANY garage feature: {any_house_missing_garage}")

# Equal counts mean every affected house lacks the whole garage group at once.

In [None]:
# Do the basement columns go missing together, row by row?
basement_missing = train_df[basement_features].isna()
any_house_missing_basement = basement_missing.any(axis='columns').sum()
same_houses_missing_basement = basement_missing.all(axis='columns').sum()

print(f"Houses missing ALL basement features: {same_houses_missing_basement}")
print(f"Houses missing ANY basement feature: {any_house_missing_basement}")

# Equal counts mean every affected house lacks the whole basement group at once.

In [None]:
# Share of missing entries per column, as a percentage of all training rows
missing_pct = train_df.isnull().sum().div(len(train_df)).mul(100)
# Keep only the columns where more than half of the values are absent
high_missing = missing_pct.loc[missing_pct.gt(50)]
print("Features missing >50% of values:")
print(high_missing.sort_values(ascending=False))

In [None]:
# Inspect the observed categories (NaN included) of the sparsest columns
high_missing_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

for feature in high_missing_features:
    column = train_df[feature]
    # Counted once and reused for both the absolute and the percentage figure
    n_missing = column.isnull().sum()
    print(f"\n{feature} - Unique values:")
    print(column.value_counts(dropna=False))
    print(f"Missing: {n_missing}/{len(train_df)} ({n_missing/len(train_df)*100:.1f}%)")

In [None]:
# Pool: PoolArea statistics on rows where PoolQC is absent
pool_df = train_df[train_df["PoolQC"].isnull()]
print(f"PoolArea when PoolQC is null: {pool_df['PoolArea'].describe()}")

# Misc features: MiscVal statistics on rows where MiscFeature is absent
missing_misc = train_df[train_df['MiscFeature'].isnull()]
print(f"\nMiscVal when MiscFeature missing: {missing_misc['MiscVal'].describe()}")

# Garage: size statistics on rows missing at least one garage descriptor
garage_df = train_df[train_df[garage_features].isnull().any(axis=1)]
print(f"\nGarageCars when garage_features is null: {garage_df['GarageCars'].describe()}")
print(f"GarageArea when garage_features is null: {garage_df['GarageArea'].describe()}")

# Basement: square-footage statistics on rows missing at least one basement descriptor
basement_df = train_df[train_df[basement_features].isnull().any(axis=1)]
print(f"\nBsmtUnfSF when basement_features is null: {basement_df['BsmtUnfSF'].describe()}")
print(f"BsmtFinSF1 when basement_features is null: {basement_df['BsmtFinSF1'].describe()}")
print(f"BsmtFinSF2 when basement_features is null: {basement_df['BsmtFinSF2'].describe()}")

# Fireplace: fireplace counts on rows where FireplaceQu is absent
fireplace_df = train_df[train_df["FireplaceQu"].isnull()]
print(f"\nFireplaces when FireplaceQu is null: {fireplace_df['Fireplaces'].describe()}")

# Masonry veneer: veneer area on rows where MasVnrType is absent
masonery_df = train_df[train_df['MasVnrType'].isnull()]
print(f"\nMasonery Area when MasVnrType is null: {masonery_df['MasVnrArea'].describe()}")

In [None]:
# Rows where at least one basement descriptor is missing
cond_missing = train_df[basement_features].isnull().any(axis=1)
all_basement_features = [
    "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinSF1",
    "BsmtFinType2", "BsmtFinSF2",
    "BsmtUnfSF",
]

# Rows where any of the three basement area columns is positive
cond_area = train_df[["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF"]].gt(0).any(axis=1)

# Rows satisfying both conditions, restricted to the basement columns
filtered = train_df.loc[cond_missing & cond_area, all_basement_features]
filtered

* CONFIRMED (Sum = 0):

        Pool: Missing PoolQC → PoolArea = 0 → No Pool
        Garage: Missing garage features → GarageCars + GarageArea = 0 → No Garage
        Fireplace: Missing FireplaceQu → Fireplaces = 0 → No Fireplace
        Misc: Missing MiscFeature → MiscVal = 0 → No Misc Feature

* ANOMALY DETECTED - Basements:

        Missing basement quality features BUT still have basement square footage!
        BsmtUnfSF = 2,539 sq ft (not zero!)
        This suggests houses have basements but missing quality ratings

In [None]:
# Masonry: rows whose veneer type is missing although a positive area is recorded
masonry_anomaly = train_df.loc[
    train_df['MasVnrType'].isna() & train_df['MasVnrArea'].gt(0)
]
print(f"Houses with missing type but area > 0: {len(masonry_anomaly)}")
print("Their areas:", masonry_anomaly['MasVnrArea'].tolist())

#### "None" Filling

In [None]:
# Categorical columns where a missing entry is replaced by the label "None"
none_features = [
    'PoolQC',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'FireplaceQu', 'MiscFeature', 'Alley', 'Fence',
]

# Replace NaN with "None" column by column, logging the count before and after
for feature in none_features:
    before_count = train_df[feature].isna().sum()
    filled = train_df[feature].fillna('None')
    train_df[feature] = filled
    after_count = filled.isna().sum()
    print(f"{feature}: {before_count} → {after_count} missing values")

# The garage year is numeric, so its gaps are filled with 0 instead of a label
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(0)

# Total NaN count left across the "None" columns
remaining_none_gaps = train_df[none_features].isna().sum().sum()
print(f"\nTotal missing values in 'None' features: {remaining_none_gaps}")

In [None]:
# Basement: are the rows missing every basement descriptor truly basement-free?
basement_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Boolean mask of houses with ALL basement descriptors missing, and the rows themselves
all_missing = train_df[basement_features].isna().all(axis='columns')
houses_no_basement = train_df.loc[all_missing]

# A real "no basement" group should show TotalBsmtSF equal to 0 throughout
print(houses_no_basement['TotalBsmtSF'].describe())

In [None]:
# Label as "None" only the houses that lack every basement descriptor AND
# report zero total basement area.
no_basement_mask = (
    train_df['TotalBsmtSF'].eq(0)
    & train_df[basement_features].isna().all(axis='columns')
)
# One .loc write per descriptor column, restricted to the masked rows
for feature in basement_features:
    train_df.loc[no_basement_mask, feature] = 'None'

In [None]:
# Houses with a missing veneer type and a recorded veneer area of exactly 0
# are labelled "None" (no masonry veneer).
no_masonery_mask = train_df['MasVnrArea'].eq(0) & train_df["MasVnrType"].isna()

train_df.loc[no_masonery_mask, "MasVnrType"] = 'None'

In [None]:
# Masonry area: show type and area for the rows whose area is missing

train_df.loc[train_df['MasVnrArea'].isna(), ["MasVnrType", "MasVnrArea"]]

In [None]:
# Rows where BOTH the veneer type and the veneer area are missing
missing_both_mask = train_df['MasVnrArea'].isna() & train_df["MasVnrType"].isna()

# The mask is computed up front, so the two writes below do not affect each other.
# Area becomes 0 for these houses
train_df.loc[missing_both_mask, 'MasVnrArea'] = 0

# Type becomes 'None' for these houses
train_df.loc[missing_both_mask, 'MasVnrType'] = 'None'

In [None]:
# LotFrontage

# How closely does LotFrontage follow LotArea?

# Scatter plot of the two columns on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=train_df, x='LotArea', y='LotFrontage', ax=ax)
ax.set_title('LotFrontage vs. LotArea')
ax.set_xlabel('Lot Area (sq ft)')
ax.set_ylabel('Lot Frontage (ft)')
plt.show()

# Correlation coefficient between the two columns (Series.corr default method)
correlation = train_df['LotFrontage'].corr(train_df['LotArea'])
print(f"Correlation between LotFrontage and LotArea: {correlation:.2f}")

There is a positive correlation, but it is weak and noisy because `LotArea` is roughly `width * depth`, while `LotFrontage` reflects only the width. A better proxy might be the square root of `LotArea`.

In [None]:
# Same correlation, but against the square root of LotArea
correlation_sqrt = train_df['LotFrontage'].corr(train_df['LotArea'].pipe(np.sqrt))
print(f"Correlation with sqrt(LotArea): {correlation_sqrt:.2f}")

The correlation with `sqrt(LotArea)` is still weak, so lot size alone is not a reliable basis for imputing `LotFrontage`.

In [None]:
# Analyze by `Neighborhood`

# Median LotFrontage per neighborhood, sorted from smallest to largest
neighborhood_lotfrontage = (
    train_df
    .groupby('Neighborhood')['LotFrontage']
    .median()
    .sort_values()
)

print("Median LotFrontage by Neighborhood:", neighborhood_lotfrontage, sep="\n")

In [None]:
# Horizontal bar chart of the per-neighborhood medians on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))
neighborhood_lotfrontage.plot.barh(ax=ax)
ax.set_title('Median LotFrontage by Neighborhood')
ax.set_xlabel('Median Lot Frontage (ft)')
ax.set_ylabel('Neighborhood')
plt.show()

Significant variation. Some neighborhoods (like 'MeadowV') will have smaller frontages, while others (like 'NoRidge') will have much larger ones. This confirms that `Neighborhood` is a very strong predictor for `LotFrontage`.

We therefore impute the missing `LotFrontage` values using the median `LotFrontage` of each house's respective `Neighborhood`.

In [None]:
# Impute missing LotFrontage with the median of its neighborhood

print(f"Missing LotFrontage before imputation: {train_df['LotFrontage'].isnull().sum()}")

# Per-row median of the house's own neighborhood, aligned to train_df's index.
# The built-in 'median' aggregation replaces a Python lambda per group.
neighborhood_median = train_df.groupby('Neighborhood')['LotFrontage'].transform('median')

# Fallback for a neighborhood with no observed LotFrontage at all: its median
# is NaN, so those rows would otherwise stay missing.
overall_median = train_df['LotFrontage'].median()

# Observed values are kept as-is; only the gaps are filled, neighborhood first.
train_df['LotFrontage'] = (
    train_df['LotFrontage']
    .fillna(neighborhood_median)
    .fillna(overall_median)
)

print(f"Missing LotFrontage after imputation: {train_df['LotFrontage'].isnull().sum()}")

In [None]:
# MasVnrType (5 missing)

# Impute the 5 inconsistent MasVnrType cases with the mode

# Earlier cells already set MasVnrType to 'None' wherever the veneer area is
# 0 or missing, so taking the mode of the whole column would most likely give
# 'None' and label houses that DO have a veneer as having none. The mode is
# therefore taken only over houses with a positive area and a real type.
typed_veneer_rows = (
    (train_df["MasVnrArea"] > 0)
    & train_df["MasVnrType"].notna()
    & (train_df["MasVnrType"] != 'None')
)
mode_candidates = train_df.loc[typed_veneer_rows, "MasVnrType"].mode()

# Fallback: if no such house exists, use the mode of the whole column
if mode_candidates.empty:
    mode_candidates = train_df["MasVnrType"].mode()

# Find the mode (most common value); ties resolve to the first sorted value
mas_vnr_type_mode = mode_candidates.iloc[0]

# Fill the remaining missing values
train_df["MasVnrType"] = train_df["MasVnrType"].fillna(mas_vnr_type_mode)

In [None]:
# BsmtExposure (1 missing) & `BsmtFinType2` (1 missing)

# Each column is filled with its own most frequent value, and only when it
# still contains a missing entry.
for bsmt_column in ("BsmtExposure", "BsmtFinType2"):
    if not train_df[bsmt_column].isnull().any():
        continue
    most_common = train_df[bsmt_column].mode()[0]
    train_df[bsmt_column] = train_df[bsmt_column].fillna(most_common)

In [None]:
# Electrical (1 missing)

# Replace the missing Electrical entry with the column's most frequent value

electrical_missing = train_df["Electrical"].isnull()
if electrical_missing.any():
    electrical_mode = train_df["Electrical"].mode()[0]
    train_df.loc[electrical_missing, "Electrical"] = electrical_mode

In [None]:
# Distribution of per-column missing counts after the imputation steps
train_df.isna().sum().describe()

In [None]:
# Number of columns currently in the training frame
train_df.shape[1]

In [None]:
# Distinct values present in BsmtFinType1
train_df['BsmtFinType1'].unique()

#### Separate Numerical and Categorical Features

In [None]:
# Partition the columns by dtype: numeric versus everything else
numerical_features = train_df.select_dtypes(include=np.number).columns.to_list()
categorical_features = train_df.select_dtypes(exclude=np.number).columns.to_list()

separator = "\n" + "-"*50 + "\n"
print(f"Numerical features ({len(numerical_features)}):")
print(numerical_features)
print(separator)
print(f"Categorical features ({len(categorical_features)}):")
print(categorical_features)

Key insights:

1.Some "numerical" features are actually categorical:

    MSSubClass (building class codes) should probably be treated as categorical
    OverallQual and OverallCond are ordinal ratings (1-10 scales)

2.Rich feature engineering opportunities:

    Age features: Calculate house age from YearBuilt, renovation age from YearRemodAdd
    Area ratios: GrLivArea / LotArea for living space efficiency
    Total features: Sum different area measurements
    Binary features: HasPool, HasGarage, HasBasement from those missing values we found

3.Many categorical features are actually ordinal:

    Quality ratings (Poor → Fair → Good → Excellent) can be encoded numerically
    This gives us flexibility in how we handle them

4.The missing value patterns now make perfect sense:

    Quality features missing = that feature doesn't exist on the house

#### Feature Type Transformations

In [None]:
print("=== FEATURE TYPE TRANSFORMATIONS ===")


def _encode_ordinal(feature, mapping):
    """Add `<feature>_Ordinal` to train_df by mapping its labels to integer ranks.

    Series.map turns every label that is not a key of `mapping` (and every
    missing value) into NaN without complaint, so such rows are reported
    instead of being lost silently.
    """
    encoded = train_df[feature].map(mapping)
    unmapped = train_df.loc[encoded.isnull(), feature]
    if not unmapped.empty:
        print(f"WARNING: {feature} has {len(unmapped)} value(s) without an ordinal code: "
              f"{unmapped.unique().tolist()}")
    train_df[feature + '_Ordinal'] = encoded


# 1. Convert MSSubClass to categorical
train_df['MSSubClass'] = train_df['MSSubClass'].astype(str)

# 2. Quality features to ordinal
quality_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 
                   'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

quality_mapping = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

for feature in quality_features:
    # Columns absent from the frame are skipped, as before
    if feature in train_df.columns:
        _encode_ordinal(feature, quality_mapping)

# 3. Basement exposure ordinal (no presence guard: a missing column raises KeyError)
bsmt_exposure_mapping = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
_encode_ordinal('BsmtExposure', bsmt_exposure_mapping)

# 4. Basement finish types ordinal  
bsmt_finish_mapping = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
for feature in ['BsmtFinType1', 'BsmtFinType2']:
    if feature in train_df.columns:
        _encode_ordinal(feature, bsmt_finish_mapping)

print("Feature type transformations completed!")

# Verification
print(f"\nNew ordinal features created: {len([col for col in train_df.columns if '_Ordinal' in col])}")

In [None]:
# Re-derive the numeric / non-numeric column lists now that dtypes have changed
numerical_features = train_df.select_dtypes(include=np.number).columns.to_list()
categorical_features = train_df.select_dtypes(exclude=np.number).columns.to_list()

separator = "\n" + "-"*50 + "\n"
print(f"Numerical features ({len(numerical_features)}):")
print(numerical_features)
print(separator)
print(f"Categorical features ({len(categorical_features)}):")
print(categorical_features)

#### Summary of Numerical Features

In [None]:
# Descriptive statistics, one row per numeric column, rounded to 2 decimals
print("Summary of Numerical Features:")
train_df[numerical_features].describe().T.round(2)

🚨 Major outliers detected:
LotArea: One house with 215,245 sq ft lot (vs median 9,478) - that's 22x larger!
GrLivArea: Max 5,642 sq ft (vs median 1,464) - mansion alert!
TotalBsmtSF: Max 6,110 sq ft basement - larger than most entire houses!

📊 Interesting patterns:
Many features are heavily skewed: PoolArea, MiscVal, 2ndFlrSF (medians are 0)
Typical house profile: Built 1973, ~1,464 sq ft, 3 bedrooms, 2-car garage
Age range: Houses from 1872 to 2010 (138-year span!)

🎯 Great feature engineering opportunities:
Age features: House age, renovation age
Total area: Sum all square footages
Ratios: Living area efficiency, basement percentage
Binary features: Has2ndFloor, HasPool, etc.

⚠️ Data quality issues to investigate:
Extreme outliers that might be data entry errors
17.7% missing LotFrontage values
Some suspicious values (8 bedrooms, 3 kitchens)
This is exactly the kind of thorough analysis that leads to great feature engineering! Ready to start investigating some of these outliers and patterns in more detail?

**Note:** the dataset ships with a data description file, `data_description.txt`, which documents what each feature means. The observations below revisit the numerical summary (`numerical_summary_insights`) in light of that file.

Excellent idea! The data description file is crucial for understanding what each feature actually means and how to interpret those outliers and patterns we've been seeing. Let me read through it and provide insights.
Reviewing the numerical summaries, I identified major outliers and skewed distributions, indicating potential data quality issues. Many high values seem to be genuine, but some suspicious entries (e.g., large basement sizes, unusual bedrooms) require further checks. The patterns support extensive feature engineering opportunities, such as creating age, total area, ratio, and binary indicators. Next, I will review the feature descriptions to better understand feature meanings and outlier implications.

I've learned that many outliers are legitimate, such as large lot sizes and multiple kitchens, reflecting different property types. Missing data often indicates absence (e.g., no pool or garage). Features like quality and condition are ordinal, useful for encoding. The detailed data description confirms which features are categorical, ordinal, or continuous, guiding proper preprocessing. Next, I'll focus on proper data type conversions, handle missing values, and create new features like house age and total area for better modeling.

This is AMAZING! 🎉 The data description completely transforms our understanding of the dataset. What initially looked like messy data with lots of missing values is actually a beautifully structured dataset with meaningful patterns.

🔍 Major revelations:

1. Those "missing values" aren't missing at all!
PoolQC missing = No Pool (that's why 1453/1460 are "missing")
Fence missing = No Fence
FireplaceQu missing = No Fireplace
All garage features missing together = No Garage

2. MSSubClass should definitely be categorical!
20 = "1-STORY 1946 & NEWER"
190 = "2 FAMILY CONVERSION"
This explains why some houses have 3 kitchens (duplexes!)

3. Quality features are perfectly ordinal:
Ex > Gd > TA > Fa > Po (Excellent to Poor)
BsmtQual even specifies height ranges: Ex (100+ inches) down to Po (<70 inches)

4. Those "outliers" are actually legitimate:
8 bedrooms could be large family homes
3 kitchens make sense for 2-family conversions
Huge lots could be agricultural zoning

🎯 This opens up incredible feature engineering opportunities:
Binary "Has" features (HasPool, HasGarage, HasBasement)
Proper ordinal encoding for quality features
Age calculations (HouseAge, RemodAge)
Total area combinations

### Feature Engineering

#### Binary "Has" Features

These features capture the presence/absence of important house amenities:

* Justifications for Binary "Has" Features:

Business Logic:

    * Threshold effects: Pool vs no pool matters more than pool size differences
    * Market segmentation: Buyers filter by "must-have" amenities
    * Amenity premiums: Presence commands premium regardless of exact size

Machine Learning Benefits:

    * Captures non-linear relationships: 0→200 sq ft jump bigger than 200→400
    * Robust to outliers: Binary immune to extreme values
    * Feature interactions: HasPool * OverallQual = luxury premium
    * Model interpretability: "Houses with pools sell for $12K more"

Technical Advantages:

    * Information preservation: Keep both binary AND continuous features
    * Handles missing values: None → 0, Any value → 1
    * Algorithm flexibility: Works well with all model types

In [None]:
print("=== CREATING POWERFUL NEW FEATURES ===")

# Set current year for age calculations
# NOTE(review): every age below is measured against this fixed constant, so
# each one is a shifted copy of its year column — confirm that a fixed
# reference year is what is wanted here.
current_year = 2023

# 1. Binary "Has" Features: 1 when the source quantity is positive, else 0
presence_sources = {
    'HasPool': 'PoolArea',
    'HasGarage': 'GarageArea',
    'HasBasement': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
    'Has2ndFloor': '2ndFlrSF',
    'HasMasVnr': 'MasVnrArea',
    'HasWoodDeck': 'WoodDeckSF',
    'HasOpenPorch': 'OpenPorchSF',
}
for flag_name, source_column in presence_sources.items():
    train_df[flag_name] = (train_df[source_column] > 0).astype(int)

# 2. Age Features
train_df['HouseAge'] = current_year - train_df['YearBuilt']
train_df['YearsSinceRemod'] = current_year - train_df['YearRemodAdd']
# Rows with GarageYrBlt <= 0 get age 0 rather than current_year - 0
train_df['GarageAge'] = np.where(train_df['GarageYrBlt'] > 0, current_year - train_df['GarageYrBlt'], 0)
train_df['WasRemodeled'] = (train_df['YearRemodAdd'] != train_df['YearBuilt']).astype(int)

# 3. Total Area Features
train_df['TotalSF'] = train_df['1stFlrSF'] + train_df['2ndFlrSF'] + train_df['TotalBsmtSF']
# Plain sum: half baths are counted with the same weight as full baths
train_df['TotalBath'] = (train_df['FullBath'] + train_df['HalfBath'] + 
                         train_df['BsmtFullBath'] + train_df['BsmtHalfBath'])
train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['EnclosedPorch'] + 
                            train_df['3SsnPorch'] + train_df['ScreenPorch'])

# 4. Ratio Features
train_df['LivingAreaRatio'] = train_df['GrLivArea'] / train_df['LotArea']
# Rows with TotalSF of 0 get ratio 0 instead of the division result
train_df['BasementRatio'] = np.where(train_df['TotalSF'] > 0, train_df['TotalBsmtSF'] / train_df['TotalSF'], 0)

# 5. Quality Averages (row-wise mean of the ordinal codes)
train_df['ExteriorQualityAvg'] = train_df[['ExterQual_Ordinal', 'ExterCond_Ordinal']].mean(axis=1)
train_df['BasementQualityAvg'] = train_df[['BsmtQual_Ordinal', 'BsmtCond_Ordinal', 'BsmtExposure_Ordinal']].mean(axis=1)

print("All powerful features created!")

# Count new features from an explicit list of what this cell created.
# Matching column names on keywords such as 'Total' also picked up the
# pre-existing column 'TotalBsmtSF' and inflated the count by one.
new_features = list(presence_sources) + [
    'HouseAge', 'YearsSinceRemod', 'GarageAge', 'WasRemodeled',
    'TotalSF', 'TotalBath', 'TotalPorchSF',
    'LivingAreaRatio', 'BasementRatio',
    'ExteriorQualityAvg', 'BasementQualityAvg',
]
# Guard against the list drifting away from the assignments above
absent_features = [col for col in new_features if col not in train_df.columns]
assert not absent_features, f"Listed but not created: {absent_features}"
print(f"Created {len(new_features)} new features!")

In [None]:
# Re-derive the numeric / non-numeric column lists after feature engineering
numerical_features = train_df.select_dtypes(include=np.number).columns.to_list()
categorical_features = train_df.select_dtypes(exclude=np.number).columns.to_list()

separator = "\n" + "-"*50 + "\n"
print(f"Numerical features ({len(numerical_features)}):")
print(numerical_features)
print(separator)
print(f"Categorical features ({len(categorical_features)}):")
print(categorical_features)

In [None]:
# Lift the row limit so the full per-column summary is displayed
pd.options.display.max_rows = None

# Descriptive statistics, one row per numeric column, rounded to 2 decimals
print("Summary of Numerical Features:")
train_df[numerical_features].describe().T.round(2)

In [None]:
# Persist the cleaned, feature-enriched training frame next to the raw data,
# leaving the DataFrame index out of the file
train_df.to_csv(
    '../data/train_cleaned.csv',
    index=False,
)
print(f"Cleaned dataset saved: {train_df.shape}")