In [1]:
import pandas as pd

# Assuming you already loaded your dataset
train = pd.read_csv('/Users/lumanzhan/downloads/data/train.csv')

# Check total and percentage of missing values
missing_counts = train.isnull().sum()
missing_percent = (train.isnull().sum() / len(train)) * 100

missing_summary = pd.DataFrame({'Missing Count': missing_counts, 'Missing %': missing_percent})
print(missing_summary[missing_summary['Missing Count'] > 0])

              Missing Count  Missing %
LotFrontage             259  17.739726
Alley                  1369  93.767123
MasVnrType              872  59.726027
MasVnrArea                8   0.547945
BsmtQual                 37   2.534247
BsmtCond                 37   2.534247
BsmtExposure             38   2.602740
BsmtFinType1             37   2.534247
BsmtFinType2             38   2.602740
Electrical                1   0.068493
FireplaceQu             690  47.260274
GarageType               81   5.547945
GarageYrBlt              81   5.547945
GarageFinish             81   5.547945
GarageQual               81   5.547945
GarageCond               81   5.547945
PoolQC                 1453  99.520548
Fence                  1179  80.753425
MiscFeature            1406  96.301370


In [2]:
# 1. Drop only columns you truly don't need
train.drop(columns=['LotFrontage'], inplace=True, errors='ignore')

# 2. Fill numeric columns with median (only if they exist)
if 'LotArea' in train.columns:
    train['LotArea'].fillna(train['LotArea'].median(), inplace=True)

# 3. Fill categorical columns with mode (only if they exist)
if 'Alley' in train.columns and not train['Alley'].mode().empty:
    train['Alley'].fillna(train['Alley'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['LotArea'].fillna(train['LotArea'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Alley'].fillna(train['Alley'].mode()[0], inplace=True)


In [3]:
# Check missing data
print(train.isnull().sum())

# Drop irrelevant columns
train.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Fill numeric columns with median
for col in train.select_dtypes(include='number').columns:
    train[col].fillna(train[col].median(), inplace=True)

# Fill categorical columns with mode
for col in train.select_dtypes(include='object').columns:
    if train[col].mode().size > 0:
        train[col].fillna(train[col].mode()[0], inplace=True)

# Verify
print("✅ Missing values handled:")
print(train.isnull().sum().sum(), "missing values remain.")

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 80, dtype: int64
✅ Missing values handled:
0 missing values remain.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se