In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('AmesHousing.csv')

In [3]:
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [5]:
df.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [6]:
df.isnull().sum()

Order               0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
                 ... 
Mo Sold             0
Yr Sold             0
Sale Type           0
Sale Condition      0
SalePrice           0
Length: 82, dtype: int64

## Define Target Variable

In [7]:
TARGET_COL = "SalePrice"

df[TARGET_COL].describe

<bound method NDFrame.describe of 0       215000
1       105000
2       172000
3       244000
4       189900
         ...  
2925    142500
2926    131000
2927    132000
2928    170000
2929    188000
Name: SalePrice, Length: 2930, dtype: int64>

In [14]:
numerical_features = df.select_dtypes(include=[np.number]).columns.to_list()
categorical_features = df.select_dtypes(exclude=[np.number]).columns.to_list()
len(numerical_features) , len(categorical_features)


(39, 43)

In [15]:
numerical_features[:10],categorical_features[:10]

(['Order',
  'PID',
  'MS SubClass',
  'Lot Frontage',
  'Lot Area',
  'Overall Qual',
  'Overall Cond',
  'Year Built',
  'Year Remod/Add',
  'Mas Vnr Area'],
 ['MS Zoning',
  'Street',
  'Alley',
  'Lot Shape',
  'Land Contour',
  'Utilities',
  'Lot Config',
  'Land Slope',
  'Neighborhood',
  'Condition 1'])

## Now check for missing values

In [20]:
missing_count = df.isnull().sum()

missing_percent = ((missing_count / len(df)) * 100).round(2)

missing_table = (
    pd.DataFrame({
        "Missing Count": missing_count,
        "Missing Percent": missing_percent
    })
    .sort_values(by="Missing Percent", ascending=False)
)

missing_table[missing_table["Missing Percent"] > 0].head(20)

Unnamed: 0,Missing Count,Missing Percent
Pool QC,2917,99.56
Misc Feature,2824,96.38
Alley,2732,93.24
Fence,2358,80.48
Mas Vnr Type,1775,60.58
Fireplace Qu,1422,48.53
Lot Frontage,490,16.72
Garage Qual,159,5.43
Garage Yr Blt,159,5.43
Garage Cond,159,5.43


In [21]:
structural_cat_features = [
    "Pool QC",
    "Misc Feature",
    "Alley",
    "Fence",
    "Mas Vnr Type",
    "Fireplace Qu",
    "Lot Frontage",
    "Garage Qual",
    "Garage Yr Blt",
    "Garage Cond",
    "Garage Finish",
    "Garage Type",
    "Bsmt Exposure",
    "BsmtFin Type 2",
    "Bsmt Qual",
    "Bsmt Cond",
    "BsmtFin Type 1",
    "Mas Vnr Area",
    "Bsmt Full Bath",
    "Bsmt Half Bath"
]
df[structural_cat_features] = df[structural_cat_features].fillna("None")

In [22]:
structural_num_features = [
    "Garage Cars",
    "Garage Area",
    "Total Bsmt SF",
    "Bsmt Unf SF",
    "BsmtFin SF 1",
    "BsmtFin SF 2",
]
df[structural_num_features] = df[structural_num_features].fillna(0)

### Variance = Spread of Data

#### Lot Frontage

In [None]:
lot_frontage_median = (
    df.groupby("Neighborhood")["Lot Frontage"]  ##.median()
)

df["Lot Frontage"] = df.apply(
    lambda row: lot_frontage_median[row["Neighborhood"]]
    if pd.isna(row["Lot Frontage"]) else row["Lot Frontage"],
    axis=1
)