In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os 


### Data Connection 

In [2]:
data = pd.read_csv("train.csv")

print(data.size)
print(data.shape)

data.columns


118260
(1460, 81)


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Data Exploration 

In [3]:
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
from ipywidgets import interact

@interact
def explore_columns(column=data.columns):
    col_data = data[column]
    
    print(f" {'='*10}  ANALYSIS FOR: {column}  {'='*10}")
    print(f"Basic Type: {col_data.dtype}")
    
    # 1. Missing Value Logic
    total_count = len(data)
    null_count = col_data.isnull().sum()
    null_pct = (null_count / len(data)) * 100
    print(f"Total count = {total_count} | Missing: {null_count} ({null_pct:.2f}%)")
    
    # 2. Memory Usage (Great for large datasets)
    memory = col_data.memory_usage(deep=True) / 1024  # in KB
    print(f"Memory Usage: {memory:.2f} KB")

    # 3. Numeric vs Categorical logic
    if np.issubdtype(col_data.dtype, np.number):
        # Numerical stats using NumPy/Pandas
        print(f"Min: {col_data.min()} | Max: {col_data.max()}")
        print(f"Mean: {col_data.mean():.2f} | Median: {col_data.median():.2f}")
        print(f"Skewness: {col_data.skew():.2f}") # Tells if data is lopsided
        print(f"Outliers (Z-score > 3): {(np.abs(col_data - col_data.mean()) > (3 * col_data.std())).sum()}")
    else:
        # Categorical stats
        print(f"Unique Values: {col_data.nunique()}")
        print(f"Top 3 Categories:\n{col_data.value_counts().head(3)}")

    # 4. Sample Data
    print(f"\nSample Data (First 5): {col_data.dropna().unique()[:5]}")

interactive(children=(Dropdown(description='column', options=('Id', 'MSSubClass', 'MSZoning', 'LotFrontage', '…

In [5]:
data.sample(5,random_state=1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
258,259,60,RL,80.0,12435,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,231500
267,268,75,RL,60.0,8400,Pave,,Reg,Bnk,AllPub,...,0,,,,0,7,2008,WD,Normal,179500
288,289,20,RL,,9819,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,122000
649,650,180,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,12,2007,WD,Normal,84500
1233,1234,20,RL,,12160,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,5,2010,COD,Abnorml,142000


## checking null values 

In [6]:
# Numerical Chekings 
null_values_counts = data.isnull().sum()
print(null_values_counts[null_values_counts > 0])

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [7]:
# Using percentage count

null_values_percentage = data.isnull().sum() * 100 / len(data)
missing_values = null_values_percentage[null_values_percentage > 0].sort_values(ascending=False)

print(missing_values)



PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
MasVnrType      59.726027
FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
BsmtFinType2     2.602740
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
BsmtQual         2.534247
MasVnrArea       0.547945
Electrical       0.068493
dtype: float64


In [8]:
# creating the null value dataframe 
null_count = data.isnull().sum()
missing_percentage = null_count*100/len(data)

missing_summury = pd.DataFrame({
    'Percentage': missing_percentage,
    'Count': null_count,
})

missing_summury = missing_summury[missing_summury['Percentage'] > 0].sort_index(ascending=False)
missing_summury

Unnamed: 0,Percentage,Count
PoolQC,99.520548,1453
MiscFeature,96.30137,1406
MasVnrType,59.726027,872
MasVnrArea,0.547945,8
LotFrontage,17.739726,259
GarageYrBlt,5.547945,81
GarageType,5.547945,81
GarageQual,5.547945,81
GarageFinish,5.547945,81
GarageCond,5.547945,81


In [9]:
total_nulls = np.sum(data.isnull().values)
print(total_nulls)


7829


In [10]:
data.loc[51:60,['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea']] 

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea
51,50,RM,52.0,6240
52,90,RM,110.0,8472
53,20,RL,68.0,50271
54,80,RL,60.0,7134
55,20,RL,100.0,10175
56,160,FV,24.0,2645
57,60,RL,89.0,11645
58,60,RL,66.0,13682
59,20,RL,60.0,7200
60,20,RL,63.0,13072


In [11]:
data.loc['MSSubClass'] = 70

In [12]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [13]:
data['LotFrontage'] = data['LotFrontage'] + 0.5