In [1]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [3]:
# Load the dataset
# The CSV location can be overridden with the TRAIN_CSV_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset (or empty) the original hard-coded location is
# used, so existing behaviour is unchanged.
DEFAULT_TRAIN_CSV = "C:\\Users\\asaha\\Downloads\\train.csv"
train_csv_path = os.environ.get("TRAIN_CSV_PATH") or DEFAULT_TRAIN_CSV
df = pd.read_csv(train_csv_path)

In [4]:
# Peek at the top five records to sanity-check the load
print(df.head(n=5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [5]:
# Count the null entries in every column and print the per-column totals
null_mask = df.isna()
missing_values = null_mask.sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# columns that describe() reports by default
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [7]:
# Data Preprocessing
# Handle missing values
# Numerical columns: replace missing entries with the column median.
# NOTE(review): the imputer is fitted on the whole frame, before the
# train/test split done later in this notebook, so rows that end up in the
# test set influence the fill values. Consider fitting on training rows only.
# NOTE(review): presumably this dtype selection also picks up Id and
# SalePrice; confirm whether the target should pass through the imputer.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
# Keep the fitted median imputer under a dedicated name: the following cell
# rebinds `imputer` to a different strategy, which would otherwise discard
# this fitted object and make it impossible to reuse the same medians on
# new data.
numerical_imputer = SimpleImputer(strategy='median')
df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])
imputer = numerical_imputer  # backward-compatible alias for the original name

In [8]:
# Categorical (object-dtype) columns: fill gaps with each column's mode.
# NOTE(review): in the preview above, columns such as Alley, PoolQC and Fence
# are NaN in every displayed row; presumably NaN marks an absent feature there
# rather than an unknown value. Confirm against the data description before
# relying on mode imputation.
categorical_cols = df.select_dtypes(include='object').columns
imputer = SimpleImputer(strategy='most_frequent').fit(df[categorical_cols])
df[categorical_cols] = imputer.transform(df[categorical_cols])

In [9]:
# Integer-encode every categorical column in place, keeping each fitted
# encoder (keyed by column name) so the mapping can be looked up later.
# NOTE(review): LabelEncoder assigns codes by sorted class order, which
# implies an ordering between categories. Confirm that is acceptable for the
# downstream model.
label_encoders = {}
for column_name in categorical_cols:
    encoder = LabelEncoder()
    label_encoders[column_name] = encoder
    df[column_name] = encoder.fit_transform(df[column_name])

In [10]:
# Feature Engineering
# TotalSF (Total Square Feet): element-wise sum of the TotalBsmtSF,
# 1stFlrSF and 2ndFlrSF columns
df['TotalSF'] = df['TotalBsmtSF'].add(df['1stFlrSF']).add(df['2ndFlrSF'])

In [11]:
# Separate the target (SalePrice) from the predictors (everything except
# SalePrice and Id), then hold out 20% of the rows for testing.
# random_state is fixed so the split is reproducible.
y = df['SalePrice']
X = df.drop(['SalePrice', 'Id'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Standardize every column of the feature matrices. The scaler is fitted on
# the training split only and then applied to both splits, so no test-set
# statistics are used for scaling.
# NOTE(review): this rebinds X_train / X_test to the scaler's output, so
# re-running the cell would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Show the first five rows of the scaled training features and the
# corresponding first five training targets
print(X_train[0:5])
print(y_train.iloc[:5])

[[-8.66764305e-01 -5.47960924e-02 -4.19318912e-03 -2.12895710e-01
   5.86210382e-02 -1.62363908e-01  7.65534965e-01  2.99797705e-01
  -2.92728207e-02  6.30128495e-01 -2.21711422e-01 -3.98676874e-02
  -3.23291009e-02 -3.49941390e-02 -4.06196906e-01 -5.50975312e-01
  -8.20444558e-01  3.72217301e-01 -4.55468963e-01 -1.34606303e+00
  -4.98500279e-01 -1.19904284e-01 -5.23592118e-01 -6.95618688e-01
  -2.57595987e-01 -5.97888700e-01  6.60379021e-01 -2.28531528e+00
  -5.53898814e-01  8.32883184e-01  2.90472494e-01  6.22802891e-01
   6.62971898e-01  1.03726861e+00  3.01766507e-01 -2.85504061e-01
  -4.00281645e-01  5.72612193e-01 -1.28197519e-01  1.40155939e+00
   2.76582168e-01  2.95982070e-01  3.74235230e-01 -8.01922924e-01
  -1.18998664e-01 -4.07093148e-01  1.10531958e+00 -2.42870023e-01
  -1.05556573e+00 -7.64097523e-01  1.36218320e-01 -2.12757112e-01
   7.96415828e-01 -9.64565909e-01  2.63599626e-01 -9.58592150e-01
  -4.25722029e-01 -6.87680057e-01 -9.04402770e-01 -2.72829509e-01
  -1.05654