In [229]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [230]:
# Read the house-prices training and test CSV files into DataFrames.
# NOTE(review): both paths are absolute and specific to one Windows machine;
# the notebook cannot run elsewhere without editing them.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'D:\\MY WORK\\Python\\Basic Project\\house-prices-advanced-regression-techniques\\train.csv',
        'D:\\MY WORK\\Python\\Basic Project\\house-prices-advanced-regression-techniques\\test.csv',
    )
)

In [231]:
# Lift the column cap so wide frames are displayed in full (None = no limit).
pd.options.display.max_columns = None

In [232]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [233]:
# Feature engineering: add derived columns to the training frame in place.

# Ages measured at the time of sale (sale year minus build / remodel year).
train_data['HouseAge'] = train_data['YrSold'].sub(train_data['YearBuilt'])
train_data['RemodelAge'] = train_data['YrSold'].sub(train_data['YearRemodAdd'])

# 0/1 presence flags: a pool exists when its area is positive, a fence when
# the Fence column is not missing.
train_data['HasPool'] = train_data['PoolArea'].gt(0).astype(int)
train_data['HasFence'] = (~train_data['Fence'].isna()).astype(int)

# Combined living-space figure.
# NOTE(review): in the rows displayed above, GrLivArea already equals
# 1stFlrSF + 2ndFlrSF, so this sum counts the above-ground area twice.
# It is left unchanged so the training and test features stay identical.
train_data['TotalLivingArea'] = (
    train_data['GrLivArea']
    .add(train_data['1stFlrSF'])
    .add(train_data['2ndFlrSF'])
)

# Garage age at the time of sale; missing wherever GarageYrBlt is missing.
train_data['GarageAge'] = train_data['YrSold'].sub(train_data['GarageYrBlt'])

# Single quality score: overall quality rating plus overall condition rating.
train_data['QualityIndex'] = train_data['OverallQual'].add(train_data['OverallCond'])

In [234]:
# Disabled code: the drop statements below are wrapped in a string literal, so
# nothing is removed from train_data. Because the literal is the cell's only
# expression, the notebook simply echoes the text as the cell output.
'''train_data.drop(columns=['YrSold', 'YearBuilt', 'YearRemodAdd'], inplace=True)
train_data.drop(columns=['PoolArea', 'Fence'], inplace=True)
train_data.drop(columns=['GrLivArea', '1stFlrSF', '2ndFlrSF'], inplace=True)
train_data.drop(columns=['GarageYrBlt'], inplace=True)
train_data.drop(columns=['OverallQual', 'OverallCond'], inplace=True)'''

"train_data.drop(columns=['YrSold', 'YearBuilt', 'YearRemodAdd'], inplace=True)\ntrain_data.drop(columns=['PoolArea', 'Fence'], inplace=True)\ntrain_data.drop(columns=['GrLivArea', '1stFlrSF', '2ndFlrSF'], inplace=True)\ntrain_data.drop(columns=['GarageYrBlt'], inplace=True)\ntrain_data.drop(columns=['OverallQual', 'OverallCond'], inplace=True)"

In [235]:
# Separate the regression target from the predictors; the Id column is an
# identifier and is left out of the feature matrix.
y = train_data['SalePrice']
x = train_data.drop(['Id', 'SalePrice'], axis=1)

In [236]:
# Preview the first five rows of the feature matrix, including the engineered columns.
x.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,HouseAge,RemodelAge,HasPool,HasFence,TotalLivingArea,GarageAge,QualityIndex
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,5,5,0,0,3420,5.0,12
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,31,31,0,0,2524,31.0,14
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,7,6,0,0,3572,7.0,12
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,91,36,0,0,3434,8.0,12
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,8,8,0,0,4396,8.0,13


In [237]:
# Split the feature columns by dtype so each group gets its own preprocessing.

# Text-like columns (object / category dtype) are treated as categorical.
categorical_features = x.select_dtypes(include=['object', 'category']).columns.tolist()

# Every numeric dtype counts as numerical. Listing only 'int64'/'float64'
# would silently skip numeric columns of another width (for example the int32
# that `astype(int)` produces where NumPy's default integer is 32-bit), and a
# column that appears in neither list is dropped by the ColumnTransformer.
numerical_features = x.select_dtypes(include='number').columns.tolist()

# Display the results
print("Categorical Features:")
print(categorical_features)

print("\nNumerical Features:")
print(numerical_features)

Categorical Features:
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

Numerical Features:
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'Garage

In [238]:
# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer by column name.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [239]:
# Three candidate models, each chained after the same preprocessing step.

# Pipeline 1: Linear Regression
pipeline1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Pipeline 2: Random Forest (seeded so results are repeatable)
pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Pipeline 3: Gradient Boost
# Seeded like the random forest and the train/test split: this is the pipeline
# that produces the submission and is saved to disk, so it must be reproducible.
# The step keeps its original name 'regressor' so code that looks the step up
# by name in the saved pipeline keeps working.
pipeline3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

In [240]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [241]:
# Linear regression: train on the training split, then score on the hold-out split.
predictions1 = pipeline1.fit(x_train, y_train).predict(x_test)
# RMSE = square root of the mean squared error, in the same units as SalePrice.
rmse1 = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions1))
print("Linear Regression RMSE:", rmse1)

Linear Regression RMSE: 65465.234459504754


In [242]:
# Random forest: train on the training split, then score on the hold-out split.
predictions2 = pipeline2.fit(x_train, y_train).predict(x_test)
# RMSE = square root of the mean squared error, in the same units as SalePrice.
rmse2 = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions2))
print("Random Forest RMSE:", rmse2)

Random Forest RMSE: 28852.592933678414


In [243]:
# Fit the Gradient Boosting model on the training split and score it on the
# hold-out split.
pipeline3.fit(x_train, y_train)
predictions3 = pipeline3.predict(x_test)
# RMSE = square root of the mean squared error, in the same units as SalePrice.
rmse3 = np.sqrt(mean_squared_error(y_test, predictions3))
# The label names the model actually scored here; it previously read
# "Random Forest RMSE:", copied over from the random forest cell.
print("Gradient Boosting RMSE:", rmse3)

Random Forest RMSE: 26415.25599546747


In [249]:
# Feature engineering for the test frame: the same derived columns, with the
# same formulas, as were added to train_data, so the fitted pipeline finds
# every column it was trained on.
engineered_columns = {
    # Temporal features: ages measured at the time of sale
    'HouseAge': test_data['YrSold'] - test_data['YearBuilt'],
    'RemodelAge': test_data['YrSold'] - test_data['YearRemodAdd'],
    # Pool and fence presence flags (0/1)
    'HasPool': (test_data['PoolArea'] > 0).astype(int),
    'HasFence': test_data['Fence'].notna().astype(int),
    # Living space (same formula as the training frame)
    'TotalLivingArea': test_data['GrLivArea'] + test_data['1stFlrSF'] + test_data['2ndFlrSF'],
    # Garage age at the time of sale
    'GarageAge': test_data['YrSold'] - test_data['GarageYrBlt'],
    # Quality indicator: overall quality plus overall condition
    'QualityIndex': test_data['OverallQual'] + test_data['OverallCond'],
}

# Attach the columns in place, in the order listed above.
for column_name, column_values in engineered_columns.items():
    test_data[column_name] = column_values

In [250]:
# Predict sale prices for test_data with the fitted gradient boosting pipeline.
# The whole frame is passed in; the preprocessor was configured with explicit
# column-name lists (numerical_features / categorical_features).
predictions = pipeline3.predict(test_data)

In [251]:
# Sanity check: there should be one prediction per test row.
print("Test Data Shape: {}".format(test_data.shape))
print("Predictions Shape: {}".format(predictions.shape))


Test Data Shape: (1459, 87)
Predictions Shape: (1459,)


In [252]:
# Build the submission table: each test-set Id paired with its predicted price.
submission = test_data[['Id']].assign(SalePrice=predictions)

# Write it to CSV without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [253]:
from joblib import dump  # serialisation helper used to persist the pipeline

# Write the fitted gradient boosting pipeline (preprocessing + model) to disk.
# As the cell's last expression, the return value of dump is echoed as output.
dump(value=pipeline3, filename='house_pred_model.joblib')


['house_pred_model.joblib']