<a href="https://colab.research.google.com/github/Abhijeetkhade11/KaggleCompetitions/blob/main/house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


import numpy as np
import pandas as pd


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the training and test CSV files from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first rows of the training frame as the cell output.
train_df.head()

In [None]:
# Summary statistics of the training frame, shown as the cell output.
train_df.describe()

In [None]:
# Print the column listing (dtypes and non-null counts) of the freshly loaded training frame.
train_df.info()


In [None]:
# Drop unnecessary columns from the training frame.
#
# Changes from the earlier version of this cell:
# * `axis=0` was removed. pandas ignores `axis` when `columns=` is given, and
#   axis=0 wrongly suggested that rows were being dropped.
# * The frame is rebound instead of mutated with `inplace=True`, and
#   `errors='ignore'` is passed so the cell can be re-run after the columns are
#   already gone (the in-place version raised KeyError on a second run).
#   Trade-off: a misspelled column name is now skipped silently, so keep this
#   list in sync with the CSV header.
train_df = train_df.drop(
    columns=[
        'Id',
        'Alley',
        'MasVnrType',
        'FireplaceQu',
        'PoolQC',
        'Fence',
        'MiscFeature',
    ],
    errors='ignore',
)

In [None]:
# Re-inspect the column listing now that the unneeded columns have been dropped.
train_df.info()

In [None]:
# Per-column null counts, restricted to the columns that have at least one null.
col_empty = train_df.isnull().sum().loc[lambda counts: counts > 0]
col_empty

In [None]:
# Print a summary of the missing-count Series built from train_df.isnull().sum().
col_empty.info()

In [None]:
# Display the per-column missing-value counts again (same Series that was shown when col_empty was built).
col_empty

In [None]:
# Columns whose missing values get filled in the following cells.
col_empty_fill = [
    'LotFrontage',
    'MasVnrArea',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'GarageType',
    'GarageYrBlt',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
]

# Split that list by dtype: numeric columns first, then object columns.
num_cols, cat_cols = (
    train_df[col_empty_fill].select_dtypes(include=dtype_kind).columns
    for dtype_kind in ('number', 'object')
)

In [None]:
# Replace missing values in each numeric column with that column's own mean.
train_df[num_cols] = train_df[num_cols].pipe(
    lambda numeric: numeric.fillna(numeric.mean())
)


In [None]:
# Replace missing values in each object column with that column's most frequent value.
#
# DataFrame.mode() returns a DataFrame, with one row per tied mode. Passing
# that frame straight to fillna() aligns it on the row index, so only row 0
# (and any further mode rows) would be filled and nearly every NaN would be
# left in place. Taking the first row gives a Series keyed by column name,
# which fillna() applies to every row of the matching column.
cat_modes = train_df[cat_cols].mode(dropna=True)
if not cat_modes.empty:  # nothing to fill with when there are no columns/values
    train_df[cat_cols] = train_df[cat_cols].fillna(cat_modes.iloc[0])

In [None]:
# Check the non-null counts again after the fill steps above.
train_df.info()

In [None]:
# Summary statistics of the SalePrice column (used as the target further down).
train_df['SalePrice'].describe()

In [None]:
# Distribution of SalePrice: 50-bin histogram with a KDE curve overlaid.
sns.histplot(data=train_df, x='SalePrice', bins=50, kde=True)

# Model Training

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the SalePrice column.
y = train_df['SalePrice']

# Feature matrix: every other column plus three derived features.
X = train_df.drop(columns='SalePrice').assign(
    # Feature Engineering
    # Basement + first floor + second floor area.
    TotalSF=lambda frame: (
        frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
    ),
    # Difference between year sold and year built.
    HouseAge=lambda frame: frame['YrSold'] - frame['YearBuilt'],
    # Difference between year sold and the remodel year.
    RemodAge=lambda frame: frame['YrSold'] - frame['YearRemodAdd'],
)



In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Print the shape of each split, one per line: X_train, X_test, y_train, y_test.
print(
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
    sep='\n',
)

In [None]:
from sklearn.impute import SimpleImputer

# Imputers used by the preprocessing step: median for numeric columns,
# most-frequent value for categorical columns.
num_imputer, cat_imputer = (
    SimpleImputer(strategy=fill_strategy)
    for fill_strategy in ('median', 'most_frequent')
)

In [None]:

# Partition the feature columns by dtype; each group gets its own transformer.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(transformers=[
    # Numeric columns: imputation only.
    ('num', num_imputer, num_cols),
    # Object columns: impute, then one-hot encode. Categories not seen during
    # fit are ignored at transform time instead of raising.
    (
        'cat',
        Pipeline(steps=[
            ('imputer', cat_imputer),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]),
        cat_cols,
    ),
])



In [None]:
# End-to-end estimator: the preprocessing step followed by an XGBoost regressor.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    (
        'xgb',
        XGBRegressor(
            # Many small boosting steps with shallow trees.
            n_estimators=2000,
            learning_rate=0.03,
            max_depth=4,
            # Row and column subsampling ratios.
            subsample=0.8,
            colsample_bytree=0.8,
            objective='reg:squarederror',
            random_state=42,
        ),
    ),
])

In [None]:
# Fit the pipeline on the training split, then predict the held-out split.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Root-mean-squared error between held-out targets and predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f'RMSE: {rmse:.3f}')


In [None]:

# Add the same three derived features to the test frame that were added to X.
test_df = test_df.assign(
    TotalSF=lambda frame: (
        frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
    ),
    HouseAge=lambda frame: frame['YrSold'] - frame['YearBuilt'],
    RemodAge=lambda frame: frame['YrSold'] - frame['YearRemodAdd'],
)

In [None]:
# Predict sale prices for the rows of test_df.
#
# The pipeline is fitted on the raw SalePrice column (y is never
# log-transformed), so its predictions are already in price units. The earlier
# np.expm1(...) wrapper assumed a log1p-transformed target; applied to raw
# price predictions it overflows to inf and ruins the submission.
test_preds = model.predict(test_df)


In [None]:
# Two-column submission frame: the test Ids alongside their predicted prices.
submission = test_df[['Id']].assign(SalePrice=test_preds)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)
