In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [7]:
train=pd.read_csv(rf"C:\Users\user\Downloads\house-prices-advanced-regression-techniques\train.csv")
test=pd.read_csv(rf"C:\Users\user\Downloads\house-prices-advanced-regression-techniques\test.csv")
submission=pd.read_csv(rf"C:\Users\user\Downloads\house-prices-advanced-regression-techniques\sample_submission.csv")

In [8]:
train.shape

(1460, 81)

In [9]:
test.shape

(1459, 80)

In [10]:
submission.shape

(1459, 2)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [12]:
y = np.log1p(train['SalePrice'])
train.drop(columns=['SalePrice'],inplace=True)

# Feature Engineering

In [13]:
all_data=pd.concat([train,test],keys=['train','test'])

In [14]:
numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = all_data.select_dtypes(include=['object']).columns

In [15]:
numeric_transform = Pipeline(steps=[
    ('impute_data', SimpleImputer(strategy='median')),
    ('scale_data', StandardScaler())])

In [16]:
numeric_transform

In [17]:
categorical_transform = Pipeline(steps=[('impute_data', SimpleImputer(strategy='most_frequent')),
                                         ('enco_data',OneHotEncoder(handle_unknown='ignore'))])

In [18]:
categorical_transform

In [19]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transform, numeric_features),
        ('cate', categorical_transform, categorical_features)])

In [20]:
preprocessor

In [21]:
model=Pipeline(steps=[('perpossing',preprocessor),
                        ('regressor', XGBRegressor(objective='reg:squarederror', n_estimators=1000,
                                   learning_rate=0.05, max_depth=3, subsample=0.7, colsample_bytree=0.7))])


In [22]:
model

In [23]:
train_data=all_data.loc['train']

In [24]:
test_data=all_data.xs('test')

In [25]:
train_data.shape

(1460, 80)

In [26]:
test_data.shape

(1459, 80)

In [27]:
model.fit(train_data,y)

In [28]:
prediction=model.predict(test_data)

In [29]:
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [30]:
cross_val_result=cross_val_score(model,train_data,y,cv=kf)

In [31]:
cross_val_result

array([0.91354258, 0.92057128, 0.82515471, 0.9145647 , 0.91920709])

# It almost 90% accurate

In [32]:
cross_val_result.mean() # 90% accuracy of the model

0.8986080701614133

In [33]:
submission['SalePrice']=np.expm1(prediction)

In [34]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,121095.9375
1,1462,157504.9375
2,1463,183496.796875
3,1464,186979.859375
4,1465,175281.609375


In [35]:
submission.to_csv('submission.csv', index=False)