# House Prices Modeling

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
## Load the dataset

In [3]:
# Read the train/test CSVs from the project's data directory. The path is
# relative to the notebook's working directory -- the same '../data' folder
# the notebook writes its parquet output to -- so the notebook is not tied to
# one machine's drive layout.
DATA_DIR = '../data'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [5]:
# Preview the first rows of the training frame (head() shows 5 by default).
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Summarise per-column dtypes and non-null counts to spot features with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
# Columns used by the model, grouped by how they are preprocessed.
# Continuous features are passed through as numbers.
cont_features = [
    'OverallQual',
    'GrLivArea',
    'TotRmsAbvGrd',
    'BsmtFinSF1',
]
# Categorical features are one-hot encoded before modelling.
catg_features = [
    'Neighborhood',
    'GarageFinish',
    'ExterCond',
    'BsmtQual',
]
# Kept as a one-element list, so selecting with it yields a single-column DataFrame.
target = ['SalePrice']

In [9]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# keeps the split reproducible. (`train_test_split` is imported in the first cell.)
X = train_df[cont_features + catg_features]
# `target` is a one-element list, so y is a single-column DataFrame here.
y = train_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# One-hot encode the categorical features. The encoder is fitted on the
# training split only; handle_unknown='ignore' makes categories that appear
# only in the hold-out split encode as all zeros instead of raising.
# (`OneHotEncoder` is imported in the first cell.)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on the training categoricals, then apply the same mapping to both splits.
X_train_cat_encoded = onehot_encoder.fit_transform(X_train[catg_features])
X_test_cat_encoded = onehot_encoder.transform(X_test[catg_features])

# Wrap the encoded matrices in DataFrames labelled with the encoder's
# generated column names. No index is passed, so these frames get a fresh
# 0..n-1 index rather than the original row labels.
encoded_cols = onehot_encoder.get_feature_names_out(catg_features)
X_train_cat_encoded_df = pd.DataFrame(
    X_train_cat_encoded.toarray(), columns=encoded_cols
)
X_test_cat_encoded_df = pd.DataFrame(
    X_test_cat_encoded.toarray(), columns=encoded_cols
)

In [11]:
# Assemble the model matrices: numeric columns first, one-hot columns after.
# The numeric slices are re-indexed from 0 so their rows line up positionally
# with the encoded frames, which carry a default 0..n-1 index.
train_numeric = X_train[cont_features].reset_index(drop=True)
test_numeric = X_test[cont_features].reset_index(drop=True)

X_train_final = pd.concat([train_numeric, X_train_cat_encoded_df], axis=1)
X_test_final = pd.concat([test_numeric, X_test_cat_encoded_df], axis=1)

In [12]:
# Standardise every column using statistics learned from the training matrix
# only, then apply that same transform to the hold-out matrix.
# (`StandardScaler` is imported in the first cell.)
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train_final)
X_test_scaled = sc.transform(X_test_final)

In [13]:
# Build the frame that is written to parquet: every engineered training
# feature (numeric + one-hot columns) alongside the target.
# X_train_final is copied as-is. Re-wrapping it with
# `columns=X_train.columns` would select by label, dropping all one-hot
# columns and re-creating the raw categorical names as all-NaN columns.
# NOTE(review): this keeps the unscaled features, as the original cell did --
# confirm whether the scaled matrix was intended instead.
X_train_final1 = X_train_final.copy()
# np.ravel accepts y_train either as the single-column DataFrame produced by
# the split or as an already-flattened 1-D array, so this cell can be re-run
# at any point. The resulting Series has a 0..n-1 index matching the features.
train_target = pd.Series(np.ravel(y_train), name=target[0])
processed_df = pd.concat([X_train_final1, train_target], axis=1)

In [14]:
# Flatten the single-column target into a 1-D array for model fitting.
# np.ravel handles both the DataFrame from the split and an array that has
# already been flattened, so re-running this cell is harmless
# (`y_train.values.ravel()` fails on a second run because an ndarray has no `.values`).
y_train = np.ravel(y_train)

In [15]:
# Fit a 100-tree random forest on the scaled training matrix; the fixed
# random_state makes the fitted forest reproducible.
# (`RandomForestRegressor` is imported with the other dependencies in the first cell.)
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_scaled, y_train)

In [16]:
# Predict the target for the scaled hold-out matrix with the fitted forest.
y_pred = reg.predict(X_test_scaled)

In [17]:
from sklearn.metrics import mean_squared_log_error

In [18]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimal places to round the score to.

    Returns:
        The square root of scikit-learn's mean squared log error between
        ``y_test`` and ``y_pred``, rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

In [19]:
# Score the hold-out predictions; compute_rmsle rounds to 2 decimals by default.
rmsle = compute_rmsle(y_test, y_pred)
print(f'RMSLE: {rmsle}')

RMSLE: 0.16


In [20]:
# Persist the processed training frame as parquet (row index omitted) and
# print a confirmation once the write returns.
processed_path = '../data/processed_df.parquet'
processed_df.to_parquet(processed_path, index=False)
print("saved parquet file successfully!")

saved parquet file successfully!
