# House Prices Modeling

In [26]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [9]:
## Load the dataset

In [10]:
# Single, editable location of the input CSV files. The directory is
# machine-specific, so it is kept in one constant instead of being repeated
# inside every read_csv call.
DATA_DIR = Path('E:/dsp_bhagyasri_parupudi/data')

# Labelled data: holds the 'SalePrice' column that is split off as the target.
train_df = pd.read_csv(DATA_DIR / 'train.csv')
# Second CSV from the same folder.
test_df = pd.read_csv(DATA_DIR / 'test.csv')

In [11]:
## Split the data

In [12]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation. The fixed random_state keeps the split reproducible.
target_column = 'SalePrice'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
## Select features

In [14]:
# Columns that are standardised with StandardScaler in the scaling cell.
continuous_features = [
    'LotArea',
    'GrLivArea',
]
# Columns that are imputed with their most frequent value and then
# one-hot encoded.
categorical_features = [
    'MSZoning',
    'Neighborhood',
]

In [15]:
## Handle missing values in categorical features

In [16]:
# Fill gaps in the categorical columns with each column's most frequent value.
# The fill values are learned from the training split only and then applied
# unchanged to both splits.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train[categorical_features])
X_train[categorical_features] = imputer.transform(X_train[categorical_features])
X_test[categorical_features] = imputer.transform(X_test[categorical_features])

In [17]:
## Scale continuous features

In [18]:
# Standardise the continuous columns. Mean and standard deviation are
# estimated on the training split only and reused for the held-out split.
scaler = StandardScaler()
scaler.fit(X_train[continuous_features])
X_train[continuous_features] = scaler.transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

In [19]:
## Encode categorical features

In [20]:
# One-hot encode the categorical columns.
# handle_unknown='ignore': a category that is absent from the training split
# (possible after a random split of rare levels, or on new data fed to the
# saved encoder) is encoded as all zeros for that feature instead of making
# transform() raise ValueError. Output for known categories is unchanged.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_features]).toarray()
X_test_encoded = encoder.transform(X_test[categorical_features]).toarray()

# Final feature matrices: scaled continuous columns first, then the one-hot
# columns. np.concatenate joins purely by position, so the frames are
# converted to plain arrays directly (no index reset is needed).
X_train_processed = np.concatenate(
    [X_train[continuous_features].to_numpy(), X_train_encoded],
    axis=1,
)
X_test_processed = np.concatenate(
    [X_test[continuous_features].to_numpy(), X_test_encoded],
    axis=1,
)

In [21]:
## Train a simple model

In [22]:
# Fit a linear regression on the processed training matrix.
# y_train is consumed positionally, row i of the matrix pairs with the
# i-th target value.
model = LinearRegression()
model.fit(X=X_train_processed, y=y_train)

In [23]:
## Predict and evaluate

In [24]:
# Predictions for the held-out split, one value per row of X_test_processed.
y_pred = model.predict(X=X_test_processed)

def compute_rmsle(y_test, y_pred, precision=2):
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, in the same order as ``y_test``.
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    mean_sq_log_error = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(mean_sq_log_error), precision)

rmsle = compute_rmsle(y_test, y_pred)
print(f'RMSLE: {rmsle}')

# Display a random sample of (up to) 20 rows from the test set predictions.
print("\nRandom sample of 20 rows from the test set predictions:")
# y_test keeps its original row labels; y_pred is attached positionally.
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Cap the sample size so a small evaluation set does not make sample() raise,
# and seed it so a Restart & Run All shows the same rows every time.
sample_size = min(20, len(predictions_df))
print(predictions_df.sample(n=sample_size, random_state=42))

RMSLE: 0.21

Random sample of 20 rows from the test set predictions:
      Actual      Predicted
590   185900  205700.351214
1132  117500  182559.284909
836   153500  133657.199046
361   145000  144337.950188
422   113000  138399.435551
669   137500  166703.468488
777   142500  115310.462510
259    97000   86438.884128
984   126000  186748.419496
59    124900  146071.936435
1421  127500  131475.231559
451   280000  221320.174108
906   255000  262000.908056
220   204900  199897.267947
1088  137500  157517.984872
1054  255000  237353.076258
695   176000  184545.000700
128   155000  148091.235593
316   260000  235729.589912
192   192000  194983.572422


In [34]:
# Save the processed training and test sets as parquet reference files.
# Column labels are converted to strings before writing: parquet does not
# support non-string column names, and the files are read back with string
# labels, so writing them as strings keeps the round trip exact.
# NOTE(review): this cell overwrites the reference files on every run, so a
# later comparison against them only checks the parquet round trip of the
# current run.
X_train_reference = pd.DataFrame(X_train_processed)
X_train_reference.columns = X_train_reference.columns.astype(str)
X_train_reference.to_parquet('../models/X_train_processed_ref1.parquet', index=False)

X_test_reference = pd.DataFrame(X_test_processed)
X_test_reference.columns = X_test_reference.columns.astype(str)
X_test_reference.to_parquet('../models/X_test_processed_ref1.parquet', index=False)


In [35]:
!pip install pyarrow





In [36]:
!pip install fastparquet



In [38]:
def _as_string_column_frame(data):
    """Wrap ``data`` in a DataFrame whose column labels are strings."""
    frame = pd.DataFrame(data)
    frame.columns = frame.columns.astype(str)
    return frame


# The reference frames come back with string column labels, so both
# processed matrices are converted to match them.
X_train_processed = _as_string_column_frame(X_train_processed)
X_test_processed = _as_string_column_frame(X_test_processed)

In [40]:
# Read the stored training reference and compare it with the current
# processed training matrix. The boolean result is the cell's output.
reference_path = '../models/X_train_processed_ref1.parquet'
expected_X_train = pd.read_parquet(reference_path)
current_X_train = pd.DataFrame(X_train_processed)
current_X_train.equals(expected_X_train)

True

In [41]:
# Load reference data (saved in Step 0.1)
expected_X_train = pd.read_parquet('../models/X_train_processed_ref1.parquet')
expected_X_test = pd.read_parquet('../models/X_test_processed_ref1.parquet')

# Compare values, labels and dtypes of both processed frames with their
# references.
try:
    pd.testing.assert_frame_equal(X_train_processed, expected_X_train, check_dtype=True)
    pd.testing.assert_frame_equal(X_test_processed, expected_X_test, check_dtype=True)
except AssertionError as e:
    print("❌ Assertion failed:", e)
    # Re-raise so a mismatch stops "Run All" instead of being reduced to a
    # printed line that later cells silently run past.
    raise
else:
    print("✅ All assertions passed! DataFrames are identical.")

✅ All assertions passed! DataFrames are identical.


In [42]:
# Persist every fitted preprocessing step in the order it is applied.
# The imputer is saved as well: the scaler and encoder alone cannot reproduce
# the preprocessing, because the encoder was fitted on imputed columns.
# The encoder dump stays last so the cell output is unchanged.
joblib.dump(imputer, '../models/imputer.joblib')
joblib.dump(scaler, '../models/scaler.joblib')
joblib.dump(encoder, '../models/encoder.joblib')

['../models/encoder.joblib']