In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_log_error
import scipy.sparse as sp


## Model building

# Model training

In [3]:
train_data = pd.read_csv('D:/Epita Class/SEM 2/DSP Folder/train.csv')


In [6]:
continuous_features = ['LotArea', 'GrLivArea']  
categorical_features = ['MSZoning', 'Street']     

In [9]:
# Data Setup: Splitting into train and test sets
X = train_data[continuous_features + categorical_features]
y = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
scaler = StandardScaler()
scaler.fit(X_train[continuous_features])
X_train[continuous_features] = scaler.transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

In [11]:

encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

In [12]:


X_train_final = sp.hstack((X_train_encoded, X_train[continuous_features].values), format='csr')
X_test_final = sp.hstack((X_test_encoded, X_test[continuous_features].values), format='csr')

In [13]:
model = RandomForestRegressor()  # Change the model as needed
model.fit(X_train_final, y_train)

In [14]:
y_pred = model.predict(X_test_final)

# Model Evaluation

In [15]:

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)


In [17]:
rmsle_score = compute_rmsle(np.log(y_test), np.log(y_pred))
print("Root-Mean-Squared-Error (RMSE) Score:", rmsle_score)

Root-Mean-Squared-Error (RMSE) Score: 0.02


# Model inference

In [18]:
test_data = pd.read_csv('D:/Epita Class/SEM 2/DSP Folder/test.csv')

In [19]:
test_continuous_features = ['LotArea', 'GrLivArea']  
test_categorical_features = ['MSZoning', 'Street']

In [20]:
# Apply the same preprocessing steps used for training data
encoder = OneHotEncoder(handle_unknown='ignore')
test_X = test_data[test_continuous_features + test_categorical_features]
test_X[test_continuous_features] = scaler.transform(test_X[test_continuous_features])  # Scaling
encoder.fit(X_train[categorical_features])
test_X_encoded = encoder.transform(test_X[test_categorical_features])  # Encoding
test_X_final = sp.hstack((test_X_encoded, test_X[test_continuous_features].values), format='csr')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X[test_continuous_features] = scaler.transform(test_X[test_continuous_features])  # Scaling


In [21]:
# Predicting the house prices of this data
test_y_pred = model.predict(test_X_final)

In [22]:
test_y_pred

array([104663.  , 150917.  , 219489.14, ..., 172194.  , 131364.  ,
       235942.8 ])