In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Data Preprocessing and Feature Engineering

# Separate target variable
y = train_df['SalePrice']
train_df.drop(['SalePrice'], axis=1, inplace=True)

# Concatenate train and test data for consistent preprocessing
data = pd.concat([train_df, test_df], sort=False)

# Handle missing values
numerical_features = data.select_dtypes(include=[np.number]).columns
categorical_features = data.select_dtypes(include=[object]).columns

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

data[numerical_features] = num_imputer.fit_transform(data[numerical_features])
data[categorical_features] = cat_imputer.fit_transform(data[categorical_features])

# Encode categorical features
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [3]:
# Split back into train and test sets
train_df = data.iloc[:len(y), :]
test_df = data.iloc[len(y):, :]

# Feature scaling
scaler = StandardScaler()
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

# Split the training data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_df, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[numerical_features] = scaler.transform(test_df[numerical_features])


In [7]:
# Model training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print(f'Validation RMSE: {rmse}')
print(f'Validation R^2 Score: {r2}')

# Train on the full training set and predict on the test set
model.fit(train_df, y)
test_predictions = model.predict(test_df)


Validation RMSE: 28718.585783418937
Validation R^2 Score: 0.8924743032927607


In [8]:
# Create submission file
submission = pd.read_csv('sample_submission.csv')
submission['SalePrice'] = test_predictions
submission.to_csv('submission.csv', index=False)