In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list the Kaggle input directory, printing one full path per file.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import os

# Read the training and test tables from the Kaggle input directory
train_csv = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Report (rows, columns) for each table
train_shape, test_shape = train_df.shape, test_df.shape
print("Train shape:", train_shape)
print("Test shape:", test_shape)

# Count missing entries per training column and list them, largest count first
missing_per_column = train_df.isna().sum()
print("Missing values in training data:")
print(missing_per_column.sort_values(ascending=False))

# Handle missing values
# Fill values are learned from the training data only and then applied to both
# frames, so a test row is imputed the same way regardless of which other test
# rows it is scored together with.
numeric_cols_train = train_df.select_dtypes(include=np.number).columns
numeric_cols_test = test_df.select_dtypes(include=np.number).columns

# Fill numeric missing values with the training median
train_medians = train_df[numeric_cols_train].median()
train_df[numeric_cols_train] = train_df[numeric_cols_train].fillna(train_medians)

# Numeric test columns that have no training median fall back to their own median
test_fill_values = train_medians.reindex(numeric_cols_test)
test_fill_values = test_fill_values.fillna(test_df[numeric_cols_test].median())
test_df[numeric_cols_test] = test_df[numeric_cols_test].fillna(test_fill_values)

# Fill categorical missing values with the training mode, and pin every
# categorical column to the levels observed in training. Sharing one level list
# matters for the one-hot step below: `drop_first=True` removes the first level
# of each column, and if the two frames were encoded from their own levels they
# could drop different ones, silently changing the meaning of the aligned columns.
categorical_cols = train_df.select_dtypes(include='object').columns
for col in categorical_cols:
    train_mode = train_df[col].mode()[0]
    train_values = pd.Categorical(train_df[col].fillna(train_mode))
    train_df[col] = train_values
    if col in test_df.columns:
        # Test values never seen in training become NaN inside the categorical
        # and are therefore encoded as all-zero dummies (the baseline level).
        test_df[col] = pd.Categorical(
            test_df[col].fillna(train_mode),
            categories=train_values.categories,
        )

# One-hot encoding for categorical variables (category-dtype columns are encoded
# by default, with one dummy per declared level, minus the first)
train_df = pd.get_dummies(train_df, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)

# Align columns between train and test sets: test is reindexed to the training
# columns, and any column it lacks (such as the target) is created filled with 0
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Ensure no remaining NaNs
assert train_df.isnull().sum().sum() == 0, "There are still missing values in the training data!"
assert test_df.isnull().sum().sum() == 0, "There are still missing values in the test data!"

# Prepare features and target variable
# Every column except the target stays in the feature matrix.
# NOTE(review): that includes any row-identifier column (e.g. an `Id` field) the
# data may carry — confirm it is meant to be used as a predictor.
target_name = 'SalePrice'
y = train_df[target_name]
X = train_df.drop(columns=[target_name])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = holdout_split

# Fit a Random Forest with default hyperparameters (seeded for repeatability)
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_val)

# Root-mean-squared error between held-out targets and predictions
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

# Persist the fitted estimator to the current working directory.
# The dump call is left as the final expression so the notebook echoes its
# return value as the cell output.
import joblib
model_file = 'house_price_model.pkl'
joblib.dump(model, model_file)

Train shape: (1460, 81)
Test shape: (1459, 80)
Missing values in training data:
PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
MasVnrType      872
               ... 
ExterQual         0
Exterior2nd       0
Exterior1st       0
RoofMatl          0
SalePrice         0
Length: 81, dtype: int64
RMSE: 28352.112565630196


['house_price_model.pkl']