In [17]:
# House Price Prediction using Linear Regression

## Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load Data
# Relative paths to the training and test CSV files (resolved against the
# current working directory).
train_path, test_path = '../data/train.csv', '../data/test.csv'

# Read both CSV files into DataFrames, training set first.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Display initial rows of the dataset: heading, one blank line, then the table.
print("Training Data Sample:\n", train_df.head(), sep="\n")

## Data Preprocessing
# Raw columns used to build the model inputs, plus the SalePrice target.
features = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'FullBath', 'SalePrice']
# Drop every row that has a missing value in any selected column. The explicit
# .copy() makes train_selected an independent frame, so the column added below
# is written to it unambiguously (the test-set preparation copies the same way).
train_selected = train_df[features].dropna().copy()

# Feature Engineering
# Combine the four area columns into a single size feature.
# NOTE(review): GrLivArea may already include 1stFlrSF and 2ndFlrSF, in which
# case above-ground area is counted twice here -- confirm against the data
# dictionary. Any change must be mirrored where the test features are built.
train_selected['TotalSqFt'] = (
    train_selected['GrLivArea']
    + train_selected['TotalBsmtSF']
    + train_selected['1stFlrSF']
    + train_selected['2ndFlrSF']
)

# Define features and target
X = train_selected[['TotalSqFt', 'BedroomAbvGr', 'FullBath']]
y = train_selected['SalePrice']

# Splitting the data into training and testing sets: 20% of the rows are held
# out for evaluation, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Training
# Fit a linear regression model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Making Predictions on the held-out split
predictions = model.predict(X_test)

# Model Evaluation
# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as the target values.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print(f"\nRoot Mean Squared Error (RMSE): {rmse}")

# Coefficients
# Take each label from the column the coefficient was fitted on, rather than
# hard-coding names and indices, so the printout cannot drift out of sync with
# the feature list if columns are added, removed, or reordered.
print("\nModel Coefficients:\n")
for feature_name, coefficient in zip(X_train.columns, model.coef_):
    print(f"{feature_name} Coefficient: {coefficient}")

## Making Submission
# Prepare the test data
test_features = test_df[['GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'FullBath']].copy()
# Fill missing values with each column's mean BEFORE deriving TotalSqFt.
# Imputing afterwards would turn a row with one missing area component into a
# NaN total and then replace that whole total with the dataset-wide mean,
# discarding the row's other known areas.
test_features = test_features.fillna(test_features.mean())
test_features['TotalSqFt'] = (
    test_features['GrLivArea']
    + test_features['TotalBsmtSF']
    + test_features['1stFlrSF']
    + test_features['2ndFlrSF']
)

# Making predictions for test dataset (same column order as used for fitting)
test_predictions = model.predict(test_features[['TotalSqFt', 'BedroomAbvGr', 'FullBath']])

# Prepare submission file: one row per test Id with its predicted SalePrice.
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': test_predictions
})
submission.to_csv('../data/submission.csv', index=False)
print("\nSubmission file saved as 'submission.csv'.")


Training Data Sample:

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice 