In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction


Introduction to House Prices - Advanced Regression Techniques
The real estate market is a complex and dynamic sector, where accurately predicting house prices is both a crucial and challenging task. Understanding the factors that influence house prices and developing robust predictive models can provide significant value to various stakeholders, including homeowners, real estate agents, investors, and policymakers. The "House Prices - Advanced Regression Techniques" project delves into this intricate domain by employing sophisticated statistical and machine learning methods to forecast the sale prices of houses.


This project is part of a competition hosted by Kaggle, aimed at providing a hands-on opportunity to apply advanced regression techniques to a real-world dataset. Participants are tasked with building models that can predict the final price of each home in Ames, Iowa, based on its features. The dataset includes a rich array of variables, such as the size of the house, the number of bedrooms, the quality of construction, the neighborhood, and many more.

By applying advanced regression techniques, this project aims to build accurate and interpretable models for predicting house prices. The insights gained not only improve the models' predictive performance but also deepen our understanding of housing-market dynamics.

#  Import Libraries

In [None]:
# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing and data transformation
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')


## Loading Dataset

In [None]:
# Load the competition's training CSV into `df`.
df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

# EDA

In [None]:
# Preview the first rows of the training frame.
df.head()

## Checking Duplicates

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## Splitting the Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Target column and feature matrix (every column except the target).
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each piece of the split.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Load the competition test CSV and show its (rows, columns) shape.
# The submission cell later in the notebook reads this file again.
test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test_df.shape

# Column Info

In [None]:
# Print the column-description text file shipped with the competition data.
description_path = '/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt'
with open(description_path) as description_file:
    print(description_file.read())

# Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wide, short, high-DPI canvas for a single horizontal boxplot.
fig, ax = plt.subplots(figsize=(15, 3), dpi=150)

# Distribution of the target; points beyond the whiskers are the outliers.
sns.boxplot(x=df['SalePrice'], ax=ax)

# Label the x-axis with the plotted column name.
ax.set_xlabel('SalePrice')

# Render the figure.
plt.show()


In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# Number of columns per dtype.
X_train.dtypes.value_counts()

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Missing-value count for every column of the training features.
X_train.isnull().sum()

In [None]:
# Per-column count of missing entries, restricted to columns that have any.
null_counts = X_train.isnull().sum()
missing_values = null_counts[null_counts > 0]

# Express the counts as a percentage of the training rows.
missing_percentage = missing_values / len(X_train) * 100

print(missing_percentage)

In [None]:
# Columns missing in more than 30% of the training rows are discarded.
columns_to_drop = missing_percentage[missing_percentage > 30].index

# Reassign rather than dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')

# Show a short preview instead of printing the entire frame.
X_train.head()

In [None]:
# Columns with missing values at or below the 30% threshold are kept and imputed.
# `<=` complements the `> 30` drop rule, so a column sitting exactly on the
# threshold is not left out of both lists.
columns_to_fill = missing_percentage[missing_percentage <= 30].index
columns_to_fill

In [None]:
# Impute the remaining gaps: column mean for numeric features, most frequent
# value for everything else.
for column in columns_to_fill:
    values = X_train[column]
    if pd.api.types.is_numeric_dtype(values):
        fill_value = values.mean()
    else:
        fill_value = values.mode()[0]
    # Assign the filled column back to the frame. Calling
    # `X_train[column].fillna(..., inplace=True)` works on an intermediate
    # object and does not update `X_train` under pandas copy-on-write.
    X_train[column] = values.fillna(fill_value)
X_train.head()

In [None]:
# Total number of missing values left in the training features.
X_train.isnull().sum().sum()

In [None]:
# Number of distinct `Id` values in the training features.
X_train.Id.nunique()

In [None]:
# Exclude `Id` from the feature set. Reassigning with errors='ignore' keeps the
# cell safe to re-run after the column is already gone.
X_train = X_train.drop(columns=['Id'], errors='ignore')

In [None]:
# Names of the numeric columns left in X_train.
num_data = X_train.select_dtypes(include=np.number).columns.tolist()

In [None]:
# Names of the non-numeric columns left in X_train.
cat_data = X_train.select_dtypes(exclude=np.number).columns.tolist()

In [None]:
# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
corr = X_train[num_data].corr()
fig, ax = plt.subplots(1, 1, figsize=(25, 25))
sns.heatmap(data=corr, cmap='Greens', annot=corr, cbar=None, ax=ax)
ax.set_title('Correlation between features')
plt.show()

Note: For each pair of features whose correlation is greater than 0.6, keep one feature and remove the other.

In [None]:
# Numeric features left out of the model's numeric inputs.
redundant_features = [
    'GarageArea',
    '1stFlrSF',
    'GrLivArea',
    'BsmtFullBath',
    'FullBath',
    'HalfBath',
    'TotRmsAbvGrd',
    'GarageYrBlt',
]

# Filter instead of calling list.remove() once per feature in separate cells:
# the filter can be re-run safely, whereas remove() raises ValueError the
# second time because the name is no longer in the list.
num_data = [feature for feature in num_data if feature not in redundant_features]

In [None]:
# Correlation heatmap again, now limited to the numeric features still listed in num_data.
corr_new = X_train[num_data].corr()
fig, ax = plt.subplots(1, 1, figsize=(25, 25))
sns.heatmap(data=corr_new, cmap='Greens', annot=corr_new, cbar=None, ax=ax)
ax.set_title('Correlation between features')
plt.show()

## Check outliers and distributions

In [None]:
# One stacked boxplot per numeric feature, each on its own row of the figure.
fig, ax = plt.subplots(nrows=len(num_data), ncols=1, figsize=(6, 50))
for row, feature in enumerate(num_data):
    X_train[[feature]].boxplot(ax=ax[row])

# Target Exploration

In [None]:
# Frequency of each distinct sale price in the training target.
y_train.value_counts()

In [None]:
# Re-derive the full target and feature frames from `df`
# (the same definitions used before the train/test split).
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Regression Models

## Creating a pipeline

In [None]:
# Numeric columns: mean imputation followed by min-max scaling.
num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(),
)

# Categorical columns: most-frequent imputation followed by one-hot encoding
# that ignores categories not seen during fitting.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each column list to its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, num_data),
    ('cat_pipe', cat_pipe, cat_data),
])
preprocessor

In [None]:
def prediction(model):
    """Fit `model` behind the shared preprocessor and summarise its hold-out error.

    The notebook-level `preprocessor`, `X_train`, `y_train`, `X_test` and
    `y_test` are read from the enclosing namespace rather than passed in.

    Args:
        model: Regressor instance used as the final step of the pipeline.

    Returns:
        str: One-line report containing RMSE, MAE and R2 computed on
        `X_test` / `y_test`.
    """
    steps = [('preprocessor', preprocessor), ('regressor', model)]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return f"{model}'s RMSE is {rmse}, MAE is {mae}, R2 is {r2}"

### LinearRegression model

In [None]:
# Score a default-configured LinearRegression with the shared helper.
prediction(LinearRegression())

### DecisionTree model

In [None]:
# Score a DecisionTreeRegressor (seeded) with the shared helper.
prediction(DecisionTreeRegressor(random_state=42))

### RandomForest model

In [None]:
# Score a 100-tree RandomForestRegressor (seeded, n_jobs=-1) with the shared helper.
prediction(RandomForestRegressor(n_estimators=100, random_state=42,n_jobs=-1))

### GradientBoosting

In [None]:
# Score a GradientBoostingRegressor (seeded, otherwise default settings) with the shared helper.
prediction(GradientBoostingRegressor(random_state=42))

### RidgeCV

In [None]:
# Score a default-configured RidgeCV with the shared helper.
prediction(RidgeCV())

### ElasticNetCV

In [None]:
# Score a default-configured ElasticNetCV with the shared helper.
prediction(ElasticNetCV())

### LassoCV

In [None]:
# Score a default-configured LassoCV with the shared helper.
prediction(LassoCV())

### GradientBoosting with custom hyperparameters

In [None]:
# Gradient boosting with hand-set hyperparameters; this instance is reused
# by the submission cell.
best_model = GradientBoostingRegressor(
    random_state=42,
    max_depth=4,
    learning_rate=0.1,
    n_estimators=300,
)
prediction(best_model)

# Submission

In [None]:
# Reload the competition test set. `Id` is kept aside for the submission file
# and removed from the frame passed to the model.
test_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
test_ids = test_df['Id'].copy()  # named to avoid shadowing the built-in `id`
test_df = test_df.drop('Id', axis=1)

# Fit preprocessing and model together here instead of relying on both having
# been fitted as a side effect of the most recent `prediction(...)` call; the
# cell then gives the same result regardless of which model cell ran last.
submission_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', best_model)
])
submission_pipeline.fit(X_train, y_train)
predictions = submission_pipeline.predict(test_df)

# Pair every prediction with the Id read from the test file and write the CSV.
predictions_df = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})
predictions_df.to_csv('/kaggle/working/predicted_prices_submission.csv', index=False)

print("Done，results saved into /kaggle/working/predicted_prices_submission.csv")

In [None]:
# Take the Ids from the frame built from the test file rather than a hard-coded
# 1461-2919 range, so each prediction stays paired with its own row even if the
# test file's Ids or row count differ.
submission = pd.DataFrame(data={'Id': predictions_df['Id'],
                                'SalePrice': predictions})
submission

In [None]:
# Preview the first rows of the submission frame.
predictions_df.head()