In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition data. The three CSV files live in the same Kaggle input
# folder, so the folder is named once and reused for each path.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# `head` is a method and has to be called; printing the attribute itself shows
# the bound-method repr instead of the first rows.
print(test_df.head())
print(test_df.columns)

# Keep the test Ids aside for the submission file, because `test_df` is
# re-encoded and re-indexed by later cells.
backup_id = test_df['Id']

In [None]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Step 1: Handle Missing Values
# Numeric columns are filled with their mean, every other column with its most
# frequent value. The filled column is assigned back to the frame instead of
# calling `fillna(..., inplace=True)` on `train_df[col]`: that form acts on an
# intermediate object, recent pandas versions warn about it, and with
# Copy-on-Write enabled it leaves the frame unchanged.
for col in train_df.columns:
    column = train_df[col]
    if not column.isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        most_frequent = column.mode()
        if most_frequent.empty:
            # The column holds nothing but missing values, so there is no mode to use.
            continue
        fill_value = most_frequent.iloc[0]
    train_df[col] = column.fillna(fill_value)

In [None]:
# Step 2: Create new features in train_df
# Columns that are combined into the engineered features below and removed afterwards.
area_parts = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GarageArea', 'WoodDeckSF',
              'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']
bath_parts = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
other_parts = ['YearBuilt', 'YearRemodAdd', 'OverallQual', 'OverallCond',
               'ExterQual', 'ExterCond']

# Add the area columns one after another, in list order.
total_area = train_df[area_parts[0]]
for area_part in area_parts[1:]:
    total_area = total_area + train_df[area_part]

# Half baths are weighted 0.5; ages are measured relative to the year of sale.
# NOTE(review): ExterQual and ExterCond are combined with `+` just like the numeric
# quality scores; if they hold text labels the result is a concatenated label, not a
# number -- confirm this is intended.
train_df = train_df.assign(
    TotalArea=total_area,
    TotalBath=(train_df['FullBath'] + 0.5 * train_df['HalfBath']
               + train_df['BsmtFullBath'] + 0.5 * train_df['BsmtHalfBath']),
    HouseAge=train_df['YrSold'] - train_df['YearBuilt'],
    YearsSinceRemodel=train_df['YrSold'] - train_df['YearRemodAdd'],
    OverallScore=train_df['OverallQual'] + train_df['OverallCond'],
    ExterScore=train_df['ExterQual'] + train_df['ExterCond'],
)

# Drop columns used to create new features
columns_to_drop = area_parts + bath_parts + other_parts
train_df = train_df.drop(columns=columns_to_drop)

In [None]:
# Replace the categorical columns of the training frame with one-hot indicator columns.
train_df = pd.get_dummies(data=train_df)

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Step 1: Calculate mutual information
# NOTE(review): every column except SalePrice is scored here, including `Id`;
# confirm that a row identifier should be treated as a candidate feature.
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

# Mutual information of each feature with the target. The estimator is randomised,
# so the seed is fixed; otherwise the set of dropped features can differ between runs.
mi_scores = mutual_info_regression(X, y, random_state=42)

# Label the scores with their feature names and sort them from most to least informative.
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
print(mi_scores)

# Step 2: Identify low-information features
# Features scoring below this threshold are treated as carrying no useful signal.
threshold = 0.01
low_correlation_features = mi_scores[mi_scores < threshold].index.tolist()

# Step 3: Drop the low-information features
train_df = train_df.drop(columns=low_correlation_features)

# train_df now keeps only the features whose mutual information with SalePrice
# reaches the threshold (the score captures linear and non-linear dependence).

In [None]:
# Disabled alternative to the mutual-information filter: drop every column whose
# correlation with SalePrice lies strictly between -0.1 and 0.1. It is kept as `#`
# comments rather than a bare triple-quoted string, because a string literal is still
# an expression that the notebook evaluates and echoes as the cell's output.
#
# # Step 3: Calculate correlation and drop columns with low correlation
# correlation_matrix = train_df.corr()
# target_correlation = correlation_matrix['SalePrice'].sort_values(ascending=False)
# low_correlation = target_correlation[(target_correlation > -0.1) & (target_correlation < 0.1)]
# drop_columns = low_correlation.index.tolist()
#
# train_df.drop(columns=drop_columns, inplace=True)

In [None]:
# Step 4: Prepare the test data with the same preprocessing steps
# Numeric columns are filled with their mean, every other column with its most
# frequent value. The filled column is assigned back to the frame instead of
# calling `fillna(..., inplace=True)` on `test_df[col]`: that form acts on an
# intermediate object, recent pandas versions warn about it, and with
# Copy-on-Write enabled it leaves the frame unchanged.
for col in test_df.columns:
    column = test_df[col]
    if not column.isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        most_frequent = column.mode()
        if most_frequent.empty:
            # The column holds nothing but missing values, so there is no mode to use.
            continue
        fill_value = most_frequent.iloc[0]
    test_df[col] = column.fillna(fill_value)

# Create new features in test_df (same formulas as for the training data)
test_df['TotalArea'] = (test_df['TotalBsmtSF'] + test_df['1stFlrSF'] +
                        test_df['2ndFlrSF'] + test_df['GarageArea'] +
                        test_df['WoodDeckSF'] + test_df['OpenPorchSF'] +
                        test_df['EnclosedPorch'] + test_df['3SsnPorch'] +
                        test_df['ScreenPorch'] + test_df['PoolArea'])

# Half baths count as 0.5.
test_df['TotalBath'] = (test_df['FullBath'] + 0.5 * test_df['HalfBath'] +
                        test_df['BsmtFullBath'] + 0.5 * test_df['BsmtHalfBath'])

# Ages are measured relative to the year of sale.
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']
test_df['YearsSinceRemodel'] = test_df['YrSold'] - test_df['YearRemodAdd']

# NOTE(review): ExterQual and ExterCond are combined with `+` just like the numeric
# quality scores; if they hold text labels the result is a concatenated label, not a
# number -- confirm this is intended.
test_df['OverallScore'] = test_df['OverallQual'] + test_df['OverallCond']
test_df['ExterScore'] = test_df['ExterQual'] + test_df['ExterCond']

# Drop columns used to create new features
test_df = test_df.drop(columns=columns_to_drop)

# One-Hot Encoding for categorical variables in test_df
test_df = pd.get_dummies(test_df)

# Align the test data with the train data: columns absent from the test encoding are
# added as zeros and columns the training frame does not have are discarded. Because
# train_df still contains SalePrice, test_df receives a zero-filled SalePrice
# placeholder that must not be passed to the model as a feature.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [None]:
# Split the labelled data into a training part and a 20% hold-out part.
# One seed is shared by the split and by every estimator that accepts one, so the
# cross-validation ranking (and therefore the choice of "best" model) is reproducible.
SEED = 42

X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Initializing models with pipelines for those needing normalization
models = {
    'Lasso': make_pipeline(StandardScaler(), Lasso()),
    'Ridge': make_pipeline(StandardScaler(), Ridge(random_state=SEED)),
    'SVR': make_pipeline(StandardScaler(), SVR()),
    'GradientBoosting': GradientBoostingRegressor(random_state=SEED),
    'XGBoost': xgb.XGBRegressor(random_state=SEED),
    'LightGBM': lgb.LGBMRegressor(random_state=SEED),
    'RandomForest': RandomForestRegressor(random_state=SEED)
}

# The scorer returns negated MSE (greater_is_better=False), so the sign is flipped
# back before taking the square root.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Mean 5-fold cross-validated RMSE per model name.
results = {}
for name, model in models.items():
    mse_scores = cross_val_score(model, X_train, y_train, scoring=mse_scorer, cv=5)
    rmse_scores = np.sqrt(-mse_scores)
    results[name] = rmse_scores.mean()
    print(f'{name} - CV RMSE: {rmse_scores.mean():.4f} (+/- {rmse_scores.std():.4f})')
print(results)

In [None]:
from sklearn.model_selection import GridSearchCV

# Select the model with the lowest mean cross-validated RMSE.
best_model_name, _best_cv_rmse = min(results.items(), key=lambda entry: entry[1])
best_model = models[best_model_name]

# Define hyperparameter grids, keyed by the same names as the `models` dictionary.
# Lasso, Ridge and SVR are wrapped in pipelines built with `make_pipeline`, which names
# each step after its lowercased class. Parameters of a pipeline step have to be
# addressed as '<step>__<param>'; bare names such as 'alpha' are rejected by the
# pipeline, so those three grids carry the step prefix. The tree-based models are
# plain estimators and keep un-prefixed names.
param_grids = {
    'Lasso': {
        'lasso__alpha': [0.01, 0.1, 1],
        'lasso__max_iter': [5000],
        'lasso__tol': [0.001]
    },
    'Ridge': {
        'ridge__alpha': [0.1, 1, 10],
        'ridge__solver': ['auto', 'sag']
    },
    'SVR': {
        'svr__C': [1, 10],
        'svr__kernel': ['rbf'],
        'svr__gamma': ['scale'],
        'svr__epsilon': [0.01, 0.1]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'subsample': [0.9]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'subsample': [0.9],
        'colsample_bytree': [0.9]
    },
    'LightGBM': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50],
        'subsample': [0.9],
        'colsample_bytree': [0.9]
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }
}

In [None]:
# Perform GridSearchCV to find the best hyperparameters for the best model
param_grid = param_grids[best_model_name]
if hasattr(best_model, 'steps'):
    # The model is a pipeline: parameters of its final estimator must be addressed as
    # '<step>__<param>'. Any grid key that is not already qualified gets the name of
    # the last step as prefix, so both prefixed and bare grids are accepted.
    final_step_name = best_model.steps[-1][0]
    param_grid = {
        (key if '__' in key else f'{final_step_name}__{key}'): values
        for key, values in param_grid.items()
    }

grid_search = GridSearchCV(best_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f'Best parameters for {best_model_name}: {best_params}')

# Train the best model with the best parameters on the training split
# (X_train / y_train); the hold-out split is not used for fitting.
best_model.set_params(**best_params)
best_model.fit(X_train, y_train)

# Predict for the hold-out split and report the error, so the tuned model is checked
# on rows it was not fitted on.
y_test_pred = best_model.predict(X_test)
holdout_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'{best_model_name} - hold-out RMSE: {holdout_rmse:.4f}')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Refit the tuned model on every labelled row (X, y) before predicting the competition
# test set: the hold-out split has served its purpose, and leaving it out would
# discard a fifth of the training data for the final model.
best_model.fit(X, y)

# Feature columns the model was fitted on, in fitting order.
feature_columns = list(X.columns)

X_test_columns = set(X_test.columns)

# Columns in test_df
test_df_columns = set(test_df.columns)

# Columns of test_df that are not model features (diagnostic output)
extra_columns_in_test_df = test_df_columns - X_test_columns
print(extra_columns_in_test_df)

# Fail with a clear message if the test frame lacks a feature the model expects.
missing_columns = [column for column in feature_columns if column not in test_df_columns]
if missing_columns:
    raise ValueError(f'test_df is missing feature columns: {missing_columns}')

# Select exactly the fitted features, in the fitted order; this also leaves out any
# non-feature column such as a SalePrice placeholder.
test_predictions = best_model.predict(test_df[feature_columns])

# Preparing the submission file
submission = pd.DataFrame({
    'Id': backup_id,
    'SalePrice': test_predictions
})
submission.to_csv('submission.csv', index=False)

# Read the file back so the cell displays what was actually written.
pd.read_csv("submission.csv")