In [None]:
# import all the necessary libraries like pandas, matplotlib, seaborn, sklearn, plotly
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import os

# sklearn imports
from sklearn import metrics
from sklearn import pipeline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import neural_network
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeavePOut
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

S

In [None]:
# Global plotting configuration used by every figure in the notebook.
sns.set(style="whitegrid")
sns.set_context("paper")
sns.set_palette("muted")
# Set the default figure size through rcParams; calling plt.figure() here
# would only open (and render) an empty figure without changing the default.
plt.rcParams["figure.figsize"] = (10, 6)

In [None]:
# Location of the competition files; both CSV paths are derived from it.
input_folder = "house-prices-advanced-regression-techniques/"

train_data_path, test_data_path = (
    os.path.join(input_folder, file_name) for file_name in ("train.csv", "test.csv")
)

*Loading the Data*

In [None]:
# Load the train and test CSV files from the paths built in the previous cell.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

- Dropping the Id feature from the train and test sets.

In [None]:
# Remove the Id column from both frames in place, keeping the removed columns
# so the test ids can be written to the submission file later.
train_data_id = train_data.pop('Id')
test_data_id = test_data.pop('Id')

- Count the number of features


In [None]:
# Number of columns left in train_data after dropping Id (every remaining column is counted).
print(f"Number of features: {train_data.shape[1]}")

# Part 1: <br> *Analyzing the data - EDA*

- *Get the data types of the columns in the training dataset*

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
train_data.info()

Most of the columns are of dtype `object`.

## *Data Cleaning*

In [None]:

def show_missing_values_stat(data):
    """Print a missing-value report for ``data``.

    The report shows the total row count, the 20 columns with the most
    missing values (count and percentage of rows) and the total number of
    missing cells.

    Parameters:
    data (pd.DataFrame): Frame to inspect.

    Returns:
    None. The report is written to stdout.
    """
    total_rows = len(data)
    print("Missing values in the dataset:")
    print("-----------------------------------------")
    print("Total Rows: ", total_rows)
    print("_________________________________________")
    # Count the missing values in each column of the given frame
    missing_values = data.isnull().sum()
    # The percentage is relative to the frame being inspected, not to the
    # global train_data, so the function is also correct for the test set.
    missing_percentage = (missing_values / total_rows) * 100
    missing_data = pd.concat(
        [missing_values, missing_percentage], axis=1, keys=['Missing Values', 'Percentage']
    ).sort_values(by='Missing Values', ascending=False)
    print(missing_data.head(20))

    print("\n\nTotal missing values: ", missing_data['Missing Values'].sum())
    print("-----------------------------------------")

    
    
# Report the missing values of the raw training frame.
show_missing_values_stat(train_data)

### we can see that there are missing values in all the features above :
* LotFrontage - 259 
* Alley - 1369 
* MasVnrType - 872 <br>.<br>.<br>.
* MiscFeature - 1406

Total of *19* features with missing values
- 3 of float64
- 16 of object

 - We can see that a lot of the data is missing, hence it will be very hard to fill in the missing values, and doing so might give us false information.

In [None]:
def drop_highly_missing_features(data, fetures_to_drop):
    """Return a copy of ``data`` without the columns listed in ``fetures_to_drop``.

    The input frame is left untouched.
    """
    return data.drop(columns=fetures_to_drop)


def find_features_with_missing_values_threshold(data, threshold):
    """Find the columns of ``data`` whose share of missing values exceeds ``threshold``.

    Parameters:
    data (pd.DataFrame): Frame to inspect.
    threshold (float): Percentage (0-100); columns strictly above it are returned.

    Returns:
    pd.Index: Column names, ordered from most to least missing values.
    """
    missing_values = data.isnull().sum()
    # Use the length of the frame that was passed in; relying on the global
    # train_data gives wrong percentages for any other frame.
    missing_percentage = (missing_values / len(data)) * 100
    missing_data = pd.concat(
        [missing_values, missing_percentage], axis=1, keys=['Missing Values', 'Percentage']
    ).sort_values(by='Missing Values', ascending=False)
    features_to_drop = missing_data[missing_data['Percentage'] > threshold].index
    return features_to_drop

In [None]:
# Percentage of missing rows above which a column is discarded.
threshold = 80
# The columns are chosen from the training frame only and then removed from
# both frames so train and test keep the same columns.
# For 80 this returns ["Alley", "PoolQC", "Fence", "MiscFeature"].
drop_features = find_features_with_missing_values_threshold(train_data, threshold)

train_data, test_data = (
    drop_highly_missing_features(frame, drop_features)
    for frame in (train_data, test_data)
)

print("Remove this features: ", drop_features)

- ### Check the impact of dropping features that have less than 20% data

In [None]:
# Re-run the report to see which missing values remain after the drop.
show_missing_values_stat(train_data)

## *separate the numerical and categorical columns* ##

In [None]:
# Split each frame into its numeric columns and its object (string) columns.
train_data_num, train_data_cat = (
    train_data.select_dtypes(include=[kind]) for kind in (np.number, object)
)

test_data_num, test_data_cat = (
    test_data.select_dtypes(include=[kind]) for kind in (np.number, object)
)

# Part 2: <br>*Handling Missing Data*

### *use a heat map on the numerical data to see the correlation between the features*

In [None]:
# Master switch for the plotting cells below: they only draw when this is True.
show_graphs = False

# TODO: set show_graphs to True to render the graphs.

In [None]:
# Optional automated EDA report. It is disabled by default; the import lives
# inside the guard so the notebook still runs when sweetviz is not installed.
run_sweetviz_report = False
if run_sweetviz_report:
    import sweetviz as sw

    train_report = sw.analyze(train_data)
    train_report.show_notebook(layout='vertical')

In [None]:
def show_corr_mat(df):
    """Draw an annotated 20x20-inch heatmap of the pairwise correlations of ``df``."""
    correlations = df.corr()
    _, ax = plt.subplots(figsize=(20, 20))
    sns.heatmap(correlations, annot=True, fmt=".2f", ax=ax)
    plt.show()


# Only drawn when plotting is enabled.
if show_graphs:
    show_corr_mat(train_data_num)

### Low correlation features: 
 <br>LowQualFinSF----0.03
 <br>MiscVal-----------0.02
 <br>BsmtFinSF2-------0.01

In [None]:
def drop_from_data_set(df, cols: list) -> None:
    """Remove the columns ``cols`` from ``df`` in place (nothing is returned)."""
    df.drop(columns=cols, inplace=True)

In [None]:
# Numeric columns with near-zero correlation to SalePrice. The list is defined
# once (without the repeated "MiscVal" entry) so train and test cannot drift apart.
low_corr_features = ["LowQualFinSF", "MiscVal", "BsmtFinSF2"]
drop_from_data_set(test_data_num, low_corr_features)
drop_from_data_set(train_data_num, low_corr_features)
print(test_data_num.shape)
print(train_data_num.shape)

- The difference between the number of features is because test_data doesn't have the SalePrice feature in it.

In [None]:
# Redraw the correlation heatmap for the reduced numeric frame (only when plotting is enabled).
if show_graphs:
    show_corr_mat(train_data_num)

- *Handling Missing Values for Numerical Features*

In [None]:
# Fill missing numerical values with the column mean
def handle_missing_values_numerical(data):
    """Replace missing values in every numeric column with that column's mean.

    ``data`` is modified in place and also returned. Non-numeric columns are
    left untouched.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for name in numeric_columns:
        column_mean = data[name].mean()
        data[name] = data[name].fillna(column_mean)
    return data

- *Handling Missing Values for Categorical Features*

In [None]:
# Fill missing categorical values with most frequent value
def handle_missing_values_categorical(data):
    """Replace missing values in every object column with that column's mode.

    ``data`` is modified in place and also returned. When several values tie
    for most frequent, the first one returned by ``Series.mode`` is used.
    A column with no non-missing value has no mode and is left unchanged
    instead of raising.
    """
    for column in data.select_dtypes(include=[object]).columns:
        modes = data[column].mode()
        if modes.empty:
            # Every value is missing: there is nothing to fill with.
            continue
        data[column] = data[column].fillna(modes.iloc[0])
    return data

In [None]:
# Impute each part separately. Every frame is filled with its own statistics:
# the test frames use the test means/modes, not the training ones.
train_data_num = handle_missing_values_numerical(train_data_num)
train_data_cat = handle_missing_values_categorical(train_data_cat)

test_data_num = handle_missing_values_numerical(test_data_num)
test_data_cat = handle_missing_values_categorical(test_data_cat)

In [None]:
# Sanity check: count the missing cells left in the numeric and categorical parts.
print("\nMissing values in the training dataset after filling:")
print(train_data_num.isnull().sum().sum() + train_data_cat.isnull().sum().sum())


print("\nMissing values in the test dataset after filling:")
print(test_data_num.isnull().sum().sum() + test_data_cat.isnull().sum().sum())

In [None]:
# Rebuild each frame from its imputed categorical and numeric parts (categorical
# columns first). This overwrites train_data/test_data, so columns that were
# dropped from the *_num frames are no longer present afterwards.
train_data = pd.concat([train_data_cat, train_data_num], axis=1)
test_data = pd.concat([test_data_cat, test_data_num], axis=1)
print(train_data.shape)
print(test_data.shape)

# Part 3: <br> *Data Visualizing*

*Distribution of SalePrice*

In [None]:

# Seaborn histogram (with KDE curve) of the target; drawn only when plotting is enabled.
if show_graphs:
    _, ax = plt.subplots(figsize=(11, 7))
    sns.histplot(train_data['SalePrice'], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title('Distribution of SalePrice')
    ax.set_xlabel('SalePrice')
    ax.set_ylabel('Count')
    plt.show()

*Histogram for SalePrice*

In [None]:
# Interactive plotly version of the SalePrice histogram (only when plotting is enabled).
if show_graphs:
    fig = px.histogram(train_data, x='SalePrice', title='Distribution of SalePrice')
    fig.show()

In [None]:
from scipy import stats

# Probability plot of SalePrice against a normal distribution, to judge how far
# the target departs from normality (only when plotting is enabled).
if show_graphs:
    plt.figure(figsize=(10, 6))
    stats.probplot(train_data['SalePrice'], dist="norm", plot=plt)
    plt.title('Normal Probability Plot of SalePrice')
    plt.show()

In [None]:
# One 50-bin histogram per numeric training column (only when plotting is enabled).
if show_graphs:
    train_data_num.hist(bins=50, figsize=(22, 25))
    plt.show()

In [None]:
import plotly.graph_objects as go

def show_top_correlated_features(correlation_matrix, n):
    """Render ``correlation_matrix`` as an annotated plotly heatmap.

    Parameters:
    correlation_matrix (pd.DataFrame): Square matrix to plot; its columns are
        used as the labels of both axes, so it must already be filtered.
    n (int): Only used in the figure title.
    """
    labels = correlation_matrix.columns
    cell_values = correlation_matrix.values
    heatmap = go.Heatmap(
        z=cell_values,
        x=labels,
        y=labels,
        colorscale='Viridis',
        text=cell_values.round(2),  # two decimals are enough for the cell labels
        texttemplate="%{text}",
        showscale=True,
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(title=f"Top {n} Correlated Features", width=1000, height=800)
    fig.show()

In [None]:
# Heatmap of the N columns most correlated (in absolute value) with SalePrice,
# computed on the numeric columns plus the one-hot encoded categorical columns.
if show_graphs:
    N = 20

    # Column names by kind
    numerical_columns = train_data.select_dtypes(include=["int64", 'float64']).columns
    categorical_columns = train_data.select_dtypes(include=["object"]).columns

    # One-hot encode the categorical part and put it next to the numeric part
    categorical_columns_encoded = pd.get_dummies(train_data[categorical_columns])
    train_data_encoded = pd.concat(
        [train_data[numerical_columns], categorical_columns_encoded], axis=1
    )

    # Absolute correlations, ranked against the target
    correlation_matrix = train_data_encoded.corr().abs()
    ranked_by_target = correlation_matrix['SalePrice'].sort_values(ascending=False)
    top_correlated_features = ranked_by_target.head(N).index.tolist()

    # Keep only the rows/columns of the top N features and plot them
    filtered_correlation_matrix = correlation_matrix.loc[
        top_correlated_features, top_correlated_features
    ]
    show_top_correlated_features(filtered_correlation_matrix, N)

# Part 4:<br>  *Feature Engineering*

Total Square Footage

- We create a new feature TotalSF by summing up the total basement square footage, first floor square footage, second floor square footage, and garage area. This feature represents the total square footage of the house.

In [None]:
# Names of the engineered columns, filled in by the create_* functions below.
features_engineering_list = []


def create_TotalSF_feature(data, features_engineering_list=None):
    """Add the column 'TotalSqureF': basement + 1st floor + 2nd floor + garage area.

    ``data`` is modified in place and returned. When a list is passed as
    ``features_engineering_list`` the new column name is appended to it.
    """
    area_columns = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GarageArea']
    total_area = data[area_columns[0]]
    for name in area_columns[1:]:
        total_area = total_area + data[name]
    data['TotalSqureF'] = total_area

    if features_engineering_list is None:
        return data
    features_engineering_list.append('TotalSqureF')
    return data

# Add the total-area column to both frames. Only the train call passes
# features_engineering_list, so the new column name is recorded once.
train_data = create_TotalSF_feature(train_data, features_engineering_list)

test_data = create_TotalSF_feature(test_data)

*Age of the House*

- We calculate the age of the house at the time of sale by subtracting the year the house was built from the year it was sold.

In [None]:
# AgeOfHouse feature
def create_age_of_house_feature(data, features_engineering_list=None):
    """Add the column 'AgeOfHouse' = YrSold - YearBuilt.

    ``data`` is modified in place and returned. When a list is passed as
    ``features_engineering_list`` the new column name is appended to it.
    """
    data['AgeOfHouse'] = data['YrSold'].sub(data['YearBuilt'])

    if features_engineering_list is None:
        return data
    features_engineering_list.append('AgeOfHouse')
    return data


# Add the house-age column to both frames. Only the train call passes
# features_engineering_list, so the new column name is recorded once.
train_data = create_age_of_house_feature(train_data, features_engineering_list)

test_data = create_age_of_house_feature(test_data)

*Total Bathrooms*

- We create a new feature TotalBath by summing up the number of full and half bathrooms in the basement and above grade, with half bathrooms counted as 0.5.

In [None]:
# TotalBath feature
def create_TotalBath_feature(data, features_engineering_list=None):
    """Add the column 'TotalBath': full baths count 1 and half baths 0.5,
    above grade and in the basement.

    ``data`` is modified in place and returned. When a list is passed as
    ``features_engineering_list`` the new column name is appended to it.
    """
    above_grade = data['FullBath'] + 0.5 * data['HalfBath']
    with_basement_full = above_grade + data['BsmtFullBath']
    data['TotalBath'] = with_basement_full + 0.5 * data['BsmtHalfBath']

    if features_engineering_list is None:
        return data
    features_engineering_list.append('TotalBath')
    return data

# Add the bathroom-count column to both frames. Only the train call passes
# features_engineering_list, so the new column name is recorded once.
train_data = create_TotalBath_feature(train_data, features_engineering_list)

test_data = create_TotalBath_feature(test_data)

*Age of the Renovation*

- We calculate the age of the house since its most recent renovation by subtracting the year of the most recent renovation from the year it was sold.

In [None]:
# AgeOfRenovation feature
def create_age_of_renovation_feature(data, features_engineering_list=None):
    """Add the column 'AgeOfRenovation' = YrSold - YearRemodAdd.

    ``data`` is modified in place and returned. When a list is passed as
    ``features_engineering_list`` the new column name is appended to it.
    """
    data['AgeOfRenovation'] = data['YrSold'].sub(data['YearRemodAdd'])

    if features_engineering_list is None:
        return data
    features_engineering_list.append('AgeOfRenovation')
    return data

# Add the renovation-age column to both frames. Only the train call passes
# features_engineering_list, so the new column name is recorded once.
train_data = create_age_of_renovation_feature(train_data, features_engineering_list)

test_data = create_age_of_renovation_feature(test_data)

_Total Porch Area_

- We create a new feature TotalPorchSF by summing up the area of all porch-related features, representing the total porch area of the house.

In [None]:
# TotalPorchSF feature
def create_TotalPorchSF_feature(data, features_engineering_list=None):
    """Add the column 'TotalPorchSF': the sum of the four porch area columns.

    ``data`` is modified in place and returned. When a list is passed as
    ``features_engineering_list`` the new column name is appended to it.
    """
    porch_columns = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    porch_area = data[porch_columns[0]]
    for name in porch_columns[1:]:
        porch_area = porch_area + data[name]
    data['TotalPorchSF'] = porch_area

    if features_engineering_list is None:
        return data
    features_engineering_list.append('TotalPorchSF')
    return data

# Add the total-porch-area column to both frames. Only the train call passes
# features_engineering_list, so the new column name is recorded once.
train_data = create_TotalPorchSF_feature(train_data, features_engineering_list)

test_data = create_TotalPorchSF_feature(test_data)

_Display the New Features_

In [None]:
# Show the first rows of the engineered columns recorded in features_engineering_list.
print(train_data[features_engineering_list].head())

In [None]:
# Same top-N correlation heatmap as before, recomputed now that the engineered
# columns are part of train_data.
if show_graphs:
    N = 20

    # Column names by kind
    numerical_columns = train_data.select_dtypes(include=["int64", 'float64']).columns
    categorical_columns = train_data.select_dtypes(include=["object"]).columns

    # One-hot encode the categorical part and put it next to the numeric part
    categorical_columns_encoded = pd.get_dummies(train_data[categorical_columns])
    train_data_encoded = pd.concat(
        [train_data[numerical_columns], categorical_columns_encoded], axis=1
    )

    # Absolute correlations, ranked against the target
    correlation_matrix = train_data_encoded.corr().abs()
    ranked_by_target = correlation_matrix['SalePrice'].sort_values(ascending=False)
    top_correlated_features = ranked_by_target.head(N).index.tolist()

    # Keep only the rows/columns of the top N features and plot them
    filtered_correlation_matrix = correlation_matrix.loc[
        top_correlated_features, top_correlated_features
    ]
    show_top_correlated_features(filtered_correlation_matrix, N)
    
    

- One-hot encode the train and the test sets, then align them to the columns they share.
### *Make the One-Hot-Encoding on the data*

In [None]:
# Keep the target aside: the inner alignment below removes it from train_data
# because the test frame has no SalePrice column.
sale_price = train_data['SalePrice']

# One-hot encode each frame, then keep only the columns present in both
train_data, test_data = pd.get_dummies(train_data).align(
    pd.get_dummies(test_data), join='inner', axis=1
)

# Restore the target on the training frame
train_data['SalePrice'] = sale_price

for frame in (train_data, test_data):
    print(frame.shape)

# Part 5:<br> *Regularization*

In [None]:
# # choose the best 3 features of this dataset with SGDRegressor
# from sklearn.compose import ColumnTransformer
# from sklearn.feature_selection import RFE


# def feature_selec(X, y, n):
#     numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
#     categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
#     all_cols = categorical_cols.tolist() + numerical_cols.tolist()
#     ct_enc_std = ColumnTransformer([
#                 ("encoding", OrdinalEncoder(), categorical_cols),
#                 ("standard", StandardScaler(), numerical_cols)])
#     X_encoded = pd.DataFrame(ct_enc_std.fit_transform(X, y), columns=all_cols)

#     selector = RFE(SGDRegressor(random_state=1), n_features_to_select=n).\
#     fit(X_encoded, y)

#     X_encoded.loc[:, selector.support_]

#     # print the fetures selection list
#     features = X_encoded.loc[:, selector.support_].columns.tolist()
#     print("features: ", features)

#     # keep only the feature selection list
#     X = X[features]
#     return X , features

## Cross Validation


### *K-Fold*

In [None]:
def splitDataToKFold(X, t, k):
    """Split ``X`` and ``t`` into ``k`` shuffled train/validation folds.

    Parameters:
    X (pd.DataFrame): Feature matrix.
    t (pd.Series): Target vector with the same row order as ``X``.
    k (int): Number of folds.

    Returns:
    list[dict]: One dict per fold with the keys "X_train", "t_train",
    "X_val" and "t_val".
    """
    cv = KFold(n_splits=k, shuffle=True, random_state=1)
    result = []

    for train_ids, val_ids in cv.split(X):
        # KFold.split yields positional row numbers, so they must be used with
        # .iloc; .loc would treat them as index labels and pick the wrong rows
        # (or raise) whenever the index is not the default 0..n-1 range.
        result.append({"X_train": X.iloc[train_ids],
                       "t_train": t.iloc[train_ids],
                       "X_val": X.iloc[val_ids],
                       "t_val": t.iloc[val_ids],
                       })
    return result

### *LPOCV: Leave-P-Out Cross-Validation (LOOCV when p = 1)*

In [None]:
from sklearn.model_selection import LeavePOut


def splitDataToLPOCV(X, t, p):
    """Split ``X`` and ``t`` with leave-p-out cross-validation.

    Returns a list with one dict per split, holding the keys "X_train",
    "t_train", "X_val" and "t_val". Rows are selected by position.
    """
    splitter = LeavePOut(p)
    return [
        {
            "X_train": X.iloc[train_rows],
            "t_train": t.iloc[train_rows],
            "X_val": X.iloc[val_rows],
            "t_val": t.iloc[val_rows],
        }
        for train_rows, val_rows in splitter.split(X)
    ]

- *Marge-Cv*

In [None]:
def margeCV(cv):
    """Concatenate the per-fold frames of ``cv`` into one dict.

    ``cv`` is an iterable of dicts with the keys "X_train", "t_train", "X_val"
    and "t_val"; the result has the same four keys, each holding the
    row-wise concatenation of that entry over all folds (in fold order).

    NOTE(review): when the folds come from splitDataToKFold, every row ends up
    once in "X_val" and k-1 times in "X_train", so the merged training and
    validation sets contain the same rows.
    """
    folds = list(cv)
    part_names = ("X_train", "t_train", "X_val", "t_val")
    return {name: pd.concat([fold[name] for fold in folds]) for name in part_names}

## *Feature Selection*

In [None]:
from sklearn.feature_selection import RFE, SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import SGDRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import pandas as pd

def feature_selection(X, y, method='rfe', model=SGDRegressor(random_state=1), n_features=3):
    """
    Select the best features using different feature selection methods.

    The features are first ordinal-encoded (object/bool columns) and
    standardized (int64/float64 columns); columns of any other dtype are
    not passed to the selector. The returned frame holds these transformed
    values, not the original ones.

    Parameters:
    X (pd.DataFrame): Feature dataset
    y (pd.Series): Target vector
    method (str): Feature selection method ('rfe', 'forward', 'backward', 'hybrid')
    model: Machine learning model for feature selection
    n_features (int): Number of features to select ('hybrid' treats it as an
        upper bound, since it is passed as ``max_features``)

    Returns:
    pd.DataFrame: Encoded/scaled dataset restricted to the selected features,
    indexed like ``X``

    Raises:
    ValueError: If ``method`` is not one of the supported names.
    """
    # Separate numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
    # Same order as the transformer output below: encoded columns first
    all_cols = categorical_cols.tolist() + numerical_cols.tolist()

    # Create a column transformer for encoding and scaling
    ct_enc_std = ColumnTransformer([
        ("encoding", OrdinalEncoder(), categorical_cols),
        ("standard", StandardScaler(), numerical_cols)
    ])

    # Encode and standardize the features. The original index is kept so the
    # result stays aligned with X and y when X does not use the default
    # 0..n-1 index (e.g. a cross-validation subset).
    X_encoded = pd.DataFrame(ct_enc_std.fit_transform(X, y), columns=all_cols, index=X.index)

    # Initialize the selector based on the chosen method
    if method == 'rfe':
        selector = RFE(model, n_features_to_select=n_features)
    elif method == 'forward':
        selector = SequentialFeatureSelector(model, n_features_to_select=n_features, direction='forward')
    elif method == 'backward':
        selector = SequentialFeatureSelector(model, n_features_to_select=n_features, direction='backward')
    elif method == 'hybrid':
        selector = SelectFromModel(model, max_features=n_features)
    else:
        raise ValueError("Invalid method. Choose from 'rfe', 'forward', 'backward', or 'hybrid'.")

    # Fit the selector and keep only the columns it supports
    selector.fit(X_encoded, y)
    selected_features = X_encoded.columns[selector.get_support()]

    return X_encoded.loc[:, selected_features]

- *Call the feature Selection By Different Usage*

In [None]:
# Example usage: compare the feature subsets chosen by each selection method.
# X and t are built here (exactly as in the model-building section) so this
# cell also runs on a fresh kernel, before the later cell that defines them.
X = train_data.drop('SalePrice', axis=1)
t = train_data['SalePrice']
n = 29

# NOTE(review): the forward/backward selectors refit the model many times and
# may be slow on the one-hot encoded frame.

# For RFE
best_features_rfe = feature_selection(X, t, method='rfe', n_features=n)
print("Best features (RFE):", best_features_rfe.columns)

# For Forward Feature Selection
best_features_forward = feature_selection(X, t, method='forward', n_features=n)
print("Best features (Forward):", best_features_forward.columns)

# For Backward Feature Selection
best_features_backward = feature_selection(X, t, method='backward', n_features=n)
print("Best features (Backward):", best_features_backward.columns)

# For Hybrid Feature Selection
best_features_hybrid = feature_selection(X, t, method='hybrid', n_features=n)
print("Best features (Hybrid):", best_features_hybrid.columns)

## *Build The Models*

In [None]:
# Feature matrix and target vector for training.
X = train_data.drop('SalePrice', axis=1)
t = train_data['SalePrice']

# X_test is another name for the same object as test_data (no copy is made),
# so any later change to test_data is visible through X_test as well.
X_test = test_data

#### *KFold - making a variables*

In [None]:
# Build 5 folds and merge them back into single train/validation frames.
# NOTE(review): after merging, every row is in X_val once and in X_train four
# times, so the validation rows are also training rows - confirm this is intended.
cv = margeCV(splitDataToKFold(X, t, 5))

X_train, t_train, X_val, t_val = (
    cv[part] for part in ("X_train", "t_train", "X_val", "t_val")
)

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet


def _fit_and_report(message, estimator):
    """Fit ``estimator`` on X_train/t_train, print ``message`` followed by the
    validation MSE, and return (validation predictions, MSE)."""
    estimator.fit(X_train, t_train)
    predictions = estimator.predict(X_val)
    mse = mean_squared_error(t_val, predictions)
    print(message, mse)
    return predictions, mse


# SGD Regressor
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=1)
sgd_reg_pred, sgd_reg_mse = _fit_and_report("SGD Regressor MSE: ", sgd_reg)

# Linear Regression
linear_reg = LinearRegression()
linear_reg_pred, linear_reg_mse = _fit_and_report("Linear Regression MSE: ", linear_reg)

# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge_pred, ridge_mse = _fit_and_report("Ridge Regression MSE: ", ridge)

# Lasso Regression
lasso = Lasso(alpha=0.1)
lasso_pred, lasso_mse = _fit_and_report("Lasso Regression MSE: ", lasso)

# Elastic Net Regression
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net_pred, elastic_net_mse = _fit_and_report("Elastic Net Regression MSE: ", elastic_net)

#### *Choosing The Best Models*

#### *LPOCV - making a variables*

##### *Making The Models By KFold*

In [None]:
from sklearn.linear_model import Lasso

# Train/validation frames: 5 folds merged back together.
# NOTE(review): the merged validation rows are also present in the merged
# training rows, so the validation score below is not an out-of-sample score.
cv = margeCV(splitDataToKFold(X, t, k=5))

# Final model: Lasso regression
model = Lasso(alpha=0.1)
print("Done with Lasso")

# NOTE(review): no feature selection is performed in this cell; the message is kept as-is.
print("Done with feature selection")
model.fit(cv["X_train"], cv["t_train"])

train_score = model.score(cv['X_train'], cv['t_train'])
validation_score = model.score(cv['X_val'], cv['t_val'])
print(f"Train Score: {train_score}\nValidation Score: {validation_score}")

# *Prediction*

In [None]:
# Predict on the test features and write the submission file.
predictions = model.predict(X_test)
# Build the output straight from the saved ids. Adding an "Id" column to
# test_data would also add it to X_test (the same object), so re-running this
# cell would hand the model an extra, unseen feature and fail.
output = pd.DataFrame({'Id': test_data_id, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
print("Done!")