In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
# Load and examine basic structure
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 81 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Id                       1200 non-null   int64  
 1   PropertyClass            1200 non-null   int64  
 2   ZoningCategory           1200 non-null   object 
 3   RoadAccessLength         977 non-null    float64
 4   LandArea                 1200 non-null   int64  
 5   RoadType                 1200 non-null   object 
 6   ServiceLaneType          75 non-null     object 
 7   PlotShape                1200 non-null   object 
 8   LandElevation            1200 non-null   object 
 9   UtilityAccess            1200 non-null   object 
 10  PlotConfiguration        1200 non-null   object 
 11  LandSlope                1200 non-null   object 
 12  District                 1200 non-null   object 
 13  NearbyTransport1         1200 non-null   object 
 14  NearbyTransport2        

In [13]:
import matplotlib.pyplot as plt
# plt.hist(train_df['HotelValue'], bins=50)
percentile_threshold = train_df['HotelValue'].quantile(0.999)
print("Original maximum value:", train_df['HotelValue'].max())
# Cap the values at the 99.9th percentile
train_df['HotelValue'] = train_df['HotelValue'].clip(upper=percentile_threshold)
print("Maximum value after capping:", train_df['HotelValue'].max())

Original maximum value: 745000.0
Maximum value after capping: 616628.6670000067


In [14]:
print("Loading preprocessing configuration...")

NULL_REPLACEMENTS = {
    'RoadAccessLength': 0, 'ServiceLaneType': 'NoServiceLane', 'FacadeType': 'NoFacade',
    'FacadeArea': 0, 'BasementHeight': 'NoBasement', 'BasementCondition': 'NoBasement',
    'BasementExposure': 'NoBasementExposure', 'BasementFacilityType1': 'NoBasement',
    'BasementFacilityType2': 'NoBasement', 'ElectricalSystem': 'NoElectricalSystem',
    'LoungeQuality': 'NoLounge', 'ParkingType': 'NoParking',
    'ParkingConstructionYear': 0, 'ParkingFinish': 'NoParking',
    'ParkingQuality': 'NoParking', 'ParkingCondition': 'NoParking',
    'PoolQuality': 'NoPool', 'BoundaryFence': 'NoBoundaryFence',
    'ExtraFacility': 'NoExtraFacility'
}

QUALITY_MAP = {
    'NoBasement': 0, 'NoLounge': 0, 'NoParking': 0, 'NoPool': 0,
    'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5
}

BASEMENT_EXPOSURE_MAP = {'NoBasementExposure': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

BASEMENT_FACILITY_MAP = {
    'NoBasement': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6
}

PLOT_SHAPE_MAP = {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0}
LAND_SLOPE_MAP = {'Gtl': 2, 'Mod': 1, 'Sev': 0}
FUNCTIONALITY_MAP = {'Sev': 0, 'Maj2': 1, 'Maj1': 2, 'Mod': 3, 'Min2': 4, 'Min1': 5, 'Typ': 6}
PARKING_FINISH_MAP = {'NoParking': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
DRIVEWAY_MAP = {'N': 0, 'P': 1, 'Y': 2}
FENCE_MAP = {'NoBoundaryFence': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
UTILITY_MAP = {'NoSeWa': 0, 'AllPub': 1}

ORDINAL_QUALITY_COLS = [
    'ExteriorQuality', 'ExteriorCondition', 'BasementHeight', 'BasementCondition',
    'HeatingQuality', 'KitchenQuality', 'LoungeQuality', 'ParkingQuality', 'ParkingCondition',
    'PoolQuality'
]

# These are the original categorical columns that will be ordinally mapped
ordinal_cols = set(ORDINAL_QUALITY_COLS + [
    'BasementExposure', 'BasementFacilityType1', 'BasementFacilityType2', 
    'PlotShape', 'LandSlope', 'PropertyFunctionality', 'ParkingFinish', 
    'DrivewayType', 'BoundaryFence', 'UtilityAccess'
])

# These are the original categorical columns that will be one-hot encoded
NOMINAL_COLS = [
    'ZoningCategory', 'RoadType', 'ServiceLaneType', 'LandElevation', 'PlotConfiguration',
    'District', 'NearbyTransport1', 'NearbyTransport2', 'PropertyType', 'HotelStyle',
    'RoofDesign', 'RoofMaterial', 'ExteriorPrimary', 'ExteriorSecondary', 'FacadeType',
    'FoundationType', 'HeatingType', 'CentralAC', 'ElectricalSystem', 'ParkingType',
    'ExtraFacility', 'DealType', 'DealCondition'
]
nominal_cols = set(NOMINAL_COLS)

# --- Separate Target Variable ---
y = train_df['HotelValue']
y_log = np.log1p(y)
train_df = train_df.drop(columns='HotelValue')
print("Separated target variable y and created y_log.")

# --- Combine Train/Test for Consistent Preprocessing ---
print(f"Original train shape: {train_df.shape}")
print(f"Original test shape: {test_df.shape}")
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(f"Combined shape: {combined_df.shape}")

# --- Apply Null Replacements ---
combined_df.fillna(NULL_REPLACEMENTS, inplace=True)
print("Applied null value replacements from config.")

# --- Apply Ordinal Mappings ---
for col in ORDINAL_QUALITY_COLS:
    combined_df[col] = combined_df[col].map(QUALITY_MAP)
    
combined_df['BasementExposure'] = combined_df['BasementExposure'].map(BASEMENT_EXPOSURE_MAP)
combined_df['BasementFacilityType1'] = combined_df['BasementFacilityType1'].map(BASEMENT_FACILITY_MAP)
combined_df['BasementFacilityType2'] = combined_df['BasementFacilityType2'].map(BASEMENT_FACILITY_MAP)
combined_df['PlotShape'] = combined_df['PlotShape'].map(PLOT_SHAPE_MAP)
combined_df['LandSlope'] = combined_df['LandSlope'].map(LAND_SLOPE_MAP)
combined_df['PropertyFunctionality'] = combined_df['PropertyFunctionality'].map(FUNCTIONALITY_MAP)
combined_df['ParkingFinish'] = combined_df['ParkingFinish'].map(PARKING_FINISH_MAP)
combined_df['DrivewayType'] = combined_df['DrivewayType'].map(DRIVEWAY_MAP)
combined_df['BoundaryFence'] = combined_df['BoundaryFence'].map(FENCE_MAP)
combined_df['UtilityAccess'] = combined_df['UtilityAccess'].map(UTILITY_MAP)
print("Applied all ordinal mappings.")

# ---  Identify Numerical Columns (Dynamically) ---
# These are columns that are *not* Id, *not* in ordinal_cols, and *not* in nominal_cols
all_original_cols = set(train_df.columns)
numerical_cols = all_original_cols - ordinal_cols - nominal_cols - {'Id'}
print(f"Identified {len(numerical_cols)} numerical features: {numerical_cols}")

# --- 7. Apply One-Hot Encoding ---
# This will convert all columns in NOMINAL_COLS into dummies
combined_df = pd.get_dummies(combined_df, columns=NOMINAL_COLS, dummy_na=False)
print(f"Applied one-hot encoding. New shape: {combined_df.shape}")

# --- 8. Re-split into Train and Test ---
X = combined_df.iloc[:len(train_df)]
X_test = combined_df.iloc[len(train_df):]

# --- 9. Drop ID columns ---
X = X.drop(columns='Id')
X_test = X_test.drop(columns='Id')

print(f"Final X shape: {X.shape}")
print(f"Final X_test shape: {X_test.shape}")

# --- 10. Define columns to scale ---
# We scale the original numerical columns + the newly mapped ordinal columns
cols_to_scale = list(numerical_cols) + list(ordinal_cols)
# Ensure all columns still exist
cols_to_scale = [col for col in cols_to_scale if col in X.columns]
print(f"Total features to scale: {len(cols_to_scale)}")

Loading preprocessing configuration...
Separated target variable y and created y_log.
Original train shape: (1200, 80)
Original test shape: (260, 80)
Combined shape: (1460, 80)
Applied null value replacements from config.
Applied all ordinal mappings.
Identified 36 numerical features: {'SwimmingPoolArea', 'OpenVerandaArea', 'OverallQuality', 'ParkingCapacity', 'OverallCondition', 'HalfBaths', 'UsableArea', 'ParkingArea', 'FacadeArea', 'BasementHalfBaths', 'GroundFloorArea', 'YearSold', 'TotalRooms', 'ScreenPorchArea', 'Lounges', 'SeasonalPorchArea', 'TerraceArea', 'ParkingConstructionYear', 'EnclosedVerandaArea', 'Kitchens', 'RoadAccessLength', 'ConstructionYear', 'RenovationYear', 'PropertyClass', 'FullBaths', 'BasementFacilitySF1', 'MonthSold', 'BasementFullBaths', 'UpperFloorArea', 'BasementUnfinishedSF', 'GuestRooms', 'LowQualityArea', 'BasementFacilitySF2', 'LandArea', 'BasementTotalSF', 'ExtraFacilityValue'}
Applied one-hot encoding. New shape: (1460, 227)
Final X shape: (1200, 2

In [15]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# --- Data is already prepared (X, y_log, X_test) ---
# X and X_test contain ALL features (ordinal, OHE, etc.)
# 'cols_to_scale' is defined and lists all non-OHE columns


test_ids = test_df['Id'].copy()

print("Starting Engineered Features + PCA pipeline...")

# --- 1. Create Engineered Features ---
# Create new DataFrames to hold our new features
X_train_eng = pd.DataFrame(index=X.index)
X_test_eng = pd.DataFrame(index=X_test.index)

# Polynomials (quality is exponential)
X_train_eng['OverallQual_sq'] = X['OverallQuality']**2
X_test_eng['OverallQual_sq'] = X_test['OverallQuality']**2

X_train_eng['OverallQual_cub'] = X['OverallQuality']**3
X_test_eng['OverallQual_cub'] = X_test['OverallQuality']**3

# Total Sizes
X_train_eng['TotalSF'] = X['BasementTotalSF'] + X['GroundFloorArea'] + X['UpperFloorArea']
X_test_eng['TotalSF'] = X_test['BasementTotalSF'] + X_test['GroundFloorArea'] + X_test['UpperFloorArea']

X_train_eng['TotalBaths'] = X['FullBaths'] + (0.5 * X['HalfBaths']) + X['BasementFullBaths']
X_test_eng['TotalBaths'] = X_test['FullBaths'] + (0.5 * X_test['HalfBaths']) + X_test['BasementFullBaths']

X_train_eng['TotalPorchSF'] = X['TerraceArea'] + X['OpenVerandaArea'] + X['EnclosedVerandaArea']
X_test_eng['TotalPorchSF'] = X_test['TerraceArea'] + X_test['OpenVerandaArea'] + X_test['EnclosedVerandaArea']

# Age
X_train_eng['HouseAge'] = X['YearSold'] - X['ConstructionYear']
X_test_eng['HouseAge'] = X_test['YearSold'] - X_test['ConstructionYear']

X_train_eng['AgeSinceRemod'] = X['YearSold'] - X['RenovationYear']
X_test_eng['AgeSinceRemod'] = X_test['YearSold'] - X_test['RenovationYear']

# Interactions
X_train_eng['Qual_x_TotalSF'] = X['OverallQuality'] * X_train_eng['TotalSF']
X_test_eng['Qual_x_TotalSF'] = X_test['OverallQuality'] * X_test_eng['TotalSF']

X_train_eng['Qual_x_HouseAge'] = X['OverallQuality'] * X_train_eng['HouseAge']
X_test_eng['Qual_x_HouseAge'] = X_test['OverallQuality'] * X_test_eng['HouseAge']

print(f"Created {X_train_eng.shape[1]} new engineered features.")

# --- 2. Combine Engineered + Original Features ---
# This joins our new features with ALL the original features from your config cell
X_train_combined = X.join(X_train_eng)
X_test_combined = X_test.join(X_test_eng)

print(f"Combined dataset shape: {X_train_combined.shape}")

# --- 3. Scale the ENTIRE Combined Dataset ---
# We must scale everything before PCA
print("Scaling combined dataset...")
scaler_combined = StandardScaler()
X_train_scaled = scaler_combined.fit_transform(X_train_combined)
X_test_scaled = scaler_combined.transform(X_test_combined)

# --- 4. Apply PCA ---
# We use a high variance threshold to keep as much info as possible
pca = PCA(n_components=0.99, random_state=42)
print("Applying PCA to combined dataset (n_components=0.99)...")

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Original features: {X_train_scaled.shape[1]}")
print(f"PCA-reduced features: {X_train_pca.shape[1]}")

# --- 5. Train Linear Regression on PCA components ---
print("\n=== Training Linear Regression on Engineered PCA components ===")
lr_model_final = LinearRegression()
lr_model_final.fit(X_train_pca, y_log) # Fit on the new PCA data

# Make test predictions
preds = lr_model_final.predict(X_test_pca) 
y_test_pred_actual = np.expm1(preds) # This will be stable

print(f"\nTest Predictions (Engineered + PCA):")
print(f"Sample: {y_test_pred_actual[:5]}")
print(f"Min: ${y_test_pred_actual.min():.2f}")
print(f"Max: ${y_test_pred_actual.max():.2f}")
print(f"Mean: ${y_test_pred_actual.mean():.2f}")

# Create submission
submission = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': y_test_pred_actual
})

submission.to_csv('submission_linear_regression_eng_pca.csv', index=False)
print("\n Submission saved as 'submission_linear_regression_eng_pca.csv'")
print(submission.head(10))

Starting Engineered Features + PCA pipeline...
Created 9 new engineered features.
Combined dataset shape: (1200, 235)
Scaling combined dataset...
Applying PCA to combined dataset (n_components=0.99)...
Original features: 235
PCA-reduced features: 171

=== Training Linear Regression on Engineered PCA components ===

Test Predictions (Engineered + PCA):
Sample: [147372.47473656 331867.01849212 103597.53798322 166763.80803247
 307203.21882291]
Min: $47680.21
Max: $774456.36
Mean: $176872.74

 Submission saved as 'submission_linear_regression_eng_pca.csv'
     Id     HotelValue
0   893  147372.474737
1  1106  331867.018492
2   414  103597.537983
3   523  166763.808032
4  1037  307203.218823
5   615   78155.220919
6   219  245776.322295
7  1161  143137.741076
8   650   74863.431014
9   888  149476.326423


In [16]:

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

# --- Ridge Regression ---
print("\n=== Training Ridge Regression ===")
ridge = Ridge(alpha=1.0) # Start with a default alpha
ridge.fit(X_train_pca, y_log)
ridge_preds_log = ridge.predict(X_test_pca)
ridge_preds_actual = np.expm1(ridge_preds_log)

submission_ridge = pd.DataFrame({
    'Id': test_df['Id'], 
    'HotelValue': ridge_preds_actual
})
submission_ridge.to_csv('submission_ridge.csv', index=False)
print("\n Ridge submission saved as 'submission_ridge.csv'")
print(submission_ridge.head())

# --- Lasso Regression ---
print("\n=== Training Lasso Regression ===")

lasso = Lasso(alpha=0.0005, max_iter=5000) 
lasso.fit(X_train_pca, y_log)
lasso_preds_log = lasso.predict(X_test_pca)
lasso_preds_actual = np.expm1(lasso_preds_log)

submission_lasso = pd.DataFrame({
    'Id': test_df['Id'],
    'HotelValue': lasso_preds_actual
})
submission_lasso.to_csv('submission_lasso.csv', index=False)
print("\n Lasso submission saved as 'submission_lasso.csv'")
print(submission_lasso.head())


# --- Elastic Net Regression ---
print("\n=== Training Elastic Net Regression ===")
elastic_net = ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=5000) # Example parameters, tuning is essential!
elastic_net.fit(X_train_pca, y_log)
elastic_net_preds_log = elastic_net.predict(X_test_pca)
elastic_net_preds_actual = np.expm1(elastic_net_preds_log)

submission_elastic_net = pd.DataFrame({
    'Id': test_df['Id'],
    'HotelValue': elastic_net_preds_actual
})
submission_elastic_net.to_csv('submission_elastic_net.csv', index=False)
print("\n Elastic Net submission saved as 'submission_elastic_net.csv'")
print(submission_elastic_net.head())

print("\n=== Tuning Ridge Alpha ===")
param_grid_ridge = {'alpha': np.logspace(-4, 2, 50)} # Search over a range of alphas
grid_ridge = GridSearchCV(Ridge(), param_grid_ridge, cv=5, scoring='neg_root_mean_squared_error')
grid_ridge.fit(X_train_pca, y_log)

print(f"Best Ridge Alpha: {grid_ridge.best_params_['alpha']}")
print(f"Best Ridge RMSE (negated): {grid_ridge.best_score_}")

best_ridge = grid_ridge.best_estimator_
best_ridge_preds_log = best_ridge.predict(X_test_pca)
best_ridge_preds_actual = np.expm1(best_ridge_preds_log)

submission_best_ridge = pd.DataFrame({
    'Id': test_df['Id'],
    'HotelValue': best_ridge_preds_actual
})
submission_best_ridge.to_csv('submission_best_ridge.csv', index=False)
print("\n Best Ridge submission saved as 'submission_best_ridge.csv'")
print(submission_best_ridge.head())



# --- Hyperparameter Tuning for Lasso ---
print("\n=== Tuning Lasso Alpha ===")

param_grid_lasso = {'alpha': np.logspace(-5, -1, 50)}
grid_lasso = GridSearchCV(Lasso(max_iter=20000, tol=0.001), # Increased max_iter and tolerance
                          param_grid_lasso,
                          cv=5,
                          scoring='neg_root_mean_squared_error',
                          n_jobs=-1) # Use all available CPU cores
grid_lasso.fit(X_train_pca, y_log)

print(f"Best Lasso Alpha: {grid_lasso.best_params_['alpha']}")
print(f"Best Lasso RMSE (negated): {grid_lasso.best_score_}")

# Use the best Lasso estimator found by GridSearchCV
best_lasso = grid_lasso.best_estimator_
best_lasso_preds_log = best_lasso.predict(X_test_pca)
best_lasso_preds_actual = np.expm1(best_lasso_preds_log)

submission_best_lasso = pd.DataFrame({
    'Id': test_df['Id'],
    'HotelValue': best_lasso_preds_actual
})
submission_best_lasso.to_csv('submission_best_lasso.csv', index=False)
print("\n Best Lasso submission saved as 'submission_best_lasso.csv'")
print(submission_best_lasso.head())


# --- Hyperparameter Tuning for ElasticNet ---
print("\n=== Tuning ElasticNet Alpha and L1 Ratio ===")

param_grid_enet = {
    'alpha': np.logspace(-5, -1, 40), 
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0] 
}
grid_enet = GridSearchCV(ElasticNet(max_iter=50000, tol=0.001), 
                         param_grid_enet,
                         cv=5,
                         scoring='neg_root_mean_squared_error',
                         n_jobs=-1) 
grid_enet.fit(X_train_pca, y_log)

print(f"Best ElasticNet Params: {grid_enet.best_params_}")
print(f"Best ElasticNet RMSE (negated): {grid_enet.best_score_}")

# Use the best ElasticNet estimator found by GridSearchCV
best_enet = grid_enet.best_estimator_
best_enet_preds_log = best_enet.predict(X_test_pca)
best_enet_preds_actual = np.expm1(best_enet_preds_log)

# Create submission with the best ElasticNet model
submission_best_enet = pd.DataFrame({
    'Id': test_df['Id'],
    'HotelValue': best_enet_preds_actual
})
submission_best_enet.to_csv('submission_best_enet.csv', index=False)
print("\n Best ElasticNet submission saved as 'submission_best_enet.csv'")
print(submission_best_enet.head())


=== Training Ridge Regression ===

 Ridge submission saved as 'submission_ridge.csv'
     Id     HotelValue
0   893  147367.028569
1  1106  331799.577907
2   414  103591.459251
3   523  166733.233566
4  1037  307227.589608

=== Training Lasso Regression ===

 Lasso submission saved as 'submission_lasso.csv'
     Id     HotelValue
0   893  146870.560745
1  1106  329762.087313
2   414  102899.883547
3   523  166344.535955
4  1037  309218.351396

=== Training Elastic Net Regression ===

 Elastic Net submission saved as 'submission_elastic_net.csv'
     Id     HotelValue
0   893  146867.809320
1  1106  329725.512106
2   414  102897.286898
3   523  166325.400169
4  1037  309229.990777

=== Tuning Ridge Alpha ===
Best Ridge Alpha: 100.0
Best Ridge RMSE (negated): -0.1449726879752316

 Best Ridge submission saved as 'submission_best_ridge.csv'
     Id     HotelValue
0   893  146751.141903
1  1106  326664.541323
2   414  103240.988696
3   523  164224.016203
4  1037  309043.798121

=== Tuning 