In [1]:
import pandas as pd

train_df = pd.read_csv('/content/train (1).csv')
test_df = pd.read_csv('/content/test.csv')

print('Train DataFrame head:')
print(train_df.head())
print('\nTest DataFrame head:')
print(test_df.head())

Train DataFrame head:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  

In [2]:
missing_percentages = train_df.isnull().sum() / len(train_df) * 100
print("Missing value percentages in train_df before pruning:")
print(missing_percentages[missing_percentages > 0])

columns_to_drop = missing_percentages[missing_percentages > 50].index.tolist()

print(f"\nColumns to drop due to >50% missing values: {columns_to_drop}")

train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

print("\nShape of train_df after dropping columns:", train_df.shape)
print("Shape of test_df after dropping columns:", test_df.shape)

Missing value percentages in train_df before pruning:
LotFrontage     17.739726
Alley           93.767123
MasVnrType      59.726027
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
PoolQC          99.520548
Fence           80.753425
MiscFeature     96.301370
dtype: float64

Columns to drop due to >50% missing values: ['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']

Shape of train_df after dropping columns: (1460, 76)
Shape of test_df after dropping columns: (1459, 75)


In [3]:
print('\nMissing values in train_df after pruning (recalculated):')
missing_percentages_after_pruning_train = train_df.isnull().sum() / len(train_df) * 100
print(missing_percentages_after_pruning_train[missing_percentages_after_pruning_train > 0])

print('\nMissing values in test_df after pruning (recalculated):')
missing_percentages_after_pruning_test = test_df.isnull().sum() / len(test_df) * 100
print(missing_percentages_after_pruning_test[missing_percentages_after_pruning_test > 0])

missing_cols_train = train_df.columns[train_df.isnull().any()].tolist()

numerical_cols_with_missing = train_df[missing_cols_train].select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols_with_missing = train_df[missing_cols_train].select_dtypes(include=['object']).columns.tolist()

print(f'\nNumerical columns to impute: {numerical_cols_with_missing}')
print(f'Categorical columns to impute: {categorical_cols_with_missing}')

for col in numerical_cols_with_missing:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)
    print(f'Imputed numerical column {col} with median: {median_val}')


for col in categorical_cols_with_missing:
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)
    print(f'Imputed categorical column {col} with mode: {mode_val}')

print('\nTotal missing values in train_df after imputation:')
print(train_df.isnull().sum().sum())

print('\nTotal missing values in test_df after imputation:')
print(test_df.isnull().sum().sum())


Missing values in train_df after pruning (recalculated):
LotFrontage     17.739726
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
dtype: float64

Missing values in test_df after pruning (recalculated):
MSZoning         0.274160
LotFrontage     15.558602
Utilities        0.137080
Exterior1st      0.068540
Exterior2nd      0.068540
MasVnrArea       1.028101
BsmtQual         3.015764
BsmtCond         3.084304
BsmtExposure     3.015764
BsmtFinType1     2.878684
BsmtFinSF1       0.068540
BsmtFinType2     2.878684
BsmtFinSF2       0.068540
BsmtUnfSF        0.068540
TotalBsmtSF      0.068540
BsmtFullBath     0.137080
BsmtHalfBath     0.137080
KitchenQual      0.068540
Functional       0.137080
FireplaceQu 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [4]:
print('\nMissing values in train_df after pruning (recalculated):')
missing_percentages_after_pruning_train = train_df.isnull().sum() / len(train_df) * 100
print(missing_percentages_after_pruning_train[missing_percentages_after_pruning_train > 0])

print('\nMissing values in test_df after pruning (recalculated):')
missing_percentages_after_pruning_test = test_df.isnull().sum() / len(test_df) * 100
print(missing_percentages_after_pruning_test[missing_percentages_after_pruning_test > 0])


missing_cols_train_initial = train_df.columns[train_df.isnull().any()].tolist()

numerical_cols_with_missing_in_train = train_df[missing_cols_train_initial].select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols_with_missing_in_train = train_df[missing_cols_train_initial].select_dtypes(include=['object']).columns.tolist()

print(f'\nNumerical columns to impute (based on train_df): {numerical_cols_with_missing_in_train}')
print(f'Categorical columns to impute (based on train_df): {categorical_cols_with_missing_in_train}')

for col in numerical_cols_with_missing_in_train:
    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    test_df[col] = test_df[col].fillna(median_val)
    print(f'Imputed numerical column {col} with median: {median_val}')

# Impute categorical columns with mode calculated from train_df
for col in categorical_cols_with_missing_in_train:
    mode_val = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(mode_val)
    test_df[col] = test_df[col].fillna(mode_val)
    print(f'Imputed categorical column {col} with mode: {mode_val}')

# Handle remaining missing values in test_df (columns that were not missing in train_df initially)
missing_cols_in_test_after_train_imputation = test_df.columns[test_df.isnull().any()].tolist()

if missing_cols_in_test_after_train_imputation:
    print(f"\nAdditional columns with missing values found only in test_df (after initial imputation): {missing_cols_in_test_after_train_imputation}")

    for col in missing_cols_in_test_after_train_imputation:
        if test_df[col].dtype in ['int64', 'float64']:
            if col in train_df.columns:
                median_val = train_df[col].median()
                test_df[col] = test_df[col].fillna(median_val)
                print(f'Imputed test_df numerical column {col} with median from train_df: {median_val}')
            else:
                print(f"Warning: Numerical column {col} in test_df has missing values but is not present in train_df for median calculation.")
        elif test_df[col].dtype == 'object':
            if col in train_df.columns:
                mode_val = train_df[col].mode()[0]
                test_df[col] = test_df[col].fillna(mode_val)
                print(f'Imputed test_df categorical column {col} with mode from train_df: {mode_val}')
            else:
                print(f"Warning: Categorical column {col} in test_df has missing values but is not present in train_df for mode calculation.")

print('\nTotal missing values in train_df after all imputation:')
print(train_df.isnull().sum().sum())

print('\nTotal missing values in test_df after all imputation:')
print(test_df.isnull().sum().sum())


Missing values in train_df after pruning (recalculated):
Series([], dtype: float64)

Missing values in test_df after pruning (recalculated):
MSZoning        0.27416
Utilities       0.13708
Exterior1st     0.06854
Exterior2nd     0.06854
BsmtFinSF1      0.06854
BsmtFinSF2      0.06854
BsmtUnfSF       0.06854
TotalBsmtSF     0.06854
BsmtFullBath    0.13708
BsmtHalfBath    0.13708
KitchenQual     0.06854
Functional      0.13708
GarageCars      0.06854
GarageArea      0.06854
SaleType        0.06854
dtype: float64

Numerical columns to impute (based on train_df): []
Categorical columns to impute (based on train_df): []

Additional columns with missing values found only in test_df (after initial imputation): ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars', 'GarageArea', 'SaleType']
Imputed test_df categorical column MSZoning with mode from train_df: RL
I

In [5]:
from sklearn.model_selection import train_test_split

y_train = train_df['SalePrice']
X_train = train_df.drop(['SalePrice', 'Id'], axis=1)

test_ids = test_df['Id']
test_df = test_df.drop('Id', axis=1)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of test_df after dropping Id: {test_df.shape}")

X_train['SalePrice_bins'] = pd.cut(y_train, bins=10, labels=False, include_lowest=True)

print("\nDistribution of SalePrice_bins:")
print(X_train['SalePrice_bins'].value_counts(normalize=True).sort_index())

X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train.drop('SalePrice_bins', axis=1),
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=X_train['SalePrice_bins']
)

print(f"\nShape of X_train_split: {X_train_split.shape}")
print(f"Shape of y_train_split: {y_train_split.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_val: {y_val.shape}")

Shape of X_train: (1460, 74)
Shape of y_train: (1460,)
Shape of test_df after dropping Id: (1459, 74)

Distribution of SalePrice_bins:
SalePrice_bins
0    0.101370
1    0.495205
2    0.255479
3    0.092466
4    0.034932
5    0.013014
6    0.002740
7    0.002055
8    0.001370
9    0.001370
Name: proportion, dtype: float64

Shape of X_train_split: (1168, 74)
Shape of y_train_split: (1168,)
Shape of X_val: (292, 74)
Shape of y_val: (292,)


In [6]:
numerical_cols_X_train = X_train_split.select_dtypes(include=['int64', 'float64']).columns

correlations = X_train_split[numerical_cols_X_train].corrwith(y_train_split)

correlation_threshold = 0.05

correlated_numerical_cols = correlations[abs(correlations) >= correlation_threshold].index.tolist()

print(f"Number of numerical columns before pruning: {len(numerical_cols_X_train)}")
print(f"Number of numerical columns after pruning (correlation >= {correlation_threshold}): {len(correlated_numerical_cols)}")
print(f"Pruned numerical columns: {set(numerical_cols_X_train) - set(correlated_numerical_cols)}")

categorical_cols = X_train_split.select_dtypes(include=['object']).columns.tolist()

final_features_to_keep = correlated_numerical_cols + categorical_cols

X_train_split = X_train_split[final_features_to_keep]
X_val = X_val[final_features_to_keep]
test_df = test_df[final_features_to_keep]

print(f"\nShape of X_train_split after numerical pruning: {X_train_split.shape}")
print(f"Shape of X_val after numerical pruning: {X_val.shape}")
print(f"Shape of test_df after numerical pruning: {test_df.shape}")

Number of numerical columns before pruning: 36
Number of numerical columns after pruning (correlation >= 0.05): 29
Pruned numerical columns: {'MoSold', 'LowQualFinSF', 'MiscVal', 'YrSold', '3SsnPorch', 'BsmtHalfBath', 'BsmtFinSF2'}

Shape of X_train_split after numerical pruning: (1168, 67)
Shape of X_val after numerical pruning: (292, 67)
Shape of test_df after numerical pruning: (1459, 67)


In [7]:
print('--- Starting Categorical Feature Encoding ---')

categorical_cols = X_train_split.select_dtypes(include='object').columns.tolist()
print(f"\nCategorical columns identified: {categorical_cols}")
print(f"Number of categorical columns: {len(categorical_cols)}")

numerical_cols_before_encoding = X_train_split.select_dtypes(include=['int64', 'float64']).columns.tolist()

X_train_split_encoded = pd.get_dummies(X_train_split, columns=categorical_cols, drop_first=False)
X_val_encoded = pd.get_dummies(X_val, columns=categorical_cols, drop_first=False)
test_df_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=False)

print(f"\nShape of X_train_split after encoding: {X_train_split_encoded.shape}")
print(f"Shape of X_val after encoding: {X_val_encoded.shape}")
print(f"Shape of test_df after encoding: {test_df_encoded.shape}")

common_cols = list(set(X_train_split_encoded.columns) & set(X_val_encoded.columns) & set(test_df_encoded.columns))

X_train_split = X_train_split_encoded.reindex(columns=X_train_split_encoded.columns, fill_value=0)
X_val = X_val_encoded.reindex(columns=X_train_split_encoded.columns, fill_value=0)
test_df = test_df_encoded.reindex(columns=X_train_split_encoded.columns, fill_value=0)

print(f"\nShape of X_train_split after alignment: {X_train_split.shape}")
print(f"Shape of X_val after alignment: {X_val.shape}")
print(f"Shape of test_df after alignment: {test_df.shape}")

--- Starting Categorical Feature Encoding ---

Categorical columns identified: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
Number of categorical columns: 38

Shape of X_train_split after encoding: (1168, 256)
Shape of X_val after encoding: (292, 230)
Shape of test_df after encoding: (1459, 248)

Shape of X_train_split after alignment: (1168, 256)
Shape of X_val after alignment: (292, 256)
Shape of test_df after alignment: (1459, 256)


In [8]:
print('--- Starting Categorical Feature Pruning by Correlation ---')

all_features_X_train_split = X_train_split.columns
correlations_encoded = X_train_split.corrwith(y_train_split)

correlation_threshold_encoded = 0.05

dummy_cols_after_encoding = [col for col in all_features_X_train_split if col not in numerical_cols_before_encoding]

low_correlated_dummy_cols = [col for col in dummy_cols_after_encoding if abs(correlations_encoded.get(col, 0)) < correlation_threshold_encoded]

print(f"\nNumber of all features before pruning: {len(all_features_X_train_split)}")
print(f"Number of dummy variables identified for potential pruning: {len(dummy_cols_after_encoding)}")
print(f"Number of low-correlated dummy variables to drop (absolute correlation < {correlation_threshold_encoded}): {len(low_correlated_dummy_cols)}")

X_train_split = X_train_split.drop(columns=low_correlated_dummy_cols, errors='ignore')
X_val = X_val.drop(columns=low_correlated_dummy_cols, errors='ignore')
test_df = test_df.drop(columns=low_correlated_dummy_cols, errors='ignore')

print(f"\nShape of X_train_split after categorical pruning: {X_train_split.shape}")
print(f"Shape of X_val after categorical pruning: {X_val.shape}")
print(f"Shape of test_df after categorical pruning: {test_df.shape}")


--- Starting Categorical Feature Pruning by Correlation ---

Number of all features before pruning: 256
Number of dummy variables identified for potential pruning: 227
Number of low-correlated dummy variables to drop (absolute correlation < 0.05): 86

Shape of X_train_split after categorical pruning: (1168, 170)
Shape of X_val after categorical pruning: (292, 170)
Shape of test_df after categorical pruning: (1459, 170)


In [9]:
from sklearn.ensemble import RandomForestRegressor

print('--- Initializing and Training Regression Model ---')


model = RandomForestRegressor(random_state=42)

print(f"\nInitialized model: {model}")

model.fit(X_train_split, y_train_split)

print("Model training complete.")

--- Initializing and Training Regression Model ---

Initialized model: RandomForestRegressor(random_state=42)
Model training complete.


In [10]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

print('--- Evaluating Model Performance ---')

y_pred = model.predict(X_val)

r2 = r2_score(y_val, y_pred)
print(f"\nR-squared on validation set: {r2:.4f}")

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE on validation set: {rmse:.2f}")

--- Evaluating Model Performance ---

R-squared on validation set: 0.8490
RMSE on validation set: 29384.40


In [11]:
print('--- Generating Test Predictions ---')

test_predictions = model.predict(test_df)

print(f"Generated {len(test_predictions)} predictions for the test dataset.")
print("First 5 predictions:")
print(test_predictions[:5])

--- Generating Test Predictions ---
Generated 1459 predictions for the test dataset.
First 5 predictions:
[129390.   156481.37 185747.8  185083.75 202197.52]


In [12]:
print('--- Creating Submission File ---')

submission_df = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})

print("Submission DataFrame head:")
print(submission_df.head())

submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

--- Creating Submission File ---
Submission DataFrame head:
     Id  SalePrice
0  1461  129390.00
1  1462  156481.37
2  1463  185747.80
3  1464  185083.75
4  1465  202197.52
Submission file 'submission.csv' created successfully.
