In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [2]:
# Load the dataset
data = pd.read_csv('/content/train.csv')

In [3]:
# Display the first few rows of the dataset
print(data.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [4]:
# Data Preprocessing
# Check for missing values
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [5]:
# Fill missing values (example: fill with median for numerical and mode for categorical)
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col].fillna(data[col].median(), inplace=True)

for col in data.select_dtypes(include=['object']).columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [6]:
# Feature Engineering
# Create new features (example: total square footage)
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

In [7]:
# Select features and target variable
features = data.drop(['SalePrice', 'Id'], axis=1)
target = data['SalePrice']

In [8]:
# One-hot encode categorical variables
categorical_features = features.select_dtypes(include=['object']).columns
numeric_features = features.select_dtypes(include=['int64', 'float64']).columns

In [9]:
# One-hot encoding
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(features[categorical_features]).toarray()

In [10]:
# Scale numerical features
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(features[numeric_features])

In [11]:
# Combine the processed features
X_processed = np.hstack((scaled_numerical, encoded_categorical))

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, target, test_size=0.2, random_state=42)

In [13]:
# Ridge Regression Model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

In [14]:
# Make predictions
ridge_predictions = ridge_model.predict(X_test)

In [15]:
# Evaluate the Ridge Regression model using RMSE on log scale
ridge_rmse = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(ridge_predictions)))
print(f'Ridge Regression RMSE (log scale): {ridge_rmse:.2f}')

Ridge Regression RMSE (log scale): 0.16


In [16]:
# Neural Network Model
# Build the neural network model
nn_model = Sequential()
nn_model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(1))  # Output layer

In [17]:
# Compile the model
nn_model.compile(optimizer='adam', loss='mean_squared_error')

In [18]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Train the neural network model
nn_model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7a61bc0684f0>

In [19]:
# Make predictions with the neural network
nn_predictions = nn_model.predict(X_test)



In [20]:
# Evaluate the neural network model using RMSE on log scale
nn_rmse = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(nn_predictions)))
print(f'Neural Network RMSE (log scale): {nn_rmse:.2f}')

Neural Network RMSE (log scale): 0.16


In [22]:
test_data = pd.read_csv('/content/test.csv')

In [23]:
# Preprocess the test data
for col in test_data.select_dtypes(include=['float64', 'int64']).columns:
    test_data[col].fillna(test_data[col].median(), inplace=True)

for col in test_data.select_dtypes(include=['object']).columns:
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

In [24]:
# Create new features in the test data
test_data['TotalSF'] = test_data['TotalBsmtSF'] + test_data['1stFlrSF'] + test_data['2ndFlrSF']

In [25]:
# One-hot encode categorical variables in the test data
encoded_test_categorical = encoder.transform(test_data[categorical_features]).toarray()

In [26]:
# Scale numerical features in the test data
scaled_test_numerical = scaler.transform(test_data[numeric_features])

In [27]:
# Combine the processed test features
X_test_final = np.hstack((scaled_test_numerical, encoded_test_categorical))

In [28]:
# Make predictions on the test data using the Ridge Regression model
ridge_submission_predictions = ridge_model.predict(X_test_final)

In [30]:
submission_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': ridge_submission_predictions})

In [31]:
submission_df.to_csv('submission_ridge.csv', index=False)